{
  "@context": {
    "@vocab": "http://schema.org/",
    "schema": "http://schema.org/",
    "dcterms": "http://purl.org/dc/terms/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "hasPart": {
      "@id": "schema:hasPart",
      "@type": "@id"
    },
    "mainEntity": {
      "@id": "schema:mainEntity",
      "@type": "@id"
    },
    "acceptedAnswer": {
      "@id": "schema:acceptedAnswer",
      "@type": "@id"
    },
    "suggestedAnswer": {
      "@id": "schema:suggestedAnswer",
      "@type": "@id"
    },
    "author": {
      "@id": "schema:author",
      "@type": "@id"
    },
    "publisher": {
      "@id": "schema:publisher",
      "@type": "@id"
    },
    "creator": {
      "@id": "schema:creator",
      "@type": "@id"
    },
    "HowToStep": "schema:HowToStep",
    "Question": "schema:Question",
    "Answer": "schema:Answer",
    "DefinedTerm": "schema:DefinedTerm",
    "DefinedTermSet": "schema:DefinedTermSet",
    "HowTo": "schema:HowTo"
  },
  "@type": "Article",
  "@id": "info17040389",
  "mainEntityOfPage": "https://doi.org/10.3390/info17040389",
  "headline": "An Empirical Study of Knowledge Graph-Enhanced RAG for Information Security Compliance",
  "author": [
    {
      "@type": "Person",
      "name": "Dimitar Jovanovski",
      "email": "dimitar.jovanovski.1@students.finki.ukim.mk",
      "affiliation": {
        "@type": "Organization",
        "@id": "#finki",
        "name": "Faculty of Computer Science and Engineering, Ss. Cyril and Methodius University",
        "address": {
          "@type": "PostalAddress",
          "addressLocality": "Skopje",
          "addressCountry": "North Macedonia"
        }
      }
    },
    {
      "@type": "Person",
      "name": "Marija Stojcheva",
      "email": "marija.stojcheva@finki.ukim.mk",
      "affiliation": {
        "@id": "#finki"
      }
    },
    {
      "@type": "Person",
      "name": "Mila Dodevska",
      "email": "mila.dodevska@finki.ukim.mk",
      "affiliation": {
        "@id": "#finki"
      }
    },
    {
      "@type": "Person",
      "name": "Petre Lameski",
      "email": "petre.lameski@finki.ukim.mk",
      "affiliation": {
        "@id": "#finki"
      }
    },
    {
      "@type": "Person",
      "name": "Igor Mishkovski",
      "email": "igor.mishkovski@finki.ukim.mk",
      "affiliation": {
        "@id": "#finki"
      }
    },
    {
      "@type": "Person",
      "name": "Dejan Gjorgjevikj",
      "email": "dejan.gjorgjevikj@finki.ukim.mk",
      "affiliation": {
        "@id": "#finki"
      }
    }
  ],
  "publisher": {
    "@type": "Organization",
    "name": "MDPI",
    "location": {
      "@type": "Place",
      "address": {
        "@type": "PostalAddress",
        "addressLocality": "Basel",
        "addressCountry": "Switzerland"
      }
    }
  },
  "datePublished": "2026-04-20",
  "dateModified": "2026-04-05",
  "dateReceived": "2026-02-14",
  "dateAccepted": "2026-04-05",
  "license": "https://creativecommons.org/licenses/by/4.0/",
  "copyrightHolder": "The authors",
  "abstract": "This study introduces a privacy-preserving retrieval-augmented generation (RAG) framework integrating LightRAG, a knowledge graph-based retrieval system, with locally hosted open-source language models for ISO/IEC 27000 information security compliance. It constructs a semantic knowledge graph modeling relationships between clauses, enabling more accurate retrieval than chunk-based methods. A curated benchmark of 222 multiple-choice questions from official ISO standards and academic sources was developed. Results show knowledge graph retrieval outperforms chunk-based RAG and non-retrieval LLMs, with embedding quality strongly influencing performance. The best setup achieves 90.54% accuracy, demonstrating promise for graph-structured retrieval in regulatory QA.",
  "keywords": [
    "retrieval-augmented generation",
    "knowledge graph",
    "information security",
    "ISO/IEC 27000 standards",
    "compliance",
    "open-source models"
  ],
  "articleBody": "The article presents a novel RAG system using knowledge graph-enhanced retrieval for ISO/IEC 27000 standards compliance question answering. It constructs a semantic graph from clause-level chunks, preserving hierarchical and cross-referential relationships. The system uses local open-source LLMs for answer generation, ensuring privacy. A benchmark of 222 multiple-choice questions was created from official standards and training materials. Experiments show that embedding model quality and hybrid retrieval modes combining local and global graph traversal yield the best accuracy. The best configuration achieves 90.54% accuracy, outperforming chunk-based retrieval and standalone LLMs. Limitations include the benchmark's multiple-choice format and computational overhead of graph construction. Future work includes adaptive chunking, domain adaptation, and broader evaluation.",
  "hasPart": [
    {
      "@type": "DefinedTermSet",
      "@id": "iso27000-terms",
      "name": "Defined Terms in ISO/IEC 27000 Standards",
      "description": "Key terminology and concepts from the ISO/IEC 27000 family of standards relevant to information security management.",
      "hasDefinedTerm": [
        {
          "@type": "DefinedTerm",
          "name": "Information Security Management System (ISMS)",
          "description": "A systematic approach to managing sensitive company information so that it remains secure."
        },
        {
          "@type": "DefinedTerm",
          "name": "Clause",
          "description": "A distinct section or provision within an ISO/IEC standard document."
        },
        {
          "@type": "DefinedTerm",
          "name": "Cross-reference",
          "description": "Explicit references linking clauses or sections across documents."
        },
        {
          "@type": "DefinedTerm",
          "name": "Embedding Model",
          "description": "A model that converts text chunks into vector representations for retrieval."
        },
        {
          "@type": "DefinedTerm",
          "name": "Retrieval-Augmented Generation (RAG)",
          "description": "A method combining retrieval of relevant documents with language model generation."
        },
        {
          "@type": "DefinedTerm",
          "name": "Knowledge Graph",
          "description": "A graph structure representing entities and their relationships extracted from documents."
        },
        {
          "@type": "DefinedTerm",
          "name": "Hybrid Retrieval Mode",
          "description": "A retrieval strategy combining local and global graph traversal for evidence gathering."
        },
        {
          "@type": "DefinedTerm",
          "name": "Chunk",
          "description": "A segment of text, typically a clause or paragraph, used as a retrieval unit."
        },
        {
          "@type": "DefinedTerm",
          "name": "Context Window",
          "description": "The maximum number of tokens available to the language model during answer generation."
        },
        {
          "@type": "DefinedTerm",
          "name": "Embedding Chunk Size",
          "description": "The maximum token length allowed per chunk before embedding."
        }
      ]
    },
    {
      "@type": "Question",
      "@id": "q1",
      "name": "Does knowledge graph-enhanced retrieval achieve higher multiple-choice answer accuracy than naïve retrieval?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Yes, knowledge graph-enhanced retrieval generally achieves higher accuracy, with improvements of approximately 3–10 percentage points over naïve chunk-based retrieval."
      }
    },
    {
      "@type": "Question",
      "@id": "q2",
      "name": "How does embedding model choice affect system performance?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Embedding model quality strongly influences performance, more so than the size of the reasoning language model. Higher-quality embeddings yield better retrieval and accuracy."
      }
    },
    {
      "@type": "Question",
      "@id": "q3",
      "name": "What retrieval modes were evaluated and which performed best?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Naïve, local, global, and hybrid retrieval modes were evaluated. Hybrid retrieval, combining local and global graph traversal, generally performed best."
      }
    },
    {
      "@type": "Question",
      "@id": "q4",
      "name": "What system parameters affect performance in the RAG pipeline?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Chunk size (MAX_EMBED_TOKENS) and context window size (NUM_CTX) significantly affect performance. Smaller or medium chunk sizes with larger context windows yield better accuracy."
      }
    },
    {
      "@type": "Question",
      "@id": "q5",
      "name": "How does the system ensure privacy?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "All components, including language and embedding models, run fully locally without external cloud services, ensuring data confidentiality."
      }
    },
    {
      "@type": "Question",
      "@id": "q6",
      "name": "What is the scope of the benchmark dataset?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "The benchmark contains 222 multiple-choice questions derived from seven core ISO/IEC 27000 standards, certification prep materials, and academic and industry sources."
      }
    },
    {
      "@type": "Question",
      "@id": "q7",
      "name": "What are the limitations of the benchmark and study?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Limitations include the multiple-choice format simplifying reasoning, lack of expert validation, no evaluation of latency or scalability, and limited generalization beyond the selected standards."
      }
    },
    {
      "@type": "Question",
      "@id": "q8",
      "name": "How does the system handle retrieval of cross-referential and hierarchical information?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "By constructing a semantic knowledge graph with typed edges representing cross-references, semantic similarity, and hierarchical dependencies, enabling multi-hop and context-aware retrieval."
      }
    },
    {
      "@type": "Question",
      "@id": "q9",
      "name": "How does the system compare to standalone large language models?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "The knowledge graph-enhanced RAG system significantly outperforms standalone LLMs without retrieval, indicating the importance of external structured evidence."
      }
    },
    {
      "@type": "Question",
      "@id": "q10",
      "name": "What are common error types identified in system predictions?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Errors include retrieval misses, incomplete multi-hop aggregation, distractor susceptibility, and graph construction noise."
      }
    },
    {
      "@type": "HowTo",
      "@id": "howto1",
      "name": "Constructing a Knowledge Graph for Regulatory Documents",
      "description": "Steps to build a semantic knowledge graph from ISO/IEC 27000 documents.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Segment Documents",
          "text": "Divide source documents into clause- or paragraph-level chunks."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Extract Entities and Relations",
          "text": "Use a language model to extract entities, concepts, and typed relationships from each chunk."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Assemble Knowledge Graph",
          "text": "Combine extracted elements into a corpus-level knowledge graph with metadata linking back to source chunks."
        }
      ]
    },
    {
      "@type": "HowTo",
      "@id": "howto2",
      "name": "Performing Retrieval Using LightRAG",
      "description": "How to retrieve relevant evidence using different retrieval modes.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Naïve Retrieval",
          "text": "Retrieve chunks based solely on vector similarity over embeddings."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Local Retrieval",
          "text": "Focus retrieval on graph elements closely associated with entities and relations relevant to the query."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Global Retrieval",
          "text": "Expand retrieval over broader graph neighborhoods to collect distributed context."
        },
        {
          "@type": "HowToStep",
          "position": 4,
          "name": "Hybrid Retrieval",
          "text": "Combine local and global retrieval evidence for comprehensive context."
        }
      ]
    },
    {
      "@type": "HowTo",
      "@id": "howto3",
      "name": "Generating Grounded Answers",
      "description": "Using retrieved evidence to generate factual answers.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Retrieve Evidence",
          "text": "Collect relevant ISO-related context using LightRAG retrieval."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Condition Language Model",
          "text": "Provide retrieved evidence and user query to a locally hosted language model."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Generate Answer",
          "text": "Generate a final answer grounded in the retrieved context to reduce hallucinations."
        }
      ]
    }
  ],
  "funding": {
    "@type": "Grant",
    "funder": {
      "@type": "Organization",
      "name": "Ministry of Education and Science of the Republic of North Macedonia"
    },
    "description": "Project 'Utilising AI and National Large Language Models to Advance Macedonian Language Capabilities'"
  },
  "citation": [
    {
      "@type": "CreativeWork",
      "name": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
      "author": {
        "@type": "Person",
        "name": "Lewis et al."
      },
      "datePublished": "2020",
      "url": "https://doi.org/10.5555/3454287.3454856"
    },
    {
      "@type": "CreativeWork",
      "name": "LightRAG: Simple and fast retrieval-augmented generation",
      "author": {
        "@type": "Person",
        "name": "Guo et al."
      },
      "datePublished": "2024",
      "url": "https://arxiv.org/abs/2410.05779"
    },
    {
      "@type": "CreativeWork",
      "name": "ISO/IEC 27000:2018 Information Security Management Systems—Overview and Vocabulary",
      "publisher": {
        "@type": "Organization",
        "name": "ISO"
      },
      "datePublished": "2018",
      "url": "https://www.iso.org/standard/73906.html"
    },
    {
      "@type": "CreativeWork",
      "name": "ISO/IEC 27001:2022 Information Security Management Systems—Requirements",
      "publisher": {
        "@type": "Organization",
        "name": "ISO"
      },
      "datePublished": "2022",
      "url": "https://www.iso.org/standard/82875.html"
    }
  ]
}