{
  "@context": {
    "@vocab": "http://schema.org/",
    "shacl": "http://www.w3.org/ns/shacl#",
    "xsh": "http://example.org/xpshacl#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "schema": "http://schema.org/",
    "ex": "http://example.org/",
    "ViolationSignature": "xsh:ViolationSignature",
    "Explanation": "xsh:Explanation",
    "ConstraintViolation": "xsh:ConstraintViolation",
    "JustificationTree": "xsh:JustificationTree",
    "DomainContext": "xsh:DomainContext",
    "ViolationKG": "xsh:ViolationKnowledgeGraph",
    "ExplanationGenerator": "xsh:ExplanationGenerator",
    "ExtendedSHACLValidator": "xsh:ExtendedSHACLValidator",
    "JustificationTreeBuilder": "xsh:JustificationTreeBuilder",
    "ContextRetriever": "xsh:ContextRetriever",
    "LocalExplanationGenerator": "xsh:LocalExplanationGenerator"
  },
  "@type": "ScholarlyArticle",
  "headline": "xpSHACL: Explainable SHACL Validation using Retrieval-Augmented Generation and Large Language Models",
  "author": [
    {
      "@type": "Person",
      "name": "Gustavo Correa Publio",
      "affiliation": {
        "@type": "CollegeOrUniversity",
        "name": "University of Leipzig",
        "address": {
          "@type": "PostalAddress",
          "addressCountry": "Germany"
        }
      },
      "email": "gustavo.publio@informatik.uni-leipzig.de"
    },
    {
      "@type": "Person",
      "name": "José Emilio Labra Gayo",
      "affiliation": {
        "@type": "CollegeOrUniversity",
        "name": "University of Oviedo",
        "address": {
          "@type": "PostalAddress",
          "addressCountry": "Spain"
        }
      },
      "email": "labra@uniovi.es"
    }
  ],
  "datePublished": "2025-07-11",
  "abstract": "Shapes Constraint Language (SHACL) is a powerful language for validating RDF data. Given the recent industry attention to Knowledge Graphs (KGs), more users need to validate linked data properly. However, traditional SHACL validation engines often provide terse reports in English that are difficult for non-technical users to interpret and act upon. This paper presents xpSHACL, an explainable SHACL validation system that addresses this issue by combining rule-based justification trees with retrieval-augmented generation (RAG) and large language models (LLMs) to produce detailed, multi-language, human-readable explanations for constraint violations. A key feature of xpSHACL is its usage of a Violation KG to cache and reuse explanations, improving efficiency and consistency.",
  "articleBody": "xpSHACL combines rule-based justification trees, retrieval-augmented generation, and LLMs to generate human-readable explanations for SHACL violations, cached in a Violation KG for efficiency.",
  "hasPart": [
    {
      "@type": "DefinedTermSet",
      "name": "Defined Terms in xpSHACL",
      "description": "Key terms defined in the xpSHACL system and SHACL validation context.",
      "hasDefinedTerm": [
        {
          "@type": "DefinedTerm",
          "name": "SHACL",
          "description": "Shapes Constraint Language, a W3C Recommendation for validating RDF graphs against shapes."
        },
        {
          "@type": "DefinedTerm",
          "name": "Justification Tree",
          "description": "A structured logical explanation tracing the reasoning steps leading to a SHACL violation."
        },
        {
          "@type": "DefinedTerm",
          "name": "Retrieval-Augmented Generation (RAG)",
          "description": "Technique combining retrieval of relevant knowledge with language model generation to produce contextually rich outputs."
        },
        {
          "@type": "DefinedTerm",
          "name": "Large Language Model (LLM)",
          "description": "A deep learning model capable of generating human-like natural language text."
        },
        {
          "@type": "DefinedTerm",
          "name": "Violation Knowledge Graph (Violation KG)",
          "description": "A persistent knowledge base caching violation signatures and their explanations for reuse."
        },
        {
          "@type": "DefinedTerm",
          "name": "Constraint Violation",
          "description": "An instance where RDF data fails to meet a SHACL constraint."
        },
        {
          "@type": "DefinedTerm",
          "name": "Ontology Fragment",
          "description": "Relevant triples from the RDF data graph related to the focus node of a violation."
        },
        {
          "@type": "DefinedTerm",
          "name": "Shape Documentation",
          "description": "Human-readable comments or descriptions associated with SHACL shapes."
        },
        {
          "@type": "DefinedTerm",
          "name": "Domain Rules",
          "description": "Domain-specific rules linked to properties or constraints providing additional context."
        },
        {
          "@type": "DefinedTerm",
          "name": "Violation Signature",
          "description": "A unique identifier abstracting key characteristics of a violation for caching and retrieval."
        }
      ]
    },
    {
      "@type": "HowTo",
      "name": "How to Generate Explainable SHACL Validation Reports with xpSHACL",
      "description": "Stepwise process to produce human-readable explanations for SHACL constraint violations using xpSHACL.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Perform Extended SHACL Validation",
          "text": "Use the Extended SHACL Validator to validate RDF data against SHACL shapes, capturing detailed violation information including focus node, violated constraint, and violating value."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Build Justification Tree",
          "text": "Construct a logical justification tree representing the reasoning steps and SHACL rules that led to each violation."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Retrieve Contextual Information",
          "text": "Retrieve ontology fragments, shape documentation, similar violation cases, and domain rules relevant to the violation using SPARQL queries."
        },
        {
          "@type": "HowToStep",
          "position": 4,
          "name": "Generate Violation Signature",
          "text": "Create a unique signature for the violation based on shape, constraint, property path, and violation type."
        },
        {
          "@type": "HowToStep",
          "position": 5,
          "name": "Check Violation Knowledge Graph",
          "text": "Query the Violation KG to check if an explanation for the violation signature already exists."
        },
        {
          "@type": "HowToStep",
          "position": 6,
          "name": "Generate Explanation Using LLM",
          "text": "If no cached explanation exists, use the Explanation Generator with LLMs to synthesize a natural language explanation and correction suggestions."
        },
        {
          "@type": "HowToStep",
          "position": 7,
          "name": "Cache Explanation in Violation KG",
          "text": "Store the newly generated explanation and suggestions in the Violation KG associated with the violation signature for future reuse."
        },
        {
          "@type": "HowToStep",
          "position": 8,
          "name": "Output Explanation",
          "text": "Provide the user with a clear, multi-language explanation and actionable suggestions for the SHACL violation."
        }
      ]
    },
    {
      "@type": "Question",
      "name": "What is the main purpose of xpSHACL?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "To provide understandable, human-readable explanations for SHACL constraint violations in RDF data by combining rule-based justification trees, retrieval-augmented generation, and large language models."
      }
    },
    {
      "@type": "Question",
      "name": "How does xpSHACL improve explanation efficiency?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "By using a Violation Knowledge Graph to cache and reuse explanations for recurring violation signatures, reducing redundant LLM calls and improving consistency."
      }
    },
    {
      "@type": "Question",
      "name": "What role does the Justification Tree play in xpSHACL?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "It provides a verifiable, logical trace of why a violation occurred according to SHACL rules, forming the factual basis for explanations."
      }
    },
    {
      "@type": "Question",
      "name": "What types of context does xpSHACL retrieve to enrich explanations?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Ontology fragments about the focus node, shape documentation, similar violation cases, and domain-specific rules."
      }
    },
    {
      "@type": "Question",
      "name": "How does xpSHACL handle multi-language explanations?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "The Explanation Generator uses LLMs capable of generating explanations and suggestions in multiple specified languages."
      }
    },
    {
      "@type": "Question",
      "name": "What is a Violation Signature in xpSHACL?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "A unique identifier abstracting key characteristics of a violation, such as the shape, constraint, property path, and violation type, used for caching and retrieval in the Violation KG."
      }
    },
    {
      "@type": "Question",
      "name": "What libraries does xpSHACL use for implementation?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Key libraries include rdflib for RDF processing, pyshacl for SHACL validation, openai and ollama for LLM interaction, python-dotenv for environment management, and Faker for synthetic data generation."
      }
    },
    {
      "@type": "Question",
      "name": "How does xpSHACL ensure explanation consistency?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "By caching explanations in the Violation KG and retrieving them for recurring violations, ensuring consistent explanations despite LLM non-determinism."
      }
    },
    {
      "@type": "Question",
      "name": "What are the main evaluation metrics for xpSHACL?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Explanation quality, efficiency (runtime and cache hit rate), consistency of explanations, robustness in processing real-world datasets, and user satisfaction."
      }
    },
    {
      "@type": "Question",
      "name": "What future work is planned for xpSHACL?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Planned future work includes formal user studies, improvements to context retrieval and explanation generation, collaborative Violation KG development, integration with SHACL tools, user feedback mechanisms, and scalability improvements."
      }
    }
  ],
  "image": {
    "@type": "ImageObject",
    "contentUrl": "page1_image.png",
    "description": "Diagram of xpSHACL process overview showing input RDF data and SHACL shapes feeding into Extended SHACL Validator, Justification Tree Builder, Context Retriever, Explanation Generator/LLM, and Violation KG, producing success reports or explained violations."
  },
  "publisher": {
    "@type": "Organization",
    "name": "arXiv",
    "url": "https://arxiv.org/"
  },
  "url": "https://arxiv.org/abs/2507.08432"
}