{
  "@context": {
    "@vocab": "http://schema.org/",
    "schema": "http://schema.org/",
    "ex": "https://example.org/vocab#"
  },
  "@type": "ScholarlyArticle",
  "headline": "Read the Paper, Write the Code: Agentic Reproduction of Social-Science Results",
  "author": [
    {
      "@type": "Person",
      "name": "Benjamin Kohler",
      "affiliation": {
        "@type": "Organization",
        "name": "ETH Zurich"
      },
      "email": "benjamin.kohler@gess.ethz.ch"
    },
    {
      "@type": "Person",
      "name": "David Zollikofer",
      "affiliation": {
        "@type": "Organization",
        "name": "ETH Zurich"
      },
      "email": "david.zollikofer@gess.ethz.ch"
    },
    {
      "@type": "Person",
      "name": "Johanna Einsiedler",
      "affiliation": {
        "@type": "Organization",
        "name": "University of Basel"
      },
      "email": "johanna.einsiedler@unibas.ch"
    },
    {
      "@type": "Role",
      "roleName": "Co-supervisor",
      "author": {
        "@type": "Person",
        "name": "Alexander Hoyle",
        "affiliation": {
          "@type": "Organization",
          "name": "ETH Zurich"
        },
        "email": "alexander.hoyle@ai.ethz.ch"
      }
    },
    {
      "@type": "Role",
      "roleName": "Co-supervisor",
      "author": {
        "@type": "Person",
        "name": "Elliott Ash",
        "affiliation": {
          "@type": "Organization",
          "name": "ETH Zurich"
        },
        "email": "ashe@ethz.ch"
      }
    }
  ],
  "datePublished": "2026-04-23",
  "abstract": "This paper develops an agentic system that reproduces social science results from paper methods and data alone, without access to original code. Evaluating 48 papers, agents recover published results with varying success, revealing that failures stem from both agent errors and underspecification in papers.",
  "mainEntityOfPage": "https://arxiv.org/abs/2604.21965v1",
  "publisher": {
    "@type": "Organization",
    "name": "arXiv",
    "url": "https://arxiv.org"
  },
  "hasPart": [
    {
      "@type": "DefinedTermSet",
      "name": "Glossary of Terms in Agentic Reproducibility",
      "description": "Key terms defined for understanding agentic reproducibility in social science research.",
      "hasDefinedTerm": [
        {
          "@type": "DefinedTerm",
          "name": "Agentic System",
          "description": "Autonomous systems capable of generating, debugging, and executing end-to-end research pipelines."
        },
        {
          "@type": "DefinedTerm",
          "name": "Reproducibility",
          "description": "Testing whether results and conclusions of original studies can be reproduced based on the same data used in the original studies."
        },
        {
          "@type": "DefinedTerm",
          "name": "Replicability",
          "description": "Validity of results when similar methods are applied to new data."
        },
        {
          "@type": "DefinedTerm",
          "name": "Re-implementation",
          "description": "Reproducing results using only the information reported in the paper and the same data, without access to original code."
        },
        {
          "@type": "DefinedTerm",
          "name": "Agent Error",
          "description": "Errors caused by the agent misunderstanding or not complying with the methods description."
        },
        {
          "@type": "DefinedTerm",
          "name": "Human Error",
          "description": "Errors caused by underspecification or missing data in the original paper or replication package."
        },
        {
          "@type": "DefinedTerm",
          "name": "Guardrail Audit",
          "description": "A two-stage audit pipeline to verify agent compliance with information isolation restrictions."
        },
        {
          "@type": "DefinedTerm",
          "name": "Hardcoding Audit",
          "description": "Audit to detect if agents output statistical results as numeric literals without computation from data."
        },
        {
          "@type": "DefinedTerm",
          "name": "Deterministic Evaluation",
          "description": "Evaluation approach comparing reproduced outputs directly to original values with adjustments for statistical significance."
        },
        {
          "@type": "DefinedTerm",
          "name": "Agent Scaffold",
          "description": "Framework or environment in which an LLM agent operates to perform reproduction tasks."
        }
      ]
    },
    {
      "@type": "Question",
      "name": "Can AI agents reproduce social science results without access to original code?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Yes, agents can largely recover published results given only the paper's methods description and original data, but performance varies by model, scaffold, and paper."
      }
    },
    {
      "@type": "Question",
      "name": "What are the main sources of reproduction failures?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Failures stem from both agent errors and underspecification or missing information in the papers themselves."
      }
    },
    {
      "@type": "Question",
      "name": "How is agentic reproduction evaluated in this study?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "By extracting structured methods and results templates from papers, having agents re-implement code in isolation, and comparing reproduced outputs deterministically to original results."
      }
    },
    {
      "@type": "Question",
      "name": "What is the role of the guardrail audit?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "To verify that agents do not access forbidden files or information outside their isolated workspace, ensuring genuine reimplementation."
      }
    },
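    {
      "@type": "SoftwareSourceCode",
      "name": "Guardrail-audit sketch",
      "programmingLanguage": "Python",
      "description": "A minimal, hypothetical sketch of the guardrail audit described above: each path an agent touched is checked against the root of its isolated workspace, and anything outside is flagged. The workspace path and log entries are illustrative assumptions, not the paper's implementation.",
      "text": "# Flag any file access outside the agent's isolated workspace.\n# The workspace path and log entries are illustrative assumptions.\nfrom pathlib import Path\n\nALLOWED_ROOT = Path('/workspace/paper_123').resolve()\n\ndef audit_file_accesses(accessed_paths):\n    '''Return accessed paths that violate the isolation guardrail.'''\n    violations = []\n    for p in accessed_paths:\n        resolved = Path(p).resolve()\n        if not resolved.is_relative_to(ALLOWED_ROOT):  # Python >= 3.9\n            violations.append(str(resolved))\n    return violations\n\nlog = ['/workspace/paper_123/data/main.csv', '/original_code/analysis.do']\nprint(audit_file_accesses(log))  # -> ['/original_code/analysis.do']"
    },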
    {
      "@type": "Question",
      "name": "What is the significance of the hardcoding audit?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "It detects whether agents produce results by hardcoding numeric literals rather than computing them from data."
      }
    },
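    {
      "@type": "SoftwareSourceCode",
      "name": "Hardcoding-audit sketch",
      "programmingLanguage": "Python",
      "description": "A minimal, hypothetical sketch of the hardcoding audit described above: a generated script is scanned for numeric literals that match the paper's reported values, which would suggest a result was pasted rather than computed. The regex, tolerance, and example values are illustrative assumptions.",
      "text": "# Scan a generated script for numeric literals matching reported values;\n# regex, tolerance, and example values are illustrative assumptions.\nimport re\n\ndef find_hardcoded(script_text, reported_values, tol=1e-6):\n    '''Return reported values that appear verbatim as literals in the code.'''\n    literals = {float(m) for m in re.findall(r'-?\\d+\\.\\d+', script_text)}\n    return [v for v in reported_values\n            if any(abs(v - lit) <= tol for lit in literals)]\n\nscript = 'beta = 0.127  # suspicious: matches the published estimate'\nprint(find_hardcoded(script, reported_values=[0.127, -1.43]))  # -> [0.127]"
    },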
    {
      "@type": "Question",
      "name": "How stable are agentic reproduction results across multiple runs?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Results are generally stable at the table level, with over 80% of tables showing at most one grade difference across runs, though coefficient-level variation is higher."
      }
    },
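    {
      "@type": "SoftwareSourceCode",
      "name": "Run-stability sketch",
      "programmingLanguage": "Python",
      "description": "A minimal, hypothetical sketch of the table-level stability check described above: for each table, the spread of letter grades across repeated runs is measured and the share of tables within one grade step is reported. The grade scale and example data are illustrative assumptions.",
      "text": "# Share of tables whose grades stay within one step across repeated runs;\n# the grade scale and example data are illustrative assumptions.\nGRADE_RANK = {'A': 4, 'B': 3, 'C': 2, 'D': 1, 'F': 0}\n\ndef share_stable(grades_per_table):\n    '''Fraction of tables differing by at most one grade step across runs.'''\n    stable = 0\n    for runs in grades_per_table.values():\n        ranks = [GRADE_RANK[g] for g in runs]\n        if max(ranks) - min(ranks) <= 1:\n            stable += 1\n    return stable / len(grades_per_table)\n\nruns = {'table1': ['A', 'A', 'B'], 'table2': ['B', 'D', 'B']}\nprint(share_stable(runs))  # -> 0.5"
    },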
    {
      "@type": "Question",
      "name": "Does pre-training data leakage affect agentic reproduction performance?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "No statistically significant difference was found between papers published before and after the model knowledge cutoff, suggesting leakage is not a primary driver."
      }
    },
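    {
      "@type": "SoftwareSourceCode",
      "name": "Knowledge-cutoff comparison sketch",
      "programmingLanguage": "Python",
      "description": "A minimal, hypothetical sketch of the leakage check described above: per-paper reproduction scores for papers published before vs. after the model's knowledge cutoff are compared with a Welch t-test. The scores are made up, and the paper's exact test may differ.",
      "text": "# Compare per-paper scores before vs. after the model's knowledge cutoff;\n# the scores are made up and the paper's exact test may differ.\nfrom scipy import stats\n\npre_cutoff = [0.91, 0.84, 0.77, 0.95, 0.88]\npost_cutoff = [0.89, 0.81, 0.93, 0.74, 0.86]\n\nt, p = stats.ttest_ind(pre_cutoff, post_cutoff, equal_var=False)\nprint(f'Welch t = {t:.2f}, p = {p:.2f}')  # a large p is consistent with no leakage effect"
    },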
    {
      "@type": "Question",
      "name": "What is the main bottleneck for automated reproducibility according to the study?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "The primary bottleneck is the underspecification or omission of key implementation details in social science papers, not model capability alone."
      }
    },
    {
      "@type": "Question",
      "name": "What future directions does the paper suggest for agentic reproducibility?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Extending beyond reproduction with shared data to replication with new data, inferring methods from research questions, and refining analyses autonomously."
      }
    },
    {
      "@type": "HowTo",
      "name": "Agentic Reproduction Pipeline",
      "description": "Steps for reproducing social science results using LLM agents without original code.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Extraction",
          "text": "Extract structured methods, data descriptions, and blinded results tables from the paper and replication package using LLMs. Mask numerical results to prevent leakage."
        },
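        {
          "@type": "SoftwareSourceCode",
          "name": "Result-masking sketch (Extraction)",
          "programmingLanguage": "Python",
          "description": "A minimal, hypothetical sketch of the masking in the Extraction step: numeric cells in an extracted results table are replaced with placeholders so the agent never sees the published values. The regex and table layout are illustrative assumptions, not the paper's implementation.",
          "text": "# Mask numeric cells so the agent never sees published values;\n# regex and table layout are illustrative assumptions.\nimport re\n\ndef mask_results(table_rows):\n    '''Replace numeric literals in table cells with a [MASKED] placeholder.'''\n    number = re.compile(r'-?\\d+(?:\\.\\d+)?')\n    return [[number.sub('[MASKED]', cell) for cell in row] for row in table_rows]\n\nrows = [['Treatment effect', '0.127', '(0.031)'],\n        ['Controls', 'Yes', 'Yes']]\nprint(mask_results(rows))\n# -> [['Treatment effect', '[MASKED]', '([MASKED])'], ['Controls', 'Yes', 'Yes']]"
        },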
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Reimplementation",
          "text": "Agents write and execute code scripts for each table based on extracted methods and data, operating in isolated environments without access to original code or results."
        },
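        {
          "@type": "SoftwareSourceCode",
          "name": "Isolated-execution sketch (Re-implementation)",
          "programmingLanguage": "Python",
          "description": "A minimal, hypothetical sketch of the isolation in the Re-implementation step: an agent-written script runs in a scratch workspace seeded only with the shared data, so the original code and results are unreachable. Paths, filenames, and the timeout are illustrative assumptions.",
          "text": "# Run an agent-written script in a temp workspace containing only the\n# shared data; paths and timeout are illustrative assumptions.\nimport shutil\nimport subprocess\nimport tempfile\nfrom pathlib import Path\n\ndef run_isolated(script_path, data_dir, timeout=600):\n    '''Execute a script inside a temporary workspace seeded only with the data.'''\n    with tempfile.TemporaryDirectory() as workspace:\n        shutil.copytree(data_dir, Path(workspace) / 'data')\n        shutil.copy(script_path, workspace)\n        return subprocess.run(\n            ['python', Path(script_path).name],\n            cwd=workspace, capture_output=True, text=True, timeout=timeout)\n\n# Hypothetical usage:\n# result = run_isolated('table1.py', 'shared_data/')\n# print(result.stdout)"
        },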
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Evaluation",
          "text": "Compare reproduced outputs to original results at the cell level using deterministic metrics and assign letter grades based on deviation and sign agreement."
        },
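        {
          "@type": "SoftwareSourceCode",
          "name": "Cell-level grading sketch (Evaluation)",
          "programmingLanguage": "Python",
          "description": "A minimal, hypothetical sketch of the deterministic, cell-level grading in the Evaluation step: a reproduced coefficient is graded by sign agreement and relative deviation from the original. The thresholds are illustrative assumptions, not the paper's actual grading scheme.",
          "text": "# Grade one reproduced value against the original; the thresholds are\n# illustrative assumptions, not the paper's actual scheme.\ndef grade_cell(original, reproduced):\n    '''Letter grade from sign agreement and relative deviation.'''\n    if original * reproduced < 0:\n        return 'F'  # sign disagreement\n    rel_dev = abs(reproduced - original) / max(abs(original), 1e-12)\n    if rel_dev <= 0.01:\n        return 'A'\n    if rel_dev <= 0.05:\n        return 'B'\n    if rel_dev <= 0.25:\n        return 'C'\n    return 'D'\n\nprint(grade_cell(0.127, 0.128))   # -> 'A'\nprint(grade_cell(0.127, -0.040))  # -> 'F'"
        },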
        {
          "@type": "HowToStep",
          "position": 4,
          "name": "Explanation",
          "text": "Use LLM auditors to diagnose discrepancies by tracing errors to agent mistakes, extraction faults, missing data, or underspecification in the original paper."
        }
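        {
          "@type": "SoftwareSourceCode",
          "name": "Error-attribution prompt sketch (Explanation)",
          "programmingLanguage": "Python",
          "description": "A minimal, hypothetical sketch of the Explanation step: assembling the context an LLM auditor would receive to classify a discrepancy into failure categories. The prompt wording and category labels are illustrative assumptions, and no real LLM API is called.",
          "text": "# Build the context an LLM auditor would receive; wording and category\n# labels are illustrative assumptions, and no real LLM API is called.\nCATEGORIES = ['agent error', 'extraction fault', 'missing data',\n              'underspecified method in paper']\n\ndef build_audit_prompt(methods_text, agent_code, discrepancy_report):\n    '''Assemble the auditor prompt for one discrepant table.'''\n    return (\n        'You are auditing a failed reproduction attempt.\\n'\n        f'Methods description:\\n{methods_text}\\n\\n'\n        f'Agent code:\\n{agent_code}\\n\\n'\n        f'Discrepancies:\\n{discrepancy_report}\\n\\n'\n        'Classify the most likely cause as one of: ' + ', '.join(CATEGORIES) + '.')\n\nprint(build_audit_prompt('OLS of y on x.', 'model = ols(...)', 'beta off by 0.3'))"
        }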
      ]
    }
  ],
  "image": {
    "@type": "ImageObject",
    "contentUrl": "https://cdn.arxiv.org/pdf/2604.21965v1-page-4.png",
    "caption": "Figure 1: Overview of the pipeline for replicating empirical results. Extraction, Reimplementation, and Evaluation steps are shown with example inputs and outputs."
  },
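  "ex:illustrativeCode": {
    "@type": "SoftwareSourceCode",
    "name": "Headline-metrics sketch",
    "programmingLanguage": "Python",
    "description": "A minimal, hypothetical sketch of the two headline metrics in the article body: sign agreement and coverage of the original 95% confidence intervals, computed from paired original and reproduced coefficients. All numbers are illustrative, and the paper's exact computation may differ.",
    "text": "# Sign agreement and 95% CI coverage from paired coefficients; all\n# numbers are illustrative and the paper's exact computation may differ.\ndef headline_metrics(pairs):\n    '''pairs: list of (original_beta, original_se, reproduced_beta).'''\n    n = len(pairs)\n    signs = sum(b * r > 0 for b, _, r in pairs)\n    in_ci = sum(abs(r - b) <= 1.96 * se for b, se, r in pairs)\n    return signs / n, in_ci / n\n\npairs = [(0.127, 0.031, 0.125), (-1.43, 0.52, -1.10), (0.08, 0.04, -0.01)]\nsign_share, ci_share = headline_metrics(pairs)\nprint(f'sign agreement: {sign_share:.0%}, CI coverage: {ci_share:.0%}')"
  },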
  "articleBody": "This paper develops an agentic system that reproduces social science results from paper methods and data alone, without access to original code. Evaluating 48 papers, agents recover published results with varying success, revealing that failures stem from both agent errors and underspecification in papers. The pipeline involves extracting structured methods and results templates, reimplementing code in isolation, evaluating outputs deterministically, and diagnosing discrepancies. The best agent achieves over 90% sign agreement and 80% of coefficients within 95% confidence intervals. Errors arise mainly from underspecified methods in papers and missing data, with agent errors being secondary. Stability across runs is high, and no evidence of pre-training leakage was found. The study suggests that improving method documentation is key to advancing automated reproducibility."
}