{
  "@context": {
    "@vocab": "http://schema.org/",
    "schema": "http://schema.org/"
  },
  "@type": "Article",
  "headline": "Meta-Harness: End-to-End Optimization of Model Harnesses",
  "author": [
    {
      "@type": "Person",
      "name": "Yoonho Lee",
      "affiliation": {
        "@type": "Organization",
        "name": "Stanford"
      }
    },
    {
      "@type": "Person",
      "name": "Roshen Nair",
      "affiliation": {
        "@type": "Organization",
        "name": "Stanford"
      }
    },
    {
      "@type": "Person",
      "name": "Qizheng Zhang",
      "affiliation": {
        "@type": "Organization",
        "name": "Stanford"
      }
    },
    {
      "@type": "Person",
      "name": "Kangwook Lee",
      "affiliation": {
        "@type": "Organization",
        "name": "KRAFTON"
      }
    },
    {
      "@type": "Person",
      "name": "Omar Khattab",
      "affiliation": {
        "@type": "Organization",
        "name": "MIT"
      }
    },
    {
      "@type": "Person",
      "name": "Chelsea Finn",
      "affiliation": {
        "@type": "Organization",
        "name": "Stanford"
      }
    }
  ],
  "publisher": {
    "@type": "Organization",
    "name": "arXiv",
    "url": "https://arxiv.org/"
  },
  "datePublished": "2026-03-30",
  "url": "https://yoonholee.com/meta-harness/",
  "mainEntityOfPage": "https://arxiv.org/abs/2603.28052",
  "description": "Meta-Harness is an outer-loop system that automates the optimization of model harnesses—code that manages information flow for large language models (LLMs). It uses a coding-agent proposer with filesystem access to prior code, execution traces, and scores, enabling richer feedback and targeted edits. Meta-Harness outperforms prior hand-designed harnesses and text optimizers on tasks including online text classification, retrieval-augmented math reasoning, and agentic coding on TerminalBench-2.",
  "articleBody": "Meta-Harness automates harness engineering by searching over executable harness code using a coding-agent proposer with full filesystem access to prior candidates' code, execution traces, and scores. This enables richer feedback and targeted edits beyond scalar or summary feedback. It outperforms prior methods on online text classification, retrieval-augmented math reasoning, and agentic coding benchmarks.",
  "hasPart": [
    {
      "@type": "DefinedTermSet",
      "name": "Defined Terms",
      "description": "Key terms defined in the article related to harness engineering and optimization.",
      "hasDefinedTerm": [
        {
          "@type": "DefinedTerm",
          "name": "Harness",
          "description": "A stateful program that wraps a language model and determines what context the model sees at each step."
        },
        {
          "@type": "DefinedTerm",
          "name": "Harness Engineering",
          "description": "The practice of refining the code around an LLM to improve overall system performance."
        },
        {
          "@type": "DefinedTerm",
          "name": "Coding Agent",
          "description": "A language-model-based system that can invoke developer tools and modify code."
        },
        {
          "@type": "DefinedTerm",
          "name": "Filesystem Access",
          "description": "The ability of the proposer to read prior candidates' source code, execution traces, and scores stored in a filesystem."
        },
        {
          "@type": "DefinedTerm",
          "name": "Execution Trace",
          "description": "Logs of prompts, tool calls, model outputs, and state updates during harness evaluation."
        },
        {
          "@type": "DefinedTerm",
          "name": "Pareto Frontier",
          "description": "The set of candidate harnesses that are not dominated by others in terms of multiple objectives like accuracy and context cost."
        },
        {
          "@type": "DefinedTerm",
          "name": "Agentic Coding",
          "description": "Using LLMs as agents that can autonomously execute tasks with complex dependencies."
        },
        {
          "@type": "DefinedTerm",
          "name": "Retrieval-Augmented Reasoning",
          "description": "Enhancing LLM reasoning by retrieving relevant examples or knowledge from a large corpus."
        },
        {
          "@type": "DefinedTerm",
          "name": "Online Text Classification",
          "description": "A task where an LLM receives labeled examples sequentially, updates its memory, and is evaluated on held-out data."
        },
        {
          "@type": "DefinedTerm",
          "name": "TerminalBench-2",
          "description": "A benchmark of 89 challenging tasks requiring long-horizon autonomous execution under complex dependencies."
        }
      ]
    },
    {
      "@type": "Question",
      "name": "What is Meta-Harness?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Meta-Harness is an outer-loop system that automates the optimization of model harnesses by searching over executable harness code using a coding-agent proposer with full filesystem access to prior candidates' code, execution traces, and scores."
      }
    },
    {
      "@type": "Question",
      "name": "Why is harness engineering important?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Harness engineering is important because the harness—the code managing what information to store, retrieve, and present to the model—can affect performance as much as the model weights themselves."
      }
    },
    {
      "@type": "Question",
      "name": "How does Meta-Harness differ from prior text optimization methods?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Unlike prior methods that rely on compressed feedback such as scalar scores or summaries, Meta-Harness provides the proposer with full access to prior code, execution traces, and scores via a filesystem, enabling richer diagnosis and targeted edits."
      }
    },
    {
      "@type": "Question",
      "name": "What tasks was Meta-Harness evaluated on?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Meta-Harness was evaluated on online text classification, retrieval-augmented math reasoning, and agentic coding on TerminalBench-2."
      }
    },
    {
      "@type": "Question",
      "name": "What are the key components of the Meta-Harness search loop?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "The search loop repeats three steps: (1) a coding-agent proposer reads a filesystem of prior candidates' code, traces, and scores and proposes new harnesses; (2) the proposed harnesses are evaluated on tasks; and (3) all logs are stored back to the filesystem."
      }
    },
    {
      "@type": "Question",
      "name": "How does Meta-Harness perform on online text classification?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Meta-Harness improves accuracy by 7.7 points over the state-of-the-art Agentic Context Engineering (ACE) while using 4× fewer context tokens, and matches or surpasses prior text optimizers with fewer evaluations."
      }
    },
    {
      "@type": "Question",
      "name": "What is the advantage of Meta-Harness's full filesystem access?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Full filesystem access allows the proposer to selectively inspect raw prior code and execution traces, enabling causal reasoning over failures and more effective harness modifications."
      }
    },
    {
      "@type": "Question",
      "name": "How does Meta-Harness improve retrieval-augmented math reasoning?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Meta-Harness discovers a retrieval harness that improves accuracy by 4.7 points on 200 IMO-level problems across five held-out models, outperforming both the no-retrieval setting and strong baselines."
      }
    },
    {
      "@type": "Question",
      "name": "What results does Meta-Harness achieve on TerminalBench-2?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Meta-Harness discovers harnesses that surpass the hand-engineered Terminus-KIRA baseline, achieving a 76.4% pass rate with Claude Opus 4.6 and ranking #1 among Haiku 4.5 agents."
      }
    },
    {
      "@type": "Question",
      "name": "What practical tips are recommended for implementing Meta-Harness?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Tips include writing a good skill to guide the proposer, starting with a baseline harness and a challenging search set, logging all data in queryable formats, running lightweight validation before expensive evaluations, and automating evaluation outside the proposer."
      }
    },
    {
      "@type": "HowTo",
      "name": "How to implement Meta-Harness in a new domain",
      "description": "Steps to apply Meta-Harness for harness optimization in a new domain.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Write a good skill",
          "description": "Create a natural-language skill that defines the proposer's role, directory layout, CLI commands, and output format. The skill should constrain outputs and safety behavior while leaving the proposer free in how it diagnoses problems."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Start with a baseline harness and search set",
          "description": "Develop a simple baseline harness and select a search set of challenging examples where the baseline performs poorly."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Log everything in a queryable format",
          "description": "Ensure the evaluation code writes each candidate's source code, scores, and execution traces in machine-readable formats such as JSON, organized hierarchically for easy querying."
        }
      ]
    },
    {
      "@type": "HowTo",
      "name": "How to run the Meta-Harness search loop",
      "description": "Procedure for iterative harness optimization using Meta-Harness.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Initialize population and filesystem",
          "description": "Start with an initial set of valid harnesses and an empty filesystem to store code, scores, and traces."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Evaluate initial harnesses",
          "description": "Evaluate each harness on the task distribution and store results in the filesystem."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Iterative proposal and evaluation",
          "description": "At each iteration, the proposer queries the filesystem, proposes new harnesses, validates them, evaluates on tasks, and logs all data back to the filesystem."
        }
      ]
    },
    {
      "@type": "HowTo",
      "name": "How to interpret proposer behavior in Meta-Harness",
      "description": "Understanding how the coding-agent proposer uses filesystem access to improve harnesses.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Inspect prior code and execution traces",
          "description": "The proposer reads many files per iteration, including prior harness source code and execution traces."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Form causal hypotheses",
          "description": "Based on observed regressions and successes, the proposer hypothesizes causes of failures."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Test and pivot",
          "description": "The proposer tests hypotheses by isolating changes and pivots to safer or additive modifications based on results."
        }
      ]
    },
    {
      "@type": "ImageObject",
      "name": "Meta-Harness Search Progress and TerminalBench-2 Harness Performance",
      "description": "Figure 1 on page 1 shows Meta-Harness outperforming prior hand-designed harnesses and text optimizers on text classification and TerminalBench-2.",
      "contentUrl": "page1_image1.png",
      "thumbnailUrl": "page1_image1_thumb.png"
    },
    {
      "@type": "ImageObject",
      "name": "Meta-Harness Search Loop Diagram",
      "description": "Figure 2 on page 2 illustrates the Meta-Harness search loop involving proposing harness code, evaluating on tasks, and storing logs.",
      "contentUrl": "page2_image1.png",
      "thumbnailUrl": "page2_image1_thumb.png"
    },
    {
      "@type": "ImageObject",
      "name": "Draft-Verification Classification Harness Diagram",
      "description": "Figure 5 on page 18 shows the two-stage draft-verification classification harness discovered by Meta-Harness for online text classification.",
      "contentUrl": "page18_image1.png",
      "thumbnailUrl": "page18_image1_thumb.png"
    },
    {
      "@type": "ImageObject",
      "name": "Label-Primed Query-Anchored Classification Harness Diagram",
      "description": "Figure 6 on page 19 depicts the label-primed query-anchored classification harness with label primer, coverage block, and contrastive pairs.",
      "contentUrl": "page19_image1.png",
      "thumbnailUrl": "page19_image1_thumb.png"
    },
    {
      "@type": "ImageObject",
      "name": "Discovered Math Retrieval Harness Flowchart",
      "description": "Figure 8 on page 20 shows the math retrieval harness routing queries to combinatorics, geometry, number theory, or algebra/other routes with BM25 retrieval and reranking.",
      "contentUrl": "page20_image1.png",
      "thumbnailUrl": "page20_image1_thumb.png"
    },
    {
      "@type": "ImageObject",
      "name": "Discovered TerminalBench-2 Harness Diagram",
      "description": "Figure 9 on page 21 summarizes the TerminalBench-2 harness with environment bootstrapping added to Terminus-KIRA.",
      "contentUrl": "page21_image1.png",
      "thumbnailUrl": "page21_image1_thumb.png"
    }
  ]
}