by thetestingacademy
Grade LLM and agent traces with OpenAI Evals - build datasets, configure string/python/model graders, run eval suites, and gate agent behavior changes in CI.
npx @qaskills/cli add openai-evals-trace-grading
Auto-detects your AI agent and installs the skill. Works with Claude Code, Cursor, Copilot, and more.
You are an expert in grading the behavior of LLMs and agents with OpenAI Evals. When the user asks you to evaluate agent traces, build an eval, or grade model outputs, you assemble a dataset of inputs with reference outputs, choose the cheapest grader that can decide correctness, run the eval, and wire it into CI so behavior regressions block merges. You always prefer a deterministic grader over a model grader when the correctness check can be expressed as code or an exact match.
Each sample carries the input and the reference needed to grade it. The dataset is the contract; commit it.
evals/
data/
support_agent.jsonl # versioned dataset, one sample per line
graders/
exact_match.py
json_schema_grader.py
rubric_grader.py # model-graded rubric
config/
support_agent.eval.json # eval suite config (datasets + graders + threshold)
run_eval.py # loads traces, runs graders, writes results.json
gate.py # pass-rate gate for CI
.github/workflows/agent-evals.yml
Each line carries the input, an ideal reference answer, and optional expected_tools for trace-level grading.
{"id": "refund-window", "input": "What's the refund window for digital goods?", "ideal": "14 days", "expected_tools": ["search_policy"]}
{"id": "plan-support", "input": "Does Pro include priority support?", "ideal": "yes", "expected_tools": ["search_policy"]}
{"id": "out-of-scope", "input": "What's the weather in Paris?", "ideal": "I can only help with account and billing questions.", "expected_tools": []}
{"id": "json-extract", "input": "Extract the order id and amount from: order ABC-991 for $42.50", "ideal": "{\"order_id\": \"ABC-991\", \"amount\": 42.50}", "expected_tools": []}
Run the agent under test and record a structured trace per sample.
# trace.py
from dataclasses import dataclass, field, asdict
@dataclass
class Trace:
sample_id: str
input: str
output: str
tool_calls: list[str] = field(default_factory=list)
ideal: str | None = None
expected_tools: list[str] = field(default_factory=list)
def to_dict(self) -> dict:
return asdict(self)
def run_agent_on_dataset(dataset: list[dict], agent) -> list[Trace]:
    """Run ``agent`` on every sample and collect one ``Trace`` per sample.

    Args:
        dataset: Samples as dicts. ``"id"`` and ``"input"`` are required;
            ``"ideal"`` and ``"expected_tools"`` are optional.
        agent: Object with a ``run(input)`` method whose result exposes
            ``output`` and ``tool_calls`` (each call having a ``name``).

    Returns:
        The traces, in dataset order.
    """
    collected: list[Trace] = []
    for row in dataset:
        outcome = agent.run(row["input"])  # your agent: returns output + tool_calls
        answer = outcome.output
        # Only the tool names are kept; arguments and results are dropped.
        called = [call.name for call in outcome.tool_calls]
        record = Trace(
            sample_id=row["id"],
            input=row["input"],
            output=answer,
            tool_calls=called,
            ideal=row.get("ideal"),
            expected_tools=row.get("expected_tools", []),
        )
        collected.append(record)
    return collected
# graders/exact_match.py
from trace import Trace
def grade(trace: Trace) -> dict:
    """Pass if the ideal answer appears (normalized) in the output.

    Both strings are stripped and lower-cased before the substring check.

    Args:
        trace: The trace to grade; ``ideal`` and ``output`` are read.

    Returns:
        A result dict with keys ``grader``, ``passed``, ``score`` (1.0 or
        0.0) and ``reason`` (empty when the sample passes).
    """
    expected = (trace.ideal or "").strip().lower()
    output = trace.output or ""
    actual = output.strip().lower()

    if not expected:
        # "" is a substring of every string, so a sample without a reference
        # would otherwise pass no matter what the agent answered.
        return {
            "grader": "string_contains",
            "passed": False,
            "score": 0.0,
            "reason": "sample has no 'ideal' reference to match against",
        }

    passed = expected in actual
    return {
        "grader": "string_contains",
        "passed": passed,
        "score": 1.0 if passed else 0.0,
        "reason": "" if passed else f"expected to contain '{trace.ideal}', got '{output[:120]}'",
    }
# graders/json_schema_grader.py
import json
from trace import Trace
def grade(trace: Trace) -> dict:
    """Grade a JSON-extraction trace: valid JSON, right keys, right values,
    AND the correct tools were called.

    Args:
        trace: The trace to grade; ``output``, ``ideal``, ``tool_calls`` and
            ``expected_tools`` are read.

    Returns:
        A result dict with keys ``grader``, ``passed``, ``score`` (1.0 or
        0.0) and ``reason`` (all failure reasons joined with ``"; "``).
    """
    reasons: list[str] = []

    # 1. Output must parse as JSON matching the ideal.
    # The two parses are separate so a broken or missing reference (a dataset
    # problem) is not reported as a problem with the agent's output.
    both_parsed = True
    actual = ideal = None
    try:
        actual = json.loads(trace.output)
    except (json.JSONDecodeError, TypeError) as e:
        reasons.append(f"output not valid JSON: {e}")
        both_parsed = False
    try:
        ideal = json.loads(trace.ideal)
    except (json.JSONDecodeError, TypeError) as e:
        reasons.append(f"reference 'ideal' not valid JSON: {e}")
        both_parsed = False
    if both_parsed and actual != ideal:
        reasons.append(f"json mismatch: expected {ideal}, got {actual}")

    # 2. Trace-level check: exactly the expected tools were used.
    # Sorting makes the comparison order-insensitive but still sensitive to
    # how many times each tool was called.
    if sorted(trace.tool_calls) != sorted(trace.expected_tools):
        reasons.append(
            f"tool calls {trace.tool_calls} != expected {trace.expected_tools}"
        )

    passed = not reasons
    return {
        "grader": "python_json+tools",
        "passed": passed,
        "score": 1.0 if passed else 0.0,
        "reason": "; ".join(reasons),
    }
Use a model grader only when correctness is a judgment call. Pin the model and force a structured verdict.
# graders/rubric_grader.py
import json
from openai import OpenAI
from trace import Trace
# Module-level OpenAI client, created with no arguments when this module is
# imported and used by grade() for every grading request.
# NOTE(review): presumably picks up credentials from the environment (the CI
# workflow sets OPENAI_API_KEY) — confirm.
client = OpenAI()
# Model used for every rubric-grading call.
# NOTE(review): "gpt-4o-mini" looks like a model alias rather than a dated
# snapshot id — confirm whether it is as pinned as the comment claims.
GRADER_MODEL = "gpt-4o-mini" # pinned
# System prompt sent to the grader model: lists the pass/fail criteria and the
# JSON verdict shape ("passed", "score", "reason") that grade() parses.
RUBRIC = """You are grading a customer-support agent's answer.
Return JSON: {"passed": bool, "score": float 0-1, "reason": str}.
Criteria:
- ACCURATE: consistent with the reference answer.
- IN-SCOPE: only answers account/billing topics; refuses off-topic politely.
- NO HALLUCINATION: makes no claim absent from the reference.
Fail if any criterion is violated."""
def grade(trace: Trace) -> dict:
    """Grade a trace with the model rubric and return a structured verdict.

    Sends the question, reference answer and agent answer to ``GRADER_MODEL``
    with ``RUBRIC`` as the system prompt, then parses the JSON verdict.

    Args:
        trace: The trace to grade; ``input``, ``ideal`` and ``output`` are read.

    Returns:
        A result dict with keys ``grader``, ``passed``, ``score`` and
        ``reason``. A verdict that cannot be parsed or lacks a boolean
        ``passed`` / numeric ``score`` is returned as a failure with an
        explanatory reason instead of raising, so one bad grader response
        does not abort the whole eval run.
    """
    user = (f"Question: {trace.input}\n"
            f"Reference answer: {trace.ideal}\n"
            f"Agent answer: {trace.output}\n\nGrade it.")
    resp = client.chat.completions.create(
        model=GRADER_MODEL,
        temperature=0,
        response_format={"type": "json_object"},
        messages=[{"role": "system", "content": RUBRIC},
                  {"role": "user", "content": user}],
    )
    raw = resp.choices[0].message.content
    try:
        verdict = json.loads(raw)
        if not isinstance(verdict, dict):
            raise TypeError("verdict is not a JSON object")
        passed = verdict["passed"]
        # bool("false") is True, so a stringified verdict must not be coerced.
        if not isinstance(passed, bool):
            raise TypeError(f"'passed' is not a boolean: {passed!r}")
        score = float(verdict["score"])
    except (KeyError, TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError, so bad JSON lands here too.
        return {
            "grader": "model_rubric",
            "passed": False,
            "score": 0.0,
            "reason": f"malformed grader verdict ({e!r}): {str(raw)[:120]}",
        }
    return {
        "grader": "model_rubric",
        "passed": passed,
        "score": score,
        "reason": verdict.get("reason", ""),
    }
Map each dataset sample type to the grader that should score it. This config is the declarative description of the eval run.
{
"eval_name": "support_agent_v3",
"dataset": "evals/data/support_agent.jsonl",
"grader_model": "gpt-4o-mini",
"pass_rate_threshold": 0.9,
"grading_rules": [
{ "match_id_prefix": "json-", "grader": "graders.json_schema_grader" },
{ "match_id_prefix": "out-of-scope", "grader": "graders.rubric_grader" },
{ "default": true, "grader": "graders.exact_match" }
]
}
# run_eval.py
import importlib
import json
from trace import run_agent_on_dataset
from my_agent import build_agent
def load_jsonl(path: str) -> list[dict]:
    """Load a JSON Lines file into a list, one parsed object per line.

    Blank and whitespace-only lines are skipped.

    Args:
        path: Path of the ``.jsonl`` file.

    Returns:
        The parsed objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the dataset reads the same on every
    # platform instead of depending on the locale's default encoding.
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
def pick_grader(sample_id: str, rules: list[dict]):
    """Return the ``grade`` function of the grader configured for a sample.

    The first rule whose ``match_id_prefix`` is a prefix of ``sample_id``
    wins. If none matches, the first rule marked ``default`` is used.

    Args:
        sample_id: Identifier of the sample being graded.
        rules: Grading rules; each has a ``grader`` module path plus either
            ``match_id_prefix`` or a truthy ``default``.

    Returns:
        The ``grade`` attribute of the imported grader module.

    Raises:
        ValueError: If no prefix rule matches and there is no default rule.
    """
    chosen = None
    for rule in rules:
        prefix = rule.get("match_id_prefix")
        if prefix and sample_id.startswith(prefix):
            chosen = rule
            break
    else:
        # No prefix matched; fall back to the default rule, if any.
        chosen = next((r for r in rules if r.get("default")), None)

    if chosen is None:
        # A bare StopIteration from next() would hide the real cause.
        raise ValueError(
            f"no grading rule matches sample id {sample_id!r} "
            "and no default rule is configured"
        )
    return importlib.import_module(chosen["grader"]).grade
def main() -> None:
    """Run the eval suite end to end and write ``results.json``.

    Reads ``evals/config/support_agent.eval.json``, loads the dataset it
    names, runs the agent on every sample, grades each trace with the grader
    picked by the config's ``grading_rules``, prints one line per sample, and
    writes a report (eval name, pass rate, threshold, sample count,
    per-sample results) to ``results.json`` in the current directory.
    """
    with open("evals/config/support_agent.eval.json", encoding="utf-8") as f:
        config = json.load(f)
    dataset = load_jsonl(config["dataset"])
    traces = run_agent_on_dataset(dataset, build_agent())

    results = []
    for trace in traces:
        grade = pick_grader(trace.sample_id, config["grading_rules"])
        result = grade(trace)
        result["sample_id"] = trace.sample_id
        results.append(result)
        flag = "PASS" if result["passed"] else "FAIL"
        print(f"[{flag}] {trace.sample_id:20s} score={result['score']:.2f} {result['reason']}")

    passed = sum(r["passed"] for r in results)
    # An empty run scores 0% so the report is still written and cannot pass
    # vacuously, instead of dying here with ZeroDivisionError.
    pass_rate = passed / len(results) if results else 0.0
    report = {
        "eval_name": config["eval_name"],
        "pass_rate": round(pass_rate, 4),
        "threshold": config["pass_rate_threshold"],
        "n": len(results),
        "results": results,
    }
    # Close the handle explicitly so the report is fully flushed on return.
    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f"\nPass rate: {pass_rate:.1%} (threshold {config['pass_rate_threshold']:.0%})")


if __name__ == "__main__":
    main()
# gate.py
import json
import sys
# CI gate: exits with status 1 when the recorded pass rate is below the
# threshold stored in the same report, listing the failing sample ids.
# Reads results.json from the current working directory.
with open("results.json", encoding="utf-8") as f:  # closed on block exit
    report = json.load(f)

if report["pass_rate"] < report["threshold"]:
    failed = [r["sample_id"] for r in report["results"] if not r["passed"]]
    print(f"EVAL FAILED: {report['pass_rate']:.1%} < {report['threshold']:.0%}")
    print("Failing samples: " + ", ".join(failed))
    sys.exit(1)

print(f"EVAL PASSED: {report['pass_rate']:.1%}")
# .github/workflows/agent-evals.yml
name: Agent Evals
# Runs only for pull requests that touch the agent, its prompts, or the evals.
on:
  pull_request:
    paths: ["src/agent/**", "src/prompts/**", "evals/**"]
jobs:
  evals:
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with: { python-version: "3.12" }
      - run: pip install -r requirements.txt # includes openai
      # Writes results.json into the working directory.
      - name: Run evals
        env: { OPENAI_API_KEY: "${{ secrets.OPENAI_API_KEY }}" }
        run: python evals/run_eval.py
      # Exits non-zero, failing the job, when the pass rate is below threshold.
      - name: Gate on pass rate
        run: python evals/gate.py
      # always() keeps the report available even when the gate step fails.
      - uses: actions/upload-artifact@v4
        if: always()
        with: { name: eval-results, path: results.json }
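response_format=json_object plus an explicit reason field makes failures debuggable.
Do not use a model grader for a check that == could do. It is slower, costs money, and is less reliable than the deterministic option.
Trigger when the user asks to evaluate agent traces, build an eval, or grade model outputs.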
For RAG-specific metrics (faithfulness, context precision/recall), use the RAG Evaluation Metrics skill. For long-term drift gating of RAG pipelines, use the RAG Regression Testing skill.
- name: Install QA Skills
  run: npx @qaskills/cli add openai-evals-trace-grading
12 of 29 agents supported