by thetestingacademy
Measure RAG pipeline quality with context precision/recall, faithfulness, answer relevancy, and groundedness using Ragas and DeepEval, with golden datasets and pass/fail thresholds.
npx @qaskills/cli add rag-evaluation-metrics
Auto-detects your AI agent and installs the skill. Works with Claude Code, Cursor, Copilot, and more.
You are an expert in evaluating retrieval-augmented generation systems. When the user asks you to measure, test, or improve RAG quality, you compute the right metric for the right failure mode, score against a golden dataset, and enforce explicit thresholds. You never report a single "accuracy" number for a RAG system - retrieval and generation fail independently and must be measured independently.
| Metric | Grades | Question it answers | Needs ground truth? |
|---|---|---|---|
| Context Precision | Retrieval | Are the relevant retrieved chunks ranked at the top? | Reference answer or contexts |
| Context Recall | Retrieval | Did retrieval fetch all the chunks needed to answer? | Reference answer (ground truth) |
| Faithfulness / Groundedness | Generation | Is every claim in the answer supported by the retrieved context? | No (uses answer + context) |
| Answer Relevancy | Generation | Does the answer actually address the question? | No (uses question + answer) |
A golden set is the contract. Store it in a versioned file (JSON, or a Python module as shown below) so diffs are reviewable.
# golden_dataset.py
from dataclasses import dataclass, field
@dataclass
class GoldenSample:
    """One golden-set entry: a question, its reference answer, and reference contexts."""

    # The question posed to the RAG pipeline under test.
    question: str
    # The ideal reference answer.
    ground_truth: str
    # Reference context passages; defaults to a fresh empty list per instance.
    reference_contexts: list[str] = field(default_factory=list)
# Golden samples the RAG pipeline is scored against. Each entry pairs a
# question with its reference answer and the passage(s) supporting it.
GOLDEN_SET: list[GoldenSample] = [
    # Refund policy for digital goods.
    GoldenSample(
        question="What is the refund window for digital products?",
        ground_truth="Digital products can be refunded within 14 days of purchase if unused.",
        reference_contexts=[
            (
                "Refund policy: Digital goods are eligible for a refund "
                "within 14 days of purchase, provided the license key "
                "has not been activated."
            ),
        ],
    ),
    # Pro plan support entitlement.
    GoldenSample(
        question="Does the Pro plan include priority support?",
        ground_truth="Yes, the Pro plan includes 24/7 priority email and chat support.",
        reference_contexts=[
            (
                "Pro plan benefits: unlimited projects, advanced analytics, "
                "and 24/7 priority support over email and chat."
            ),
        ],
    ),
]
Ragas computes all four metrics from a dataset with `question`, `answer`, `contexts`, and `ground_truth` columns. You produce `answer` and `contexts` by running your actual RAG pipeline.
# eval_ragas.py
import os
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
context_precision,
context_recall,
faithfulness,
answer_relevancy,
)
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from golden_dataset import GOLDEN_SET
from my_rag_app import rag_pipeline # your system under test
def build_eval_dataset() -> Dataset:
    """Run the RAG pipeline on every golden sample and collect the evaluation columns.

    Returns:
        A ``Dataset`` with ``question``, ``answer``, ``contexts`` and
        ``ground_truth`` columns, one row per entry in ``GOLDEN_SET``.
    """
    questions, answers, retrieved, references = [], [], [], []
    for golden in GOLDEN_SET:
        # The pipeline returns a mapping with "answer" and "contexts".
        output = rag_pipeline(golden.question)
        questions.append(golden.question)
        answers.append(output["answer"])
        retrieved.append(output["contexts"])  # list[str], retrieved chunks
        references.append(golden.ground_truth)
    return Dataset.from_dict(
        {
            "question": questions,
            "answer": answers,
            "contexts": retrieved,
            "ground_truth": references,
        }
    )
def run() -> None:
    """Evaluate the pipeline with Ragas and print per-question scores and their means."""
    # Result-frame columns holding the four metric scores.
    metric_names = [
        "context_precision",
        "context_recall",
        "faithfulness",
        "answer_relevancy",
    ]

    # Pin the judge model + temperature=0 for reproducible scores.
    judge_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", temperature=0))
    embedder = OpenAIEmbeddings(model="text-embedding-3-small")

    scores = evaluate(
        build_eval_dataset(),
        metrics=[context_precision, context_recall, faithfulness, answer_relevancy],
        llm=judge_llm,
        embeddings=embedder,
    )

    frame = scores.to_pandas()
    print(frame[["question", *metric_names]])
    print("\nMeans:\n", frame[metric_names].mean())
if __name__ == "__main__":
    # Fail fast with an explicit exit rather than `assert`: assertions are
    # stripped when Python runs with -O, which would let the evaluation start
    # without an API key.
    if not os.environ.get("OPENAI_API_KEY"):
        raise SystemExit("set OPENAI_API_KEY")
    run()
DeepEval frames each metric as an assertable test case, which slots cleanly into pytest. It is the better choice when you want metric failures to fail a CI build.
# test_rag_deepeval.py
import pytest
from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import (
ContextualPrecisionMetric,
ContextualRecallMetric,
FaithfulnessMetric,
AnswerRelevancyMetric,
)
from golden_dataset import GOLDEN_SET
from my_rag_app import rag_pipeline
# Judge model name passed to every DeepEval metric in this module.
JUDGE: str = "gpt-4o-mini"
def _build_case(sample) -> LLMTestCase:
    """Run the pipeline on one golden sample and wrap the outcome as a DeepEval test case."""
    output = rag_pipeline(sample.question)
    case_fields = {
        "input": sample.question,
        "actual_output": output["answer"],
        "expected_output": sample.ground_truth,
        "retrieval_context": output["contexts"],
    }
    return LLMTestCase(**case_fields)
@pytest.mark.parametrize("sample", GOLDEN_SET, ids=lambda s: s.question[:40])
def test_rag_quality(sample):
    """Check one golden sample against all four metric thresholds."""
    case = _build_case(sample)

    precision = ContextualPrecisionMetric(threshold=0.8, model=JUDGE)
    recall = ContextualRecallMetric(threshold=0.8, model=JUDGE)
    # Strictest bar: no hallucinations.
    faithfulness = FaithfulnessMetric(threshold=0.9, model=JUDGE)
    relevancy = AnswerRelevancyMetric(threshold=0.75, model=JUDGE)

    # Fails the test (and the build) if any metric is below its threshold.
    assert_test(case, [precision, recall, faithfulness, relevancy])
Run it like any pytest suite: `deepeval test run test_rag_deepeval.py`, or plain `pytest test_rag_deepeval.py`.
Start here and tighten as the pipeline matures. Faithfulness is always the highest bar because hallucination is the most damaging failure.
# Minimum acceptable mean score per metric.
THRESHOLDS = {
    "faithfulness": 0.90,  # strictest - production hallucination guard
    "context_precision": 0.80,  # good retrievers rank relevant chunks first
    "context_recall": 0.80,  # missing context is a retrieval/chunking bug
    "answer_relevancy": 0.75,  # answers should stay on-topic
}


def assert_thresholds(means: dict[str, float]) -> None:
    """Raise ``AssertionError`` listing every metric that misses its threshold.

    Args:
        means: Mean score per metric name. A metric absent from this mapping
            is scored as 0.0 and therefore reported as a failure.

    Raises:
        AssertionError: If any metric in ``THRESHOLDS`` is missing, NaN, or
            below its threshold. The message lists each failing metric.
    """
    failures = []
    for metric, threshold in THRESHOLDS.items():
        # Look the score up once and reuse it in the message: indexing
        # ``means[metric]`` there would raise KeyError for a missing metric
        # instead of reporting it as a failure.
        score = means.get(metric, 0.0)
        # ``not (score >= threshold)`` is also true for NaN, so a metric that
        # could not be scored cannot slip through the gate.
        if not (score >= threshold):
            failures.append(f"{metric}: {score:.3f} < {threshold:.2f}")
    if failures:
        raise AssertionError("RAG metrics below threshold:\n " + "\n ".join(failures))
Use the pair of scores to localize the defect instead of guessing:
def diagnose(scores: dict[str, float], thresholds: "dict[str, float] | None" = None) -> str:
    """Localize a RAG failure to the retrieval layer, the generation layer, or both.

    Args:
        scores: Score per metric name. Reads ``context_precision``,
            ``context_recall``, ``faithfulness`` and ``answer_relevancy``.
        thresholds: Optional per-metric minimums that override the built-in
            defaults (0.8 / 0.8 / 0.9 / 0.75). Pass the same mapping used for
            CI gating so the diagnosis and the gate cannot drift apart.
            Unknown keys are ignored; omitted keys keep their default.

    Returns:
        A human-readable diagnosis string.

    Raises:
        KeyError: If a metric that needs to be checked is missing from
            ``scores``.
    """
    limits = {
        "context_precision": 0.8,
        "context_recall": 0.8,
        "faithfulness": 0.9,
        "answer_relevancy": 0.75,
    }
    if thresholds:
        limits.update(thresholds)

    # A layer is healthy only when both of its metrics meet their minimum.
    retrieval_ok = all(
        scores[metric] >= limits[metric]
        for metric in ("context_precision", "context_recall")
    )
    generation_ok = all(
        scores[metric] >= limits[metric]
        for metric in ("faithfulness", "answer_relevancy")
    )

    if retrieval_ok and generation_ok:
        return "Healthy."
    if generation_ok:
        # Only retrieval is failing.
        return ("Retrieval problem: fix chunking, embeddings, top_k, or reranking. "
                "Generation is faithful to whatever it is given.")
    if retrieval_ok:
        # Only generation is failing.
        return ("Generation problem: context is good but the model hallucinates or "
                "drifts. Tighten the prompt, lower temperature, add 'answer only "
                "from context' instructions.")
    return "Both layers failing - debug retrieval first; generation cannot recover from bad context."
Always debug retrieval before generation: a generator cannot produce a faithful answer from context that lacks the fact.
ground_truth).

Trigger when the user asks to measure, test, or improve RAG quality.
For regression gating in CI over time (detecting drift across builds), pair this with the RAG Regression Testing skill. For non-RAG agent evaluation, use the AI Agent Evaluation skill instead.
- name: Install QA Skills
  run: npx @qaskills/cli add rag-evaluation-metrics

12 of 29 agents supported