Learn how to evaluate agents systematically so improvements do not break existing behavior.
Attribution
Creator: Dzmitryi Kharlanau (SAP Lead).
Canonical: https://dkharlanau.github.io/datasets/agentic-bytes/agentic_dev_011.json
JSON (copy / reuse)
{
"byte_id": "agentic_dev_011",
"title": "Golden Set & Evals: How to Know Your Agent Works",
"level": "foundation",
"domain": [
"agentic-development",
"evaluation",
"quality"
],
"intent": "Learn how to evaluate agents systematically so improvements do not break existing behavior.",
"core_idea": {
"one_liner": "If you cannot measure agent quality, you cannot improve it.",
"why_it_matters": [
"Agents regress silently after prompt or knowledge changes.",
"Human impressions are inconsistent.",
"Evaluation turns agent development into engineering."
]
},
"definition": {
"golden_set": "A curated set of representative questions and tasks with expected outcomes used for repeatable evaluation.",
"eval": "A process that compares agent outputs against expectations using clear criteria."
},
"what_a_good_golden_set_contains": [
"Easy, medium, and hard cases",
"Edge cases and failure scenarios",
"Ambiguous inputs",
"Previously broken cases (regressions)"
],
"eval_dimensions": [
{
"dimension": "Correctness",
"question": "Is the answer factually and logically correct?"
},
{
"dimension": "Grounding",
"question": "Are claims supported by retrieved knowledge or tools?"
},
{
"dimension": "Safety",
"question": "Did the agent respect guardrails?"
},
{
"dimension": "Usefulness",
"question": "Would a real user consider this helpful?"
},
{
"dimension": "Cost & latency",
"question": "Is the quality acceptable for the time and cost?"
}
],
"eval_patterns": [
{
"pattern": "Offline eval",
"description": "Run the agent against the golden set after any change."
},
{
"pattern": "Regression eval",
"description": "Ensure old correct answers stay correct."
},
{
"pattern": "Shadow eval",
"description": "Test a new version in parallel without affecting users."
}
],
"micro_example": {
"scenario": "MDG replication troubleshooting agent",
"golden_case": {
"input": "Replication delayed for only one business partner type",
"expected": "Agent asks for object-specific filters and mapping checks",
"failure_signal": "Agent suggests generic system performance issues"
}
},
"failure_modes": [
"Golden set too small or too clean",
"Evaluating only happy paths",
"Changing expectations without versioning",
"Ignoring eval failures under time pressure"
],
"guards": [
"Every agent change must run evals.",
"Eval results must be stored and compared over time.",
"Failing evals block release."
],
"teach_it_in_english": {
"simple_explanation": "A golden set is a test exam your agent must pass every time.",
"one_sentence_definition": "Evals turn agent behavior into something you can trust."
},
"practical_checklist": [
"Do we have representative test cases?",
"Are failures actionable?",
"Do evals run automatically?",
"Can we detect regressions early?"
],
"tags": [
"evaluation",
"golden-set",
"regression",
"agent-quality"
],
"meta": {
"schema": "dkharlanau.dataset.byte",
"schema_version": "1.1",
"dataset": "agentic-bytes",
"source_project": "cv-ai",
"source_path": "agentic-bytes/agentic_dev_011.json",
"generated_at_utc": "2026-02-03T14:33:32+00:00",
"creator": {
"name": "Dzmitryi Kharlanau",
"role": "SAP Lead",
"website": "https://dkharlanau.github.io",
"linkedin": "https://www.linkedin.com/in/dkharlanau"
},
"attribution": {
"attribution_required": true,
"preferred_citation": "Dzmitryi Kharlanau (SAP Lead). Dataset bytes: https://dkharlanau.github.io"
},
"license": {
"name": "",
"spdx": "",
"url": ""
},
"links": {
"website": "https://dkharlanau.github.io",
"linkedin": "https://www.linkedin.com/in/dkharlanau"
},
"contact": {
"preferred": "linkedin",
"linkedin": "https://www.linkedin.com/in/dkharlanau"
},
"canonical_url": "https://dkharlanau.github.io/datasets/agentic-bytes/agentic_dev_011.json",
"created_at_utc": "2026-02-03T14:33:32+00:00",
"updated_at_utc": "2026-02-03T15:29:02+00:00",
"provenance": {
"source_type": "chat_export_extraction",
"note": "Extracted and curated by Dzmitryi Kharlanau; enriched for attribution and crawler indexing."
},
"entity_type": "agentic_byte",
"entity_subtype": "level:foundation",
"summary": "Learn how to evaluate agents systematically so improvements do not break existing behavior."
}
}
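A minimal sketch of how the offline eval and release-gate guards above might look in code. The case ID, the substring-based grading, and the agent callable are all assumptions for illustration, not part of the byte; a real harness would use the team's actual agent interface and richer graders (human review or a model-based judge) for correctness, grounding, safety, and usefulness.
Python sketch (offline eval + release gate)
from dataclasses import dataclass, field
from typing import Callable

@dataclass
class GoldenCase:
    """One golden-set entry: an input plus expected and failure signals."""
    case_id: str
    input_text: str
    expected_signals: list[str] = field(default_factory=list)
    failure_signals: list[str] = field(default_factory=list)

# Golden set mirroring the micro example above (the case ID is made up).
GOLDEN_SET = [
    GoldenCase(
        case_id="mdg-replication-001",
        input_text="Replication delayed for only one business partner type",
        expected_signals=["object-specific filter", "mapping"],
        failure_signals=["system performance"],
    ),
]

def evaluate(agent: Callable[[str], str], golden_set: list[GoldenCase]) -> dict[str, bool]:
    """Run every golden case and grade it with simple substring checks."""
    results: dict[str, bool] = {}
    for case in golden_set:
        output = agent(case.input_text).lower()
        grounded = all(s.lower() in output for s in case.expected_signals)
        regressed = any(s.lower() in output for s in case.failure_signals)
        results[case.case_id] = grounded and not regressed
    return results

def gate_release(results: dict[str, bool]) -> None:
    """Failing evals block release, as stated in the guards above."""
    failed = sorted(case_id for case_id, passed in results.items() if not passed)
    if failed:
        raise SystemExit(f"Eval failures block release: {failed}")

if __name__ == "__main__":
    # Stand-in agent so the sketch runs end to end; replace with the real agent call.
    def stub_agent(prompt: str) -> str:
        return "Check the object-specific filter and mapping for that business partner type."

    gate_release(evaluate(stub_agent, GOLDEN_SET))
The grading here is deliberately crude; the point is the shape of the loop (run every golden case, grade it, block release on failures), which stays the same regardless of how grading is done.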
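The regression pattern and the "store and compare over time" guard can be sketched the same way: persist each run, then flag any case that passed before and fails now. The file layout, run IDs, and toy results below are assumptions for illustration only.
Python sketch (stored runs + regression check)
import json
from pathlib import Path

RESULTS_DIR = Path("eval_results")  # assumed location for stored runs

def save_run(run_id: str, results: dict[str, bool]) -> None:
    """Persist a run so later runs can be compared against it."""
    RESULTS_DIR.mkdir(exist_ok=True)
    (RESULTS_DIR / f"{run_id}.json").write_text(json.dumps(results, indent=2))

def load_run(run_id: str) -> dict[str, bool]:
    """Load a previously stored run."""
    return json.loads((RESULTS_DIR / f"{run_id}.json").read_text())

def find_regressions(previous: dict[str, bool], current: dict[str, bool]) -> list[str]:
    """A regression is any case that passed before and fails now."""
    return sorted(
        case_id
        for case_id, passed_before in previous.items()
        if passed_before and not current.get(case_id, False)
    )

if __name__ == "__main__":
    # Toy data: the candidate run breaks a case that used to pass.
    save_run("baseline", {"mdg-replication-001": True, "easy-001": True})
    save_run("candidate", {"mdg-replication-001": False, "easy-001": True})
    regressions = find_regressions(load_run("baseline"), load_run("candidate"))
    if regressions:
        raise SystemExit(f"Regressions detected: {regressions}")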