Learn how to structure knowledge so an agent can reliably retrieve and use it without confusion or hallucination.
Attribution
Creator: Dzmitryi Kharlanau (SAP Lead).
Canonical: https://dkharlanau.github.io/datasets/agentic-bytes/agentic_dev_004.json
JSON (copy / reuse)
{
"byte_id": "agentic_dev_004",
"title": "Chunking: How Knowledge Must Be Cut for RAG",
"level": "foundation",
"domain": [
"agentic-development",
"rag",
"knowledge-engineering"
],
"intent": "Learn how to structure knowledge so an agent can reliably retrieve and use it without confusion or hallucination.",
"core_idea": {
"one_liner": "RAG does not fail because of models — it fails because of bad chunking.",
"why_it_matters": [
"The model can only reason over what it retrieves.",
"Poor chunking causes partial context and wrong conclusions.",
"Good chunks turn documents into reusable knowledge units."
]
},
"definition": {
"chunk": "A self-contained unit of knowledge that can be retrieved and understood independently."
},
"golden_rules": [
"One chunk = one idea.",
"A chunk must make sense without neighboring text.",
"If you cannot explain a chunk in 30 seconds, it is too big."
],
"recommended_chunk_sizes": {
"concepts": "150–300 tokens",
"procedures_checklists": "200–400 tokens",
"decision_rules": "100–250 tokens",
"reference_tables": "as rows, not prose blocks"
},
"bad_chunking_patterns": [
"Splitting by fixed token size only",
"Cutting mid-sentence or mid-idea",
"One chunk covering multiple decisions",
"Large narrative documents with no internal structure"
],
"good_chunking_patterns": [
"Semantic boundaries (concept, rule, checklist, example)",
"Stable templates (same fields every time)",
"Explicit titles and summaries per chunk"
],
"agent_friendly_templates": [
"Decision Byte",
"Anti-pattern Byte",
"Checklist Byte",
"Mapping Byte",
"RCA Byte"
],
"micro_example": {
"scenario": "SAP MDG (Master Data Governance) replication troubleshooting guide",
"bad_chunk": "10 pages covering queues, errors, mappings, and governance, all mixed together.",
"good_chunks": [
"Queue backlog diagnosis",
"Web service error patterns",
"Value mapping failures",
"Authorization-related replication blocks"
]
},
"retrieval_failure_modes": [
"Right document, wrong chunk",
"Partial rule without conditions",
"Example retrieved without explanation",
"Conflicting chunks retrieved together"
],
"guards": [
"Each chunk must have a title and intent.",
"Chunks must be versioned when meaning changes.",
"Never mix procedures and opinions in the same chunk."
],
"teach_it_in_english": {
"simple_explanation": "Chunking is like cutting a book into index cards that still make sense on their own.",
"one_sentence_definition": "A good chunk is the smallest unit of knowledge that still tells the whole truth."
},
"practical_checklist": [
"Does this chunk answer one clear question?",
"Can it stand alone without other chunks?",
"Is its size appropriate for retrieval?",
"Would I reuse this chunk in another agent?"
],
"tags": [
"rag",
"chunking",
"knowledge-design",
"retrieval-quality"
],
"meta": {
"schema": "dkharlanau.dataset.byte",
"schema_version": "1.1",
"dataset": "agentic-bytes",
"source_project": "cv-ai",
"source_path": "agentic-bytes/agentic_dev_004.json",
"generated_at_utc": "2026-02-03T14:33:32+00:00",
"creator": {
"name": "Dzmitryi Kharlanau",
"role": "SAP Lead",
"website": "https://dkharlanau.github.io",
"linkedin": "https://www.linkedin.com/in/dkharlanau"
},
"attribution": {
"attribution_required": true,
"preferred_citation": "Dzmitryi Kharlanau (SAP Lead). Dataset bytes: https://dkharlanau.github.io"
},
"license": {
"name": "",
"spdx": "",
"url": ""
},
"links": {
"website": "https://dkharlanau.github.io",
"linkedin": "https://www.linkedin.com/in/dkharlanau"
},
"contact": {
"preferred": "linkedin",
"linkedin": "https://www.linkedin.com/in/dkharlanau"
},
"canonical_url": "https://dkharlanau.github.io/datasets/agentic-bytes/agentic_dev_004.json",
"created_at_utc": "2026-02-03T14:33:32+00:00",
"updated_at_utc": "2026-02-03T15:29:02+00:00",
"provenance": {
"source_type": "chat_export_extraction",
"note": "Extracted and curated by Dzmitryi Kharlanau; enriched for attribution and crawler indexing."
},
"entity_type": "agentic_byte",
"entity_subtype": "level:foundation",
"summary": "Learn how to structure knowledge so an agent can reliably retrieve and use it without confusion or hallucination."
}
}