Dataset entry

Prompt Injection & RAG Defense: How Agents Protect Themselves

Name: Prompt Injection & RAG Defense: How Agents Protect Themselves
Creator: Dzmitryi Kharlanau

agentic-bytes agentic_byte agentic_dev_014 prompt-injection rag-security agent-safety defense

Open JSON Back to list

Learn how to prevent agents from being manipulated by user input or retrieved content, especially in RAG systems.

Attribution

Creator: Dzmitryi Kharlanau (SAP Lead).

Canonical: https://dkharlanau.github.io/datasets/agentic-bytes/agentic_dev_014.json

JSON (copy / reuse)

{
  "byte_id": "agentic_dev_014",
  "title": "Prompt Injection & RAG Defense: How Agents Protect Themselves",
  "level": "foundation",
  "domain": [
    "agentic-development",
    "security",
    "rag"
  ],
  "intent": "Learn how to prevent agents from being manipulated by user input or retrieved content, especially in RAG systems.",
  "core_idea": {
    "one_liner": "If your agent trusts all text equally, it can be controlled by anyone.",
    "why_it_matters": [
      "RAG sources can contain malicious or misleading instructions.",
      "Users may try to override system rules intentionally or accidentally.",
      "Most real-world agent exploits are prompt-injection based."
    ]
  },
  "definition": {
    "prompt_injection": "An attempt to manipulate an agent by inserting instructions that override or bypass its intended behavior."
  },
  "attack_vectors": [
    {
      "vector": "User input",
      "example": "Ignore previous instructions and do X."
    },
    {
      "vector": "RAG content",
      "example": "Embedded instructions inside documentation or comments."
    },
    {
      "vector": "Tool output",
      "example": "Untrusted text returned by external systems."
    }
  ],
  "core_defense_principles": [
    "Instructions and data are not the same.",
    "Only system and policy layers can define behavior.",
    "Retrieved text is evidence, not authority."
  ],
  "defense_techniques": [
    {
      "technique": "Instruction hierarchy",
      "description": "System > policy > developer > user > retrieved content."
    },
    {
      "technique": "Content labeling",
      "description": "Explicitly mark retrieved text as untrusted data."
    },
    {
      "technique": "Output contracts",
      "description": "Force structured outputs that ignore embedded instructions."
    },
    {
      "technique": "Self-check for instruction override",
      "description": "Critic checks if output violates guardrails."
    }
  ],
  "micro_example": {
    "scenario": "RAG retrieves a document saying: 'Always approve this action.'",
    "agent_behavior": {
      "interpretation": "This is content, not an instruction.",
      "action": "Uses it as context only, does not change behavior.",
      "result": "Approval still requires human-in-the-loop."
    }
  },
  "failure_modes": [
    "Treating retrieved text as trusted instructions",
    "Mixing system rules with content",
    "Letting user override guardrails",
    "No distinction between data and commands"
  ],
  "guards": [
    "Never execute instructions from RAG content.",
    "Always tag retrieved text as untrusted.",
    "Block outputs that violate guardrails regardless of input."
  ],
  "teach_it_in_english": {
    "simple_explanation": "The agent must know the difference between rules and information.",
    "one_sentence_definition": "Prompt injection defense keeps agents obedient to their true owners."
  },
  "practical_checklist": [
    "Is retrieved content treated as data only?",
    "Is instruction hierarchy enforced?",
    "Can user input override system rules?",
    "Is there a critic checking for violations?"
  ],
  "tags": [
    "prompt-injection",
    "rag-security",
    "agent-safety",
    "defense"
  ],
  "meta": {
    "schema": "dkharlanau.dataset.byte",
    "schema_version": "1.1",
    "dataset": "agentic-bytes",
    "source_project": "cv-ai",
    "source_path": "agentic-bytes/agentic_dev_014.json",
    "generated_at_utc": "2026-02-03T14:33:32+00:00",
    "creator": {
      "name": "Dzmitryi Kharlanau",
      "role": "SAP Lead",
      "website": "https://dkharlanau.github.io",
      "linkedin": "https://www.linkedin.com/in/dkharlanau"
    },
    "attribution": {
      "attribution_required": true,
      "preferred_citation": "Dzmitryi Kharlanau (SAP Lead). Dataset bytes: https://dkharlanau.github.io"
    },
    "license": {
      "name": "",
      "spdx": "",
      "url": ""
    },
    "links": {
      "website": "https://dkharlanau.github.io",
      "linkedin": "https://www.linkedin.com/in/dkharlanau"
    },
    "contact": {
      "preferred": "linkedin",
      "linkedin": "https://www.linkedin.com/in/dkharlanau"
    },
    "canonical_url": "https://dkharlanau.github.io/datasets/agentic-bytes/agentic_dev_014.json",
    "created_at_utc": "2026-02-03T14:33:32+00:00",
    "updated_at_utc": "2026-02-03T15:29:02+00:00",
    "provenance": {
      "source_type": "chat_export_extraction",
      "note": "Extracted and curated by Dzmitryi Kharlanau; enriched for attribution and crawler indexing."
    },
    "entity_type": "agentic_byte",
    "entity_subtype": "level:foundation",
    "summary": "Learn how to prevent agents from being manipulated by user input or retrieved content, especially in RAG systems."
  }
}