#!/usr/bin/env python3
"""
RAG Pipeline Designer - Designs complete RAG pipelines based on requirements.
This script analyzes requirements and generates a comprehensive RAG pipeline design
including architecture diagrams, component recommendations, configuration templates,
and cost projections.
Components designed:
- Chunking strategy recommendation
- Embedding model selection
- Vector database recommendation
- Retrieval approach (dense/sparse/hybrid)
- Reranking configuration
- Evaluation framework setup
- Production deployment patterns
No external dependencies - uses only Python standard library.
"""
import argparse
import json
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, asdict
from enum import Enum
class Scale(Enum):
"""System scale categories."""
SMALL = "small" # < 1M documents, < 1K queries/day
MEDIUM = "medium" # 1M-100M documents, 1K-100K queries/day
LARGE = "large" # 100M+ documents, 100K+ queries/day
class DocumentType(Enum):
"""Document type categories."""
TEXT = "text" # Plain text, articles
TECHNICAL = "technical" # Documentation, manuals
CODE = "code" # Source code files
SCIENTIFIC = "scientific" # Research papers, journals
LEGAL = "legal" # Legal documents, contracts
MIXED = "mixed" # Multiple document types
class Latency(Enum):
"""Latency requirements."""
REAL_TIME = "real_time" # < 100ms
INTERACTIVE = "interactive" # < 500ms
BATCH = "batch" # > 1s acceptable
@dataclass
class Requirements:
"""RAG system requirements."""
document_types: List[str]
document_count: int
avg_document_size: int # characters
queries_per_day: int
query_patterns: List[str] # e.g., ["factual", "conversational", "analytical"]
latency_requirement: str
budget_monthly: float # USD
accuracy_priority: float # 0-1 scale
cost_priority: float # 0-1 scale
maintenance_complexity: str # "low", "medium", "high"
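# Illustrative instantiation (example values, not recommendations):
#   Requirements(
#       document_types=["technical"], document_count=50_000,
#       avg_document_size=4_000, queries_per_day=500,
#       query_patterns=["factual"], latency_requirement="interactive",
#       budget_monthly=200.0, accuracy_priority=0.7, cost_priority=0.5,
#       maintenance_complexity="low")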
@dataclass
class ComponentRecommendation:
"""Recommendation for a pipeline component."""
name: str
type: str
config: Dict[str, Any]
rationale: str
pros: List[str]
cons: List[str]
cost_monthly: float
@dataclass
class PipelineDesign:
"""Complete RAG pipeline design."""
chunking: ComponentRecommendation
embedding: ComponentRecommendation
vector_db: ComponentRecommendation
retrieval: ComponentRecommendation
reranking: Optional[ComponentRecommendation]
evaluation: ComponentRecommendation
total_cost: float
architecture_diagram: str
config_templates: Dict[str, Any]
class RAGPipelineDesigner:
"""Main pipeline designer class."""
def __init__(self):
self.embedding_models = self._load_embedding_models()
self.vector_databases = self._load_vector_databases()
self.chunking_strategies = self._load_chunking_strategies()
def design_pipeline(self, requirements: Requirements) -> PipelineDesign:
"""Design complete RAG pipeline based on requirements."""
print(f"Designing RAG pipeline for {requirements.document_count:,} documents...")
# Determine system scale
scale = self._determine_scale(requirements)
print(f"System scale: {scale.value}")
# Design each component
chunking = self._recommend_chunking(requirements, scale)
embedding = self._recommend_embedding(requirements, scale)
vector_db = self._recommend_vector_db(requirements, scale)
retrieval = self._recommend_retrieval(requirements, scale)
reranking = self._recommend_reranking(requirements, scale)
evaluation = self._recommend_evaluation(requirements, scale)
# Calculate total cost
total_cost = (chunking.cost_monthly + embedding.cost_monthly +
vector_db.cost_monthly + retrieval.cost_monthly +
evaluation.cost_monthly)
if reranking:
total_cost += reranking.cost_monthly
# Generate architecture diagram
architecture = self._generate_architecture_diagram(
chunking, embedding, vector_db, retrieval, reranking, evaluation
)
# Generate configuration templates
configs = self._generate_config_templates(
chunking, embedding, vector_db, retrieval, reranking, evaluation
)
return PipelineDesign(
chunking=chunking,
embedding=embedding,
vector_db=vector_db,
retrieval=retrieval,
reranking=reranking,
evaluation=evaluation,
total_cost=total_cost,
architecture_diagram=architecture,
config_templates=configs
)
def _determine_scale(self, req: Requirements) -> Scale:
"""Determine system scale based on requirements."""
if req.document_count < 1_000_000 and req.queries_per_day < 1_000:
return Scale.SMALL
elif req.document_count < 100_000_000 and req.queries_per_day < 100_000:
return Scale.MEDIUM
else:
return Scale.LARGE
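# e.g. 5M documents at 10K queries/day lands in MEDIUM; exceeding either
# the document or the query threshold alone is enough to promote the tier.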
def _recommend_chunking(self, req: Requirements, scale: Scale) -> ComponentRecommendation:
"""Recommend chunking strategy."""
doc_types = set(req.document_types)
if "code" in doc_types:
strategy = "semantic_code_aware"
config = {"max_size": 1000, "preserve_functions": True, "overlap": 50}
rationale = "Code documents benefit from function/class boundary awareness"
elif "technical" in doc_types or "scientific" in doc_types:
strategy = "semantic_heading_aware"
config = {"max_size": 1500, "heading_weight": 2.0, "overlap": 100}
rationale = "Technical documents have clear hierarchical structure"
elif len(doc_types) > 2 or "mixed" in doc_types:
strategy = "adaptive_chunking"
config = {"strategies": ["paragraph", "sentence", "fixed"], "auto_select": True}
rationale = "Mixed document types require adaptive strategy selection"
else:
if req.avg_document_size > 5000:
strategy = "paragraph_based"
config = {"max_size": 2000, "min_paragraph_size": 100}
rationale = "Large documents benefit from paragraph-based chunking"
else:
strategy = "sentence_based"
config = {"max_size": 1000, "sentence_overlap": 1}
rationale = "Small to medium documents work well with sentence chunking"
return ComponentRecommendation(
name=strategy,
type="chunking",
config=config,
rationale=rationale,
pros=self._get_chunking_pros(strategy),
cons=self._get_chunking_cons(strategy),
cost_monthly=0.0 # Processing cost only
)
def _recommend_embedding(self, req: Requirements, scale: Scale) -> ComponentRecommendation:
"""Recommend embedding model."""
doc_types = set(req.document_types)
# Consider accuracy vs cost priority
high_accuracy = req.accuracy_priority > 0.7
cost_sensitive = req.cost_priority > 0.6
if "code" in doc_types:
if high_accuracy and not cost_sensitive:
model = "openai-code-search-ada-002"
cost_per_1k_tokens = 0.0001
dimensions = 1536
else:
model = "sentence-transformers/code-bert-base"
cost_per_1k_tokens = 0.0 # Self-hosted
dimensions = 768
elif "scientific" in doc_types:
if high_accuracy:
model = "openai-text-embedding-ada-002"
cost_per_1k_tokens = 0.0001
dimensions = 1536
else:
model = "sentence-transformers/scibert-nli"
cost_per_1k_tokens = 0.0
dimensions = 768
else:
if cost_sensitive or scale == Scale.SMALL:
model = "sentence-transformers/all-MiniLM-L6-v2"
cost_per_1k_tokens = 0.0
dimensions = 384
elif high_accuracy:
model = "openai-text-embedding-ada-002"
cost_per_1k_tokens = 0.0001
dimensions = 1536
else:
model = "sentence-transformers/all-mpnet-base-v2"
cost_per_1k_tokens = 0.0
dimensions = 768
# Estimate embedding cost; the one-time corpus embed is folded into the
# monthly figure as a conservative upper bound.
total_tokens = req.document_count * (req.avg_document_size / 4)  # ~4 chars per token
query_tokens = req.queries_per_day * 30 * 20  # ~20 tokens/query, ~30 days/month
monthly_cost = (total_tokens + query_tokens) * cost_per_1k_tokens / 1000
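# Worked example: 1M docs averaging 4,000 chars yield ~1B corpus tokens, or
# ~$100 at $0.0001 per 1K tokens; 10K queries/day adds only ~6M tokens
# (~$0.60) per month, so corpus size dominates this estimate.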
return ComponentRecommendation(
name=model,
type="embedding",
config={
"model": model,
"dimensions": dimensions,
"batch_size": 100 if scale == Scale.SMALL else 1000,
"cache_embeddings": True
},
rationale=f"Selected for {doc_types} with accuracy priority {req.accuracy_priority}",
pros=self._get_embedding_pros(model),
cons=self._get_embedding_cons(model),
cost_monthly=monthly_cost
)
def _recommend_vector_db(self, req: Requirements, scale: Scale) -> ComponentRecommendation:
"""Recommend vector database."""
if scale == Scale.SMALL and req.cost_priority > 0.7:
db = "chroma"
cost = 0.0
rationale = "Local/embedded database suitable for small scale and cost optimization"
elif scale == Scale.SMALL and req.maintenance_complexity == "low":
db = "pgvector"
cost = 50.0 # PostgreSQL hosting
rationale = "Leverage existing PostgreSQL infrastructure"
elif scale == Scale.LARGE or req.latency_requirement == "real_time":
db = "pinecone"
vectors = req.document_count * 2 # Account for chunking (~2 chunks/doc)
cost = max(70, vectors * 0.00005) # $70/month minimum, ~$0.00005 per vector
rationale = "Managed service with excellent performance for large scale"
elif req.maintenance_complexity == "low":
db = "weaviate_cloud"
vectors = req.document_count * 2
cost = max(25, vectors * 0.00003)
rationale = "Managed Weaviate with good balance of features and cost"
else:
db = "qdrant"
cost = 100.0 # Self-hosted infrastructure estimate
rationale = "High performance self-hosted option with good scaling"
return ComponentRecommendation(
name=db,
type="vector_database",
config=self._get_vector_db_config(db, req, scale),
rationale=rationale,
pros=self._get_vector_db_pros(db),
cons=self._get_vector_db_cons(db),
cost_monthly=cost
)
def _recommend_retrieval(self, req: Requirements, scale: Scale) -> ComponentRecommendation:
"""Recommend retrieval strategy."""
if req.accuracy_priority > 0.8:
strategy = "hybrid"
rationale = "Hybrid retrieval for maximum accuracy combining dense and sparse methods"
elif "technical" in req.document_types or "code" in req.document_types:
strategy = "hybrid"
rationale = "Technical content benefits from both semantic and keyword matching"
elif req.latency_requirement == "real_time":
strategy = "dense"
rationale = "Dense retrieval faster for real-time requirements"
else:
strategy = "dense"
rationale = "Dense retrieval suitable for general text search"
return ComponentRecommendation(
name=strategy,
type="retrieval",
config={
"strategy": strategy,
"dense_weight": 0.7 if strategy == "hybrid" else 1.0,
"sparse_weight": 0.3 if strategy == "hybrid" else 0.0,
"top_k": 20 if req.accuracy_priority > 0.7 else 10,
"similarity_threshold": 0.7
},
rationale=rationale,
pros=self._get_retrieval_pros(strategy),
cons=self._get_retrieval_cons(strategy),
cost_monthly=0.0
)
def _recommend_reranking(self, req: Requirements, scale: Scale) -> Optional[ComponentRecommendation]:
"""Recommend reranking if beneficial."""
if req.accuracy_priority < 0.6 or req.latency_requirement == "real_time":
return None
if req.cost_priority > 0.8:
return None
# Estimate reranking queries per month
monthly_queries = req.queries_per_day * 30
cost_per_query = 0.002 # Estimated cost for cross-encoder reranking
monthly_cost = monthly_queries * cost_per_query
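# e.g. 10K queries/day -> 300K/month x $0.002 = $600/month, which the 30%
# cap below only admits once the total budget reaches $2,000/month.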
if monthly_cost > req.budget_monthly * 0.3: # Don't exceed 30% of budget
return None
return ComponentRecommendation(
name="cross_encoder_reranking",
type="reranking",
config={
"model": "cross-encoder/ms-marco-MiniLM-L-12-v2",
"rerank_top_k": 20,
"return_top_k": 5,
"batch_size": 16
},
rationale="Reranking improves precision for high-accuracy requirements",
pros=["Higher precision", "Better ranking quality", "Handles complex queries"],
cons=["Additional latency", "Higher cost", "More complexity"],
cost_monthly=monthly_cost
)
def _recommend_evaluation(self, req: Requirements, scale: Scale) -> ComponentRecommendation:
"""Recommend evaluation framework."""
return ComponentRecommendation(
name="comprehensive_evaluation",
type="evaluation",
config={
"metrics": ["precision@k", "recall@k", "mrr", "ndcg"],
"k_values": [1, 3, 5, 10],
"faithfulness_check": True,
"relevance_scoring": True,
"evaluation_frequency": "weekly" if scale == Scale.LARGE else "monthly",
"sample_size": min(1000, req.queries_per_day * 7)
},
rationale="Comprehensive evaluation essential for production RAG systems",
pros=["Quality monitoring", "Performance tracking", "Issue detection"],
cons=["Additional overhead", "Requires ground truth data"],
cost_monthly=20.0 # Evaluation tooling and compute
)
def _generate_architecture_diagram(self, chunking: ComponentRecommendation,
embedding: ComponentRecommendation,
vector_db: ComponentRecommendation,
retrieval: ComponentRecommendation,
reranking: Optional[ComponentRecommendation],
evaluation: ComponentRecommendation) -> str:
"""Generate Mermaid architecture diagram."""
diagram = """```mermaid
graph TB
%% Document Processing Pipeline
A[Document Corpus] --> B[Document Chunking]
B --> C[Embedding Generation]
C --> D[Vector Database Storage]
%% Query Processing Pipeline
E[User Query] --> F[Query Processing]
F --> G[Vector Search]
D --> G
G --> H[Retrieved Chunks]
"""
if reranking:
diagram += " H --> I[Reranking]\n I --> J[Final Results]\n"
else:
diagram += " H --> J[Final Results]\n"
diagram += """
%% Evaluation Pipeline
J --> K[Response Generation]
K --> L[Evaluation Metrics]
%% Component Details
B -.-> B1[Strategy: """ + chunking.name + """]
C -.-> C1[Model: """ + embedding.name + """]
D -.-> D1[Database: """ + vector_db.name + """]
G -.-> G1[Method: """ + retrieval.name + """]
"""
if reranking:
diagram += " I -.-> I1[Model: " + reranking.name + "]\n"
diagram += " L -.-> L1[Framework: " + evaluation.name + "]\n```"
return diagram
def _generate_config_templates(self, *components) -> Dict[str, Any]:
"""Generate configuration templates for all components."""
configs = {}
for component in components:
if component:
configs[component.type] = {
"component": component.name,
"config": component.config,
"rationale": component.rationale
}
# Add deployment configuration
configs["deployment"] = {
"infrastructure": "cloud" if any("pinecone" in str(c.name) for c in components if c) else "hybrid",
"scaling": {
"auto_scaling": True,
"min_replicas": 1,
"max_replicas": 10
},
"monitoring": {
"metrics": ["latency", "throughput", "accuracy"],
"alerts": ["high_latency", "low_accuracy", "service_down"]
}
}
return configs
def _load_embedding_models(self) -> Dict[str, Dict[str, Any]]:
"""Load embedding model specifications."""
return {
"openai-text-embedding-ada-002": {
"dimensions": 1536,
"cost_per_1k_tokens": 0.0001,
"quality": "high",
"speed": "medium"
},
"sentence-transformers/all-mpnet-base-v2": {
"dimensions": 768,
"cost_per_1k_tokens": 0.0,
"quality": "high",
"speed": "medium"
},
"sentence-transformers/all-MiniLM-L6-v2": {
"dimensions": 384,
"cost_per_1k_tokens": 0.0,
"quality": "medium",
"speed": "fast"
}
}
def _load_vector_databases(self) -> Dict[str, Dict[str, Any]]:
"""Load vector database specifications."""
return {
"pinecone": {"managed": True, "scaling": "excellent", "cost": "high"},
"weaviate": {"managed": False, "scaling": "good", "cost": "medium"},
"qdrant": {"managed": False, "scaling": "excellent", "cost": "low"},
"chroma": {"managed": False, "scaling": "poor", "cost": "free"},
"pgvector": {"managed": False, "scaling": "good", "cost": "medium"}
}
def _load_chunking_strategies(self) -> Dict[str, Dict[str, Any]]:
"""Load chunking strategy specifications."""
return {
"fixed_size": {"complexity": "low", "quality": "medium"},
"sentence_based": {"complexity": "medium", "quality": "good"},
"paragraph_based": {"complexity": "medium", "quality": "good"},
"semantic_heading_aware": {"complexity": "high", "quality": "excellent"}
}
def _get_vector_db_config(self, db: str, req: Requirements, scale: Scale) -> Dict[str, Any]:
"""Get vector database configuration."""
base_config = {
"collection_name": "rag_documents",
"distance_metric": "cosine",
"index_type": "hnsw"
}
if db == "pinecone":
base_config.update({
"environment": "us-east1-gcp",
"replicas": 1 if scale == Scale.SMALL else 2,
"shards": 1 if scale != Scale.LARGE else 3
})
elif db == "qdrant":
base_config.update({
"memory_mapping": True,
"quantization": scale == Scale.LARGE,
"replication_factor": 1 if scale == Scale.SMALL else 2
})
return base_config
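# e.g. _get_vector_db_config("qdrant", req, Scale.MEDIUM) returns
#   {"collection_name": "rag_documents", "distance_metric": "cosine",
#    "index_type": "hnsw", "memory_mapping": True, "quantization": False,
#    "replication_factor": 2}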
def _get_chunking_pros(self, strategy: str) -> List[str]:
"""Get pros for chunking strategy."""
pros_map = {
"semantic_heading_aware": ["Preserves document structure", "High semantic coherence", "Good for technical docs"],
"paragraph_based": ["Respects natural boundaries", "Good balance", "Readable chunks"],
"sentence_based": ["Natural language boundaries", "Consistent quality", "Good for general text"],
"fixed_size": ["Predictable sizes", "Simple implementation", "Consistent processing"],
"adaptive_chunking": ["Handles mixed content", "Optimizes per document", "Best quality"]
}
return pros_map.get(strategy, ["Good general purpose strategy"])
def _get_chunking_cons(self, strategy: str) -> List[str]:
"""Get cons for chunking strategy."""
cons_map = {
"semantic_heading_aware": ["Complex implementation", "May create large chunks", "Document-dependent"],
"paragraph_based": ["Variable sizes", "May break context", "Document-dependent"],
"sentence_based": ["May create small chunks", "Sentence detection issues", "Variable sizes"],
"fixed_size": ["Breaks semantic boundaries", "May split sentences", "Context loss"],
"adaptive_chunking": ["High complexity", "Slower processing", "Harder to debug"]
}
return cons_map.get(strategy, ["May not fit all use cases"])
def _get_embedding_pros(self, model: str) -> List[str]:
"""Get pros for embedding model."""
if "openai" in model:
return ["High quality", "Regular updates", "Good performance"]
elif "all-mpnet" in model:
return ["High quality", "Free to use", "Good balance"]
elif "MiniLM" in model:
return ["Fast processing", "Small size", "Good for real-time"]
else:
return ["Specialized for domain", "Good performance"]
def _get_embedding_cons(self, model: str) -> List[str]:
"""Get cons for embedding model."""
if "openai" in model:
return ["API costs", "Vendor lock-in", "Rate limits"]
elif "sentence-transformers" in model:
return ["Self-hosting required", "Model updates needed", "GPU beneficial"]
else:
return ["May require fine-tuning", "Domain-specific"]
def _get_vector_db_pros(self, db: str) -> List[str]:
"""Get pros for vector database."""
pros_map = {
"pinecone": ["Fully managed", "Excellent performance", "Auto-scaling"],
"weaviate": ["Rich features", "GraphQL API", "Multi-modal"],
"qdrant": ["High performance", "Rust-based", "Good scaling"],
"chroma": ["Simple setup", "Free", "Good for development"],
"pgvector": ["SQL integration", "ACID compliance", "Familiar"]
}
return pros_map.get(db, ["Good performance"])
def _get_vector_db_cons(self, db: str) -> List[str]:
"""Get cons for vector database."""
cons_map = {
"pinecone": ["Expensive", "Vendor lock-in", "Limited customization"],
"weaviate": ["Complex setup", "Learning curve", "Resource intensive"],
"qdrant": ["Self-managed", "Smaller community", "Setup complexity"],
"chroma": ["Limited scaling", "Not production-ready", "Basic features"],
"pgvector": ["PostgreSQL knowledge needed", "Less specialized", "Manual optimization"]
}
return cons_map.get(db, ["Requires maintenance"])
def _get_retrieval_pros(self, strategy: str) -> List[str]:
"""Get pros for retrieval strategy."""
pros_map = {
"dense": ["Semantic understanding", "Good for paraphrases", "Fast"],
"sparse": ["Exact matching", "Interpretable", "Good for keywords"],
"hybrid": ["Best of both", "High accuracy", "Robust"]
}
return pros_map.get(strategy, ["Good performance"])
def _get_retrieval_cons(self, strategy: str) -> List[str]:
"""Get cons for retrieval strategy."""
cons_map = {
"dense": ["May miss exact matches", "Embedding dependent", "Less interpretable"],
"sparse": ["Vocabulary mismatch", "No semantic understanding", "Synonym issues"],
"hybrid": ["More complex", "Tuning required", "Higher latency"]
}
return cons_map.get(strategy, ["May require tuning"])
def load_requirements(file_path: str) -> Requirements:
"""Load requirements from JSON file."""
with open(file_path, 'r') as f:
data = json.load(f)
return Requirements(**data)
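# The JSON file mirrors the Requirements fields, e.g. (illustrative values):
#   {
#     "document_types": ["technical", "code"],
#     "document_count": 250000,
#     "avg_document_size": 4000,
#     "queries_per_day": 2000,
#     "query_patterns": ["factual", "analytical"],
#     "latency_requirement": "interactive",
#     "budget_monthly": 500.0,
#     "accuracy_priority": 0.8,
#     "cost_priority": 0.4,
#     "maintenance_complexity": "medium"
#   }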
def save_design(design: PipelineDesign, output_path: str):
"""Save pipeline design to JSON file."""
# Convert to dict for JSON serialization
design_dict = {}
for field_name in design.__dataclass_fields__:
value = getattr(design, field_name)
if isinstance(value, ComponentRecommendation):
design_dict[field_name] = asdict(value)
elif value is None:
design_dict[field_name] = None
else:
design_dict[field_name] = value
with open(output_path, 'w') as f:
json.dump(design_dict, f, indent=2)
def print_design_summary(design: PipelineDesign):
"""Print human-readable design summary."""
print("\n" + "="*60)
print("RAG PIPELINE DESIGN SUMMARY")
print("="*60)
print(f"\n💰 Total Monthly Cost: ${design.total_cost:.2f}")
print(f"\n🔧 Component Recommendations:")
components = [design.chunking, design.embedding, design.vector_db,
design.retrieval, design.reranking, design.evaluation]
for component in components:
if component:
print(f"\n {component.type.upper()}: {component.name}")
print(f" Rationale: {component.rationale}")
if component.cost_monthly > 0:
print(f" Monthly Cost: ${component.cost_monthly:.2f}")
print(f"\n📊 Architecture Diagram:")
print(design.architecture_diagram)
def main():
"""Main function with command-line interface."""
parser = argparse.ArgumentParser(description='Design RAG pipeline based on requirements')
parser.add_argument('requirements', help='JSON file containing system requirements')
parser.add_argument('--output', '-o', help='Output file for pipeline design (JSON)')
parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
args = parser.parse_args()
try:
# Load requirements
print("Loading requirements...")
requirements = load_requirements(args.requirements)
# Design pipeline
designer = RAGPipelineDesigner()
design = designer.design_pipeline(requirements)
# Save design
if args.output:
save_design(design, args.output)
print(f"Pipeline design saved to {args.output}")
# Print summary
print_design_summary(design)
if args.verbose:
print(f"\n📋 Configuration Templates:")
for component_type, config in design.config_templates.items():
print(f"\n {component_type.upper()}:")
print(f" {json.dumps(config, indent=4)}")
except Exception as e:
print(f"Error: {e}")
return 1
return 0
if __name__ == '__main__':
raise SystemExit(main())