CleanArchitecture-template/.brain/.agent/skills/engineering-advanced-skills/rag-architect/chunking_optimizer.py

#!/usr/bin/env python3
"""
Chunking Optimizer - Analyzes document corpus and recommends optimal chunking strategy.

This script analyzes a collection of text/markdown documents and evaluates different
chunking strategies to recommend the optimal approach for the given corpus.

Strategies tested:
- Fixed-size chunking (character and token-based) with overlap
- Sentence-based chunking
- Paragraph-based chunking
- Semantic chunking (heading-aware)

Metrics measured:
- Chunk size distribution (mean, std, min, max)
- Semantic coherence (topic continuity heuristic)
- Boundary quality (sentence break analysis)

No external dependencies - uses only Python standard library.
"""

import argparse
import json
import os
import re
import statistics
from collections import Counter, defaultdict
from math import log, sqrt
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any


class DocumentCorpus:
    """Handles loading and preprocessing of document corpus."""

    def __init__(self, directory: str, extensions: List[str] = None):
        self.directory = Path(directory)
        self.extensions = extensions or ['.txt', '.md', '.markdown']
        self.documents = []
        self._load_documents()

    def _load_documents(self):
        """Load all text documents from directory."""
        if not self.directory.exists():
            raise FileNotFoundError(f"Directory not found: {self.directory}")

        for file_path in self.directory.rglob('*'):
            if file_path.is_file() and file_path.suffix.lower() in self.extensions:
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                    if content.strip():  # Only include non-empty files
                        self.documents.append({
                            'path': str(file_path),
                            'content': content,
                            'size': len(content)
                        })
                except Exception as e:
                    print(f"Warning: Could not read {file_path}: {e}")

        if not self.documents:
            raise ValueError(f"No valid documents found in {self.directory}")

        print(f"Loaded {len(self.documents)} documents totaling {sum(d['size'] for d in self.documents):,} characters")


class ChunkingStrategy:
    """Base class for chunking strategies."""

    def __init__(self, name: str, config: Dict[str, Any]):
        self.name = name
        self.config = config

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        """Split text into chunks. Returns list of chunk dictionaries."""
        raise NotImplementedError


class FixedSizeChunker(ChunkingStrategy):
    """Fixed-size chunking with optional overlap."""

    def __init__(self, chunk_size: int = 1000, overlap: int = 100, unit: str = 'char'):
        config = {'chunk_size': chunk_size, 'overlap': overlap, 'unit': unit}
        super().__init__(f'fixed_size_{unit}', config)
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.unit = unit

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        chunks = []
        if self.unit == 'char':
            return self._chunk_by_chars(text)
        else:  # word-based approximation
            words = text.split()
            return self._chunk_by_words(words)

    def _chunk_by_chars(self, text: str) -> List[Dict[str, Any]]:
        chunks = []
        start = 0
        chunk_id = 0

        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            chunk_text = text[start:end]

            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'start': start,
                'end': end,
                'size': len(chunk_text)
            })

            start = max(start + self.chunk_size - self.overlap, start + 1)
            chunk_id += 1

            if start >= len(text):
                break

        return chunks

    def _chunk_by_words(self, words: List[str]) -> List[Dict[str, Any]]:
        chunks = []
        start = 0
        chunk_id = 0

        while start < len(words):
            end = min(start + self.chunk_size, len(words))
            chunk_words = words[start:end]
            chunk_text = ' '.join(chunk_words)

            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'start': start,
                'end': end,
                'size': len(chunk_text)
            })

            start = max(start + self.chunk_size - self.overlap, start + 1)
            chunk_id += 1

            if start >= len(words):
                break

        return chunks


class SentenceChunker(ChunkingStrategy):
    """Sentence-based chunking."""

    def __init__(self, max_size: int = 1000):
        config = {'max_size': max_size}
        super().__init__('sentence_based', config)
        self.max_size = max_size
        # Simple sentence boundary detection
        self.sentence_endings = re.compile(r'[.!?]+\s+')

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        # Split into sentences
        sentences = self._split_sentences(text)
        chunks = []
        current_chunk = []
        current_size = 0
        chunk_id = 0

        for sentence in sentences:
            sentence_size = len(sentence)

            if current_size + sentence_size > self.max_size and current_chunk:
                # Save current chunk
                chunk_text = ' '.join(current_chunk)
                chunks.append({
                    'id': chunk_id,
                    'text': chunk_text,
                    'start': 0,  # Approximate
                    'end': len(chunk_text),
                    'size': len(chunk_text),
                    'sentence_count': len(current_chunk)
                })
                chunk_id += 1
                current_chunk = [sentence]
                current_size = sentence_size
            else:
                current_chunk.append(sentence)
                current_size += sentence_size

        # Add final chunk
        if current_chunk:
            chunk_text = ' '.join(current_chunk)
            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'start': 0,
                'end': len(chunk_text),
                'size': len(chunk_text),
                'sentence_count': len(current_chunk)
            })

        return chunks

    def _split_sentences(self, text: str) -> List[str]:
        """Simple sentence splitting."""
        sentences = []
        parts = self.sentence_endings.split(text)

        for i, part in enumerate(parts[:-1]):
            # Add the sentence ending back
            ending_match = list(self.sentence_endings.finditer(text))
            if i < len(ending_match):
                sentence = part + ending_match[i].group().strip()
            else:
                sentence = part

            if sentence.strip():
                sentences.append(sentence.strip())

        # Add final part if it exists
        if parts[-1].strip():
            sentences.append(parts[-1].strip())

        return [s for s in sentences if len(s.strip()) > 0]


class ParagraphChunker(ChunkingStrategy):
    """Paragraph-based chunking."""

    def __init__(self, max_size: int = 2000, min_paragraph_size: int = 50):
        config = {'max_size': max_size, 'min_paragraph_size': min_paragraph_size}
        super().__init__('paragraph_based', config)
        self.max_size = max_size
        self.min_paragraph_size = min_paragraph_size

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        # Split by double newlines (paragraph boundaries)
        paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
        chunks = []
        current_chunk = []
        current_size = 0
        chunk_id = 0

        for paragraph in paragraphs:
            paragraph_size = len(paragraph)

            # Skip very short paragraphs unless they're the only content
            if paragraph_size < self.min_paragraph_size and len(paragraphs) > 1:
                continue

            if current_size + paragraph_size > self.max_size and current_chunk:
                # Save current chunk
                chunk_text = '\n\n'.join(current_chunk)
                chunks.append({
                    'id': chunk_id,
                    'text': chunk_text,
                    'start': 0,
                    'end': len(chunk_text),
                    'size': len(chunk_text),
                    'paragraph_count': len(current_chunk)
                })
                chunk_id += 1
                current_chunk = [paragraph]
                current_size = paragraph_size
            else:
                current_chunk.append(paragraph)
                current_size += paragraph_size + 2  # Account for newlines

        # Add final chunk
        if current_chunk:
            chunk_text = '\n\n'.join(current_chunk)
            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'start': 0,
                'end': len(chunk_text),
                'size': len(chunk_text),
                'paragraph_count': len(current_chunk)
            })

        return chunks


class SemanticChunker(ChunkingStrategy):
    """Heading-aware semantic chunking."""

    def __init__(self, max_size: int = 1500, heading_weight: float = 2.0):
        config = {'max_size': max_size, 'heading_weight': heading_weight}
        super().__init__('semantic_heading', config)
        self.max_size = max_size
        self.heading_weight = heading_weight

        # Markdown and plain text heading patterns
        self.heading_patterns = [
            re.compile(r'^#{1,6}\s+(.+)$', re.MULTILINE),  # Markdown headers
            re.compile(r'^(.+)\n[=-]+\s*$', re.MULTILINE),  # Underlined headers
            re.compile(r'^\d+\.\s*(.+)$', re.MULTILINE),   # Numbered sections
        ]

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        sections = self._identify_sections(text)
        chunks = []
        chunk_id = 0

        for section in sections:
            section_chunks = self._chunk_section(section, chunk_id)
            chunks.extend(section_chunks)
            chunk_id += len(section_chunks)

        return chunks

    def _identify_sections(self, text: str) -> List[Dict[str, Any]]:
        """Identify sections based on headings."""
        sections = []
        lines = text.split('\n')
        current_section = {'heading': 'Introduction', 'content': '', 'level': 0}

        for line in lines:
            is_heading = False
            heading_level = 0
            heading_text = line.strip()

            # Check for markdown headers
            if line.strip().startswith('#'):
                level = len(line) - len(line.lstrip('#'))
                if level <= 6:
                    heading_text = line.strip('#').strip()
                    heading_level = level
                    is_heading = True

            # Check for underlined headers
            elif len(sections) > 0 and line.strip() and all(c in '=-' for c in line.strip()):
                # Previous line might be heading
                if current_section['content']:
                    content_lines = current_section['content'].strip().split('\n')
                    if content_lines:
                        potential_heading = content_lines[-1].strip()
                        if len(potential_heading) > 0 and len(potential_heading) < 100:
                            # Treat as heading
                            current_section['content'] = '\n'.join(content_lines[:-1])
                            sections.append(current_section)
                            current_section = {
                                'heading': potential_heading,
                                'content': '',
                                'level': 1 if '=' in line else 2
                            }
                            continue

            if is_heading:
                if current_section['content'].strip():
                    sections.append(current_section)
                current_section = {
                    'heading': heading_text,
                    'content': '',
                    'level': heading_level
                }
            else:
                current_section['content'] += line + '\n'

        # Add final section
        if current_section['content'].strip():
            sections.append(current_section)

        return sections

    def _chunk_section(self, section: Dict[str, Any], start_id: int) -> List[Dict[str, Any]]:
        """Chunk a single section."""
        content = section['content'].strip()
        if not content:
            return []

        heading = section['heading']
        chunks = []

        # If section is small enough, return as single chunk
        if len(content) <= self.max_size:
            chunks.append({
                'id': start_id,
                'text': f"{heading}\n\n{content}" if heading else content,
                'start': 0,
                'end': len(content),
                'size': len(content),
                'heading': heading,
                'level': section['level']
            })
            return chunks

        # Split large sections by paragraphs
        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
        current_chunk = []
        current_size = len(heading) + 2 if heading else 0  # Account for heading
        chunk_id = start_id

        for paragraph in paragraphs:
            paragraph_size = len(paragraph)

            if current_size + paragraph_size > self.max_size and current_chunk:
                # Save current chunk
                chunk_text = '\n\n'.join(current_chunk)
                if heading and chunk_id == start_id:
                    chunk_text = f"{heading}\n\n{chunk_text}"

                chunks.append({
                    'id': chunk_id,
                    'text': chunk_text,
                    'start': 0,
                    'end': len(chunk_text),
                    'size': len(chunk_text),
                    'heading': heading if chunk_id == start_id else f"{heading} (continued)",
                    'level': section['level']
                })
                chunk_id += 1
                current_chunk = [paragraph]
                current_size = paragraph_size
            else:
                current_chunk.append(paragraph)
                current_size += paragraph_size + 2  # Account for newlines

        # Add final chunk
        if current_chunk:
            chunk_text = '\n\n'.join(current_chunk)
            if heading and chunk_id == start_id:
                chunk_text = f"{heading}\n\n{chunk_text}"
            elif heading:
                chunk_text = f"{heading} (continued)\n\n{chunk_text}"

            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'start': 0,
                'end': len(chunk_text),
                'size': len(chunk_text),
                'heading': heading if chunk_id == start_id else f"{heading} (continued)",
                'level': section['level']
            })

        return chunks


class ChunkAnalyzer:
    """Analyzes chunks and provides quality metrics."""

    def __init__(self):
        self.vocabulary = set()
        self.word_freq = Counter()

    def analyze_chunks(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Comprehensive chunk analysis."""
        if not chunks:
            return {'error': 'No chunks to analyze'}

        sizes = [chunk['size'] for chunk in chunks]

        # Basic size statistics
        size_stats = {
            'count': len(chunks),
            'mean': statistics.mean(sizes),
            'median': statistics.median(sizes),
            'std': statistics.stdev(sizes) if len(sizes) > 1 else 0,
            'min': min(sizes),
            'max': max(sizes),
            'total': sum(sizes)
        }

        # Boundary quality analysis
        boundary_quality = self._analyze_boundary_quality(chunks)

        # Semantic coherence (simple heuristic)
        coherence_score = self._calculate_semantic_coherence(chunks)

        # Vocabulary distribution
        vocab_stats = self._analyze_vocabulary(chunks)

        return {
            'size_statistics': size_stats,
            'boundary_quality': boundary_quality,
            'semantic_coherence': coherence_score,
            'vocabulary_statistics': vocab_stats
        }

    def _analyze_boundary_quality(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze how well chunks respect natural boundaries."""
        sentence_breaks = 0
        word_breaks = 0
        total_chunks = len(chunks)

        sentence_endings = re.compile(r'[.!?]\s*$')

        for chunk in chunks:
            text = chunk['text'].strip()
            if not text:
                continue

            # Check if chunk ends with sentence boundary
            if sentence_endings.search(text):
                sentence_breaks += 1

            # Check if chunk ends with word boundary
            if text[-1].isalnum() or text[-1] in '.!?':
                word_breaks += 1

        return {
            'sentence_boundary_ratio': sentence_breaks / total_chunks if total_chunks > 0 else 0,
            'word_boundary_ratio': word_breaks / total_chunks if total_chunks > 0 else 0,
            'clean_breaks': sentence_breaks,
            'total_chunks': total_chunks
        }

    def _calculate_semantic_coherence(self, chunks: List[Dict[str, Any]]) -> float:
        """Simple semantic coherence heuristic based on vocabulary overlap."""
        if len(chunks) < 2:
            return 1.0

        coherence_scores = []

        for i in range(len(chunks) - 1):
            chunk1_words = set(re.findall(r'\b\w+\b', chunks[i]['text'].lower()))
            chunk2_words = set(re.findall(r'\b\w+\b', chunks[i+1]['text'].lower()))

            if not chunk1_words or not chunk2_words:
                continue

            # Jaccard similarity as coherence measure
            intersection = len(chunk1_words & chunk2_words)
            union = len(chunk1_words | chunk2_words)

            if union > 0:
                coherence_scores.append(intersection / union)

        return statistics.mean(coherence_scores) if coherence_scores else 0.0

    def _analyze_vocabulary(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze vocabulary distribution across chunks."""
        all_words = []
        chunk_vocab_sizes = []

        for chunk in chunks:
            words = re.findall(r'\b\w+\b', chunk['text'].lower())
            all_words.extend(words)
            chunk_vocab_sizes.append(len(set(words)))

        total_vocab = len(set(all_words))
        word_freq = Counter(all_words)

        return {
            'total_vocabulary': total_vocab,
            'avg_chunk_vocabulary': statistics.mean(chunk_vocab_sizes) if chunk_vocab_sizes else 0,
            'vocabulary_diversity': total_vocab / len(all_words) if all_words else 0,
            'most_common_words': word_freq.most_common(10)
        }


class ChunkingOptimizer:
    """Main optimizer that tests different chunking strategies."""

    def __init__(self):
        self.analyzer = ChunkAnalyzer()

    def optimize(self, corpus: DocumentCorpus, config: Dict[str, Any] = None) -> Dict[str, Any]:
        """Test all chunking strategies and recommend the best one."""
        config = config or {}

        strategies = self._create_strategies(config)
        results = {}

        print(f"Testing {len(strategies)} chunking strategies...")

        for strategy in strategies:
            print(f"  Testing {strategy.name}...")
            strategy_results = self._test_strategy(corpus, strategy)
            results[strategy.name] = strategy_results

        # Recommend best strategy
        recommendation = self._recommend_strategy(results)

        return {
            'corpus_info': {
                'document_count': len(corpus.documents),
                'total_size': sum(d['size'] for d in corpus.documents),
                'avg_document_size': statistics.mean([d['size'] for d in corpus.documents])
            },
            'strategy_results': results,
            'recommendation': recommendation,
            'sample_chunks': self._generate_sample_chunks(corpus, recommendation['best_strategy'])
        }

    def _create_strategies(self, config: Dict[str, Any]) -> List[ChunkingStrategy]:
        """Create all chunking strategies to test."""
        strategies = []

        # Fixed-size strategies
        for size in config.get('fixed_sizes', [512, 1000, 1500]):
            for overlap in config.get('overlaps', [50, 100]):
                strategies.append(FixedSizeChunker(size, overlap, 'char'))

        # Sentence-based strategies
        for max_size in config.get('sentence_max_sizes', [800, 1200]):
            strategies.append(SentenceChunker(max_size))

        # Paragraph-based strategies
        for max_size in config.get('paragraph_max_sizes', [1500, 2000]):
            strategies.append(ParagraphChunker(max_size))

        # Semantic strategies
        for max_size in config.get('semantic_max_sizes', [1200, 1800]):
            strategies.append(SemanticChunker(max_size))

        return strategies

    def _test_strategy(self, corpus: DocumentCorpus, strategy: ChunkingStrategy) -> Dict[str, Any]:
        """Test a single chunking strategy."""
        all_chunks = []
        document_results = []

        for doc in corpus.documents:
            try:
                chunks = strategy.chunk(doc['content'])
                all_chunks.extend(chunks)

                doc_analysis = self.analyzer.analyze_chunks(chunks)
                document_results.append({
                    'path': doc['path'],
                    'chunk_count': len(chunks),
                    'analysis': doc_analysis
                })
            except Exception as e:
                print(f"    Error processing {doc['path']}: {e}")
                continue

        # Overall analysis
        overall_analysis = self.analyzer.analyze_chunks(all_chunks)

        return {
            'strategy_config': strategy.config,
            'total_chunks': len(all_chunks),
            'overall_analysis': overall_analysis,
            'document_results': document_results,
            'performance_score': self._calculate_performance_score(overall_analysis)
        }

    def _calculate_performance_score(self, analysis: Dict[str, Any]) -> float:
        """Calculate overall performance score for a strategy."""
        if 'error' in analysis:
            return 0.0

        size_stats = analysis['size_statistics']
        boundary_quality = analysis['boundary_quality']
        coherence = analysis['semantic_coherence']

        # Normalize metrics to 0-1 range and combine
        size_consistency = 1.0 - min(size_stats['std'] / size_stats['mean'], 1.0) if size_stats['mean'] > 0 else 0
        boundary_score = (boundary_quality['sentence_boundary_ratio'] + boundary_quality['word_boundary_ratio']) / 2
        coherence_score = coherence

        # Weighted combination
        return (size_consistency * 0.3 + boundary_score * 0.4 + coherence_score * 0.3)

    def _recommend_strategy(self, results: Dict[str, Any]) -> Dict[str, Any]:
        """Recommend the best chunking strategy based on analysis."""
        best_strategy = None
        best_score = 0

        strategy_scores = {}

        for strategy_name, result in results.items():
            score = result['performance_score']
            strategy_scores[strategy_name] = score

            if score > best_score:
                best_score = score
                best_strategy = strategy_name

        return {
            'best_strategy': best_strategy,
            'best_score': best_score,
            'all_scores': strategy_scores,
            'reasoning': self._generate_reasoning(best_strategy, results[best_strategy] if best_strategy else None)
        }

    def _generate_reasoning(self, strategy_name: str, result: Dict[str, Any]) -> str:
        """Generate human-readable reasoning for the recommendation."""
        if not result:
            return "No valid strategy found."

        analysis = result['overall_analysis']
        size_stats = analysis['size_statistics']
        boundary = analysis['boundary_quality']

        reasoning = f"Recommended '{strategy_name}' because:\n"
        reasoning += f"- Average chunk size: {size_stats['mean']:.0f} characters\n"
        reasoning += f"- Size consistency: {size_stats['std']:.0f} std deviation\n"
        reasoning += f"- Boundary quality: {boundary['sentence_boundary_ratio']:.2%} clean sentence breaks\n"
        reasoning += f"- Semantic coherence: {analysis['semantic_coherence']:.3f}\n"

        return reasoning

    def _generate_sample_chunks(self, corpus: DocumentCorpus, strategy_name: str) -> List[Dict[str, Any]]:
        """Generate sample chunks using the recommended strategy."""
        if not strategy_name or not corpus.documents:
            return []

        # Create strategy instance
        strategy = None
        if 'fixed_size' in strategy_name:
            strategy = FixedSizeChunker()
        elif 'sentence' in strategy_name:
            strategy = SentenceChunker()
        elif 'paragraph' in strategy_name:
            strategy = ParagraphChunker()
        elif 'semantic' in strategy_name:
            strategy = SemanticChunker()

        if not strategy:
            return []

        # Get chunks from first document
        sample_doc = corpus.documents[0]
        chunks = strategy.chunk(sample_doc['content'])

        # Return first 3 chunks as samples
        return chunks[:3]


def main():
    """Main function with command-line interface."""
    parser = argparse.ArgumentParser(description='Analyze documents and recommend optimal chunking strategy')
    parser.add_argument('directory', help='Directory containing text/markdown documents')
    parser.add_argument('--output', '-o', help='Output file for results (JSON format)')
    parser.add_argument('--config', '-c', help='Configuration file (JSON format)')
    parser.add_argument('--extensions', nargs='+', default=['.txt', '.md', '.markdown'],
                       help='File extensions to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')

    args = parser.parse_args()

    # Load configuration
    config = {}
    if args.config and os.path.exists(args.config):
        with open(args.config, 'r') as f:
            config = json.load(f)

    try:
        # Load corpus
        print(f"Loading documents from {args.directory}...")
        corpus = DocumentCorpus(args.directory, args.extensions)

        # Run optimization
        optimizer = ChunkingOptimizer()
        results = optimizer.optimize(corpus, config)

        # Save results
        if args.output:
            with open(args.output, 'w') as f:
                json.dump(results, f, indent=2)
            print(f"Results saved to {args.output}")

        # Print summary
        print("\n" + "="*60)
        print("CHUNKING OPTIMIZATION RESULTS")
        print("="*60)

        corpus_info = results['corpus_info']
        print(f"Corpus: {corpus_info['document_count']} documents, {corpus_info['total_size']:,} characters")

        recommendation = results['recommendation']
        print(f"\nRecommended Strategy: {recommendation['best_strategy']}")
        print(f"Performance Score: {recommendation['best_score']:.3f}")
        print(f"\nReasoning:\n{recommendation['reasoning']}")

        if args.verbose:
            print("\nAll Strategy Scores:")
            for strategy, score in recommendation['all_scores'].items():
                print(f"  {strategy}: {score:.3f}")

        print("\nSample Chunks:")
        for i, chunk in enumerate(results['sample_chunks'][:2]):
            print(f"\nChunk {i+1} ({chunk['size']} chars):")
            print("-" * 40)
            print(chunk['text'][:200] + "..." if len(chunk['text']) > 200 else chunk['text'])

    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0


if __name__ == '__main__':
    exit(main())