#!/usr/bin/env python3
"""
Chunking Optimizer - Analyzes document corpus and recommends optimal chunking strategy.

This script analyzes a collection of text/markdown documents and evaluates
different chunking strategies to recommend the optimal approach for the given corpus.

Strategies tested:
- Fixed-size chunking (character and token-based) with overlap
- Sentence-based chunking
- Paragraph-based chunking
- Semantic chunking (heading-aware)

Metrics measured:
- Chunk size distribution (mean, std, min, max)
- Semantic coherence (topic continuity heuristic)
- Boundary quality (sentence break analysis)

No external dependencies - uses only Python standard library.
"""

import argparse
import json
import os
import re
import statistics
import sys
from collections import Counter, defaultdict
from math import log, sqrt
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any

# Compiled once: word tokenizer used by coherence/vocabulary analysis (hot loops).
_WORD_RE = re.compile(r'\b\w+\b')


class DocumentCorpus:
    """Handles loading and preprocessing of document corpus."""

    def __init__(self, directory: str, extensions: List[str] = None):
        """Load every matching file under *directory* (recursively).

        Args:
            directory: Root directory to scan.
            extensions: Lower-case suffixes to accept (default .txt/.md/.markdown).

        Raises:
            FileNotFoundError: If the directory does not exist.
            ValueError: If no non-empty matching documents are found.
        """
        self.directory = Path(directory)
        self.extensions = extensions or ['.txt', '.md', '.markdown']
        self.documents: List[Dict[str, Any]] = []
        self._load_documents()

    def _load_documents(self) -> None:
        """Load all text documents from the directory tree into self.documents."""
        if not self.directory.exists():
            raise FileNotFoundError(f"Directory not found: {self.directory}")

        for file_path in self.directory.rglob('*'):
            if file_path.is_file() and file_path.suffix.lower() in self.extensions:
                try:
                    # errors='ignore': best-effort read; malformed bytes are dropped
                    # rather than aborting the whole corpus load.
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                    if content.strip():  # Only include non-empty files
                        self.documents.append({
                            'path': str(file_path),
                            'content': content,
                            'size': len(content)
                        })
                except Exception as e:
                    print(f"Warning: Could not read {file_path}: {e}")

        if not self.documents:
            raise ValueError(f"No valid documents found in {self.directory}")

        print(f"Loaded {len(self.documents)} documents totaling "
              f"{sum(d['size'] for d in self.documents):,} characters")


class ChunkingStrategy:
    """Base class for chunking strategies."""

    def __init__(self, name: str, config: Dict[str, Any]):
        self.name = name
        self.config = config

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        """Split text into chunks. Returns list of chunk dictionaries."""
        raise NotImplementedError


class FixedSizeChunker(ChunkingStrategy):
    """Fixed-size chunking with optional overlap."""

    def __init__(self, chunk_size: int = 1000, overlap: int = 100, unit: str = 'char'):
        """Create a fixed-window chunker.

        Args:
            chunk_size: Window size in characters (unit='char') or words.
            overlap: Units shared between consecutive windows.
            unit: 'char' for character windows; anything else uses a
                whitespace-split word approximation.
        """
        config = {'chunk_size': chunk_size, 'overlap': overlap, 'unit': unit}
        super().__init__(f'fixed_size_{unit}', config)
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.unit = unit

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        if self.unit == 'char':
            return self._chunk_by_chars(text)
        # word-based approximation
        return self._chunk_by_words(text.split())

    def _chunk_by_chars(self, text: str) -> List[Dict[str, Any]]:
        """Slide a character window across the text."""
        chunks = []
        start = 0
        chunk_id = 0
        # Step of at least 1 guarantees progress even when overlap >= chunk_size.
        step = max(self.chunk_size - self.overlap, 1)
        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            piece = text[start:end]
            chunks.append({
                'id': chunk_id,
                'text': piece,
                'start': start,
                'end': end,
                'size': len(piece)
            })
            start += step
            chunk_id += 1
        return chunks

    def _chunk_by_words(self, words: List[str]) -> List[Dict[str, Any]]:
        """Slide a word window across the token list; offsets are word indices."""
        chunks = []
        start = 0
        chunk_id = 0
        step = max(self.chunk_size - self.overlap, 1)
        while start < len(words):
            end = min(start + self.chunk_size, len(words))
            piece = ' '.join(words[start:end])
            chunks.append({
                'id': chunk_id,
                'text': piece,
                'start': start,
                'end': end,
                'size': len(piece)
            })
            start += step
            chunk_id += 1
        return chunks


class SentenceChunker(ChunkingStrategy):
    """Sentence-based chunking."""

    def __init__(self, max_size: int = 1000):
        """Greedily pack whole sentences into chunks of at most *max_size* chars."""
        config = {'max_size': max_size}
        super().__init__('sentence_based', config)
        self.max_size = max_size
        # Simple sentence boundary detection: terminal punctuation + whitespace.
        self.sentence_endings = re.compile(r'[.!?]+\s+')

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        sentences = self._split_sentences(text)

        chunks = []
        current_chunk: List[str] = []
        current_size = 0
        chunk_id = 0

        for sentence in sentences:
            sentence_size = len(sentence)
            if current_size + sentence_size > self.max_size and current_chunk:
                # Adding this sentence would overflow: flush the current chunk.
                chunk_text = ' '.join(current_chunk)
                chunks.append({
                    'id': chunk_id,
                    'text': chunk_text,
                    'start': 0,  # Approximate: source offsets are not tracked
                    'end': len(chunk_text),
                    'size': len(chunk_text),
                    'sentence_count': len(current_chunk)
                })
                chunk_id += 1
                current_chunk = [sentence]
                current_size = sentence_size
            else:
                current_chunk.append(sentence)
                current_size += sentence_size

        # Add final chunk
        if current_chunk:
            chunk_text = ' '.join(current_chunk)
            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'start': 0,
                'end': len(chunk_text),
                'size': len(chunk_text),
                'sentence_count': len(current_chunk)
            })

        return chunks

    def _split_sentences(self, text: str) -> List[str]:
        """Simple sentence splitting: re-attach the stripped terminator to each part.

        Fix: the separators are collected ONCE before the loop; the original
        re-ran finditer over the whole text on every iteration (O(n^2)).
        """
        parts = self.sentence_endings.split(text)
        separators = self.sentence_endings.findall(text)

        sentences = []
        for i, part in enumerate(parts[:-1]):
            if i < len(separators):
                # Re-attach the punctuation (whitespace stripped) to the sentence.
                sentence = part + separators[i].strip()
            else:
                sentence = part
            if sentence.strip():
                sentences.append(sentence.strip())

        # Add final part if it exists
        if parts[-1].strip():
            sentences.append(parts[-1].strip())

        return [s for s in sentences if len(s.strip()) > 0]


class ParagraphChunker(ChunkingStrategy):
    """Paragraph-based chunking."""

    def __init__(self, max_size: int = 2000, min_paragraph_size: int = 50):
        """Pack paragraphs into chunks of at most *max_size* characters.

        Paragraphs shorter than *min_paragraph_size* are skipped (unless the
        document consists of a single paragraph).
        """
        config = {'max_size': max_size, 'min_paragraph_size': min_paragraph_size}
        super().__init__('paragraph_based', config)
        self.max_size = max_size
        self.min_paragraph_size = min_paragraph_size

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        # Split by blank lines (paragraph boundaries)
        paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]

        chunks = []
        current_chunk: List[str] = []
        current_size = 0
        chunk_id = 0

        for paragraph in paragraphs:
            paragraph_size = len(paragraph)

            # Skip very short paragraphs unless they're the only content
            if paragraph_size < self.min_paragraph_size and len(paragraphs) > 1:
                continue

            if current_size + paragraph_size > self.max_size and current_chunk:
                # Flush the current chunk before starting a new one.
                chunk_text = '\n\n'.join(current_chunk)
                chunks.append({
                    'id': chunk_id,
                    'text': chunk_text,
                    'start': 0,
                    'end': len(chunk_text),
                    'size': len(chunk_text),
                    'paragraph_count': len(current_chunk)
                })
                chunk_id += 1
                current_chunk = [paragraph]
                current_size = paragraph_size
            else:
                current_chunk.append(paragraph)
                current_size += paragraph_size + 2  # Account for joining newlines

        # Add final chunk
        if current_chunk:
            chunk_text = '\n\n'.join(current_chunk)
            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'start': 0,
                'end': len(chunk_text),
                'size': len(chunk_text),
                'paragraph_count': len(current_chunk)
            })

        return chunks


class SemanticChunker(ChunkingStrategy):
    """Heading-aware semantic chunking."""

    def __init__(self, max_size: int = 1500, heading_weight: float = 2.0):
        config = {'max_size': max_size, 'heading_weight': heading_weight}
        super().__init__('semantic_heading', config)
        self.max_size = max_size
        # NOTE(review): heading_weight is stored/reported in config but not yet
        # used by the chunking logic below.
        self.heading_weight = heading_weight
        # Markdown and plain text heading patterns (kept for reference; section
        # detection below works line-by-line rather than via these patterns).
        self.heading_patterns = [
            re.compile(r'^#{1,6}\s+(.+)$', re.MULTILINE),   # Markdown headers
            re.compile(r'^(.+)\n[=-]+\s*$', re.MULTILINE),  # Underlined headers
            re.compile(r'^\d+\.\s*(.+)$', re.MULTILINE),    # Numbered sections
        ]

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        sections = self._identify_sections(text)

        chunks = []
        chunk_id = 0
        for section in sections:
            section_chunks = self._chunk_section(section, chunk_id)
            chunks.extend(section_chunks)
            chunk_id += len(section_chunks)

        return chunks

    def _identify_sections(self, text: str) -> List[Dict[str, Any]]:
        """Identify sections based on ATX (#) and setext (underlined) headings."""
        sections: List[Dict[str, Any]] = []
        lines = text.split('\n')
        current_section = {'heading': 'Introduction', 'content': '', 'level': 0}

        for line in lines:
            is_heading = False
            heading_level = 0
            stripped = line.strip()
            heading_text = stripped

            if stripped.startswith('#'):
                # Fix: compute the level from the stripped line; the original
                # used the raw line, so indented headers got level 0 and kept
                # their '#' marks in the heading text.
                level = len(stripped) - len(stripped.lstrip('#'))
                if level <= 6:
                    heading_text = stripped.strip('#').strip()
                    heading_level = level
                    is_heading = True
            elif stripped and all(c in '=-' for c in stripped):
                # Setext-style underline: promote the previous content line to a
                # heading. (Fix: the original also required len(sections) > 0,
                # which silently ignored the document's FIRST underlined header.)
                if current_section['content']:
                    content_lines = current_section['content'].strip().split('\n')
                    if content_lines:
                        potential_heading = content_lines[-1].strip()
                        # Headings are short; long lines are probably prose.
                        if 0 < len(potential_heading) < 100:
                            current_section['content'] = '\n'.join(content_lines[:-1])
                            sections.append(current_section)
                            current_section = {
                                'heading': potential_heading,
                                'content': '',
                                'level': 1 if '=' in line else 2
                            }
                            continue

            if is_heading:
                if current_section['content'].strip():
                    sections.append(current_section)
                current_section = {
                    'heading': heading_text,
                    'content': '',
                    'level': heading_level
                }
            else:
                current_section['content'] += line + '\n'

        # Add final section
        if current_section['content'].strip():
            sections.append(current_section)

        return sections

    def _chunk_section(self, section: Dict[str, Any], start_id: int) -> List[Dict[str, Any]]:
        """Chunk a single section, splitting oversized sections by paragraph."""
        content = section['content'].strip()
        if not content:
            return []

        heading = section['heading']
        chunks = []

        # If section is small enough, return as a single chunk
        if len(content) <= self.max_size:
            chunks.append({
                'id': start_id,
                'text': f"{heading}\n\n{content}" if heading else content,
                'start': 0,
                'end': len(content),
                'size': len(content),
                'heading': heading,
                'level': section['level']
            })
            return chunks

        # Split large sections by paragraphs
        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
        current_chunk: List[str] = []
        current_size = (len(heading) + 2) if heading else 0  # Account for heading
        chunk_id = start_id

        for paragraph in paragraphs:
            paragraph_size = len(paragraph)
            if current_size + paragraph_size > self.max_size and current_chunk:
                # Flush the current chunk.
                chunk_text = '\n\n'.join(current_chunk)
                if heading and chunk_id == start_id:
                    chunk_text = f"{heading}\n\n{chunk_text}"
                elif heading:
                    # Fix: the original only prefixed "(continued)" on the FINAL
                    # chunk, leaving intermediate chunks with no heading context.
                    chunk_text = f"{heading} (continued)\n\n{chunk_text}"
                chunks.append({
                    'id': chunk_id,
                    'text': chunk_text,
                    'start': 0,
                    'end': len(chunk_text),
                    'size': len(chunk_text),
                    'heading': heading if chunk_id == start_id else f"{heading} (continued)",
                    'level': section['level']
                })
                chunk_id += 1
                current_chunk = [paragraph]
                current_size = paragraph_size
            else:
                current_chunk.append(paragraph)
                current_size += paragraph_size + 2  # Account for newlines

        # Add final chunk
        if current_chunk:
            chunk_text = '\n\n'.join(current_chunk)
            if heading and chunk_id == start_id:
                chunk_text = f"{heading}\n\n{chunk_text}"
            elif heading:
                chunk_text = f"{heading} (continued)\n\n{chunk_text}"
            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'start': 0,
                'end': len(chunk_text),
                'size': len(chunk_text),
                'heading': heading if chunk_id == start_id else f"{heading} (continued)",
                'level': section['level']
            })

        return chunks


class ChunkAnalyzer:
    """Analyzes chunks and provides quality metrics."""

    def __init__(self):
        self.vocabulary: set = set()
        self.word_freq: Counter = Counter()

    def analyze_chunks(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Comprehensive chunk analysis.

        Each chunk dict must provide 'text' and 'size' keys. Returns a dict of
        size statistics, boundary quality, coherence, and vocabulary stats, or
        {'error': ...} for an empty chunk list.
        """
        if not chunks:
            return {'error': 'No chunks to analyze'}

        sizes = [chunk['size'] for chunk in chunks]

        # Basic size statistics
        size_stats = {
            'count': len(chunks),
            'mean': statistics.mean(sizes),
            'median': statistics.median(sizes),
            'std': statistics.stdev(sizes) if len(sizes) > 1 else 0,
            'min': min(sizes),
            'max': max(sizes),
            'total': sum(sizes)
        }

        boundary_quality = self._analyze_boundary_quality(chunks)
        coherence_score = self._calculate_semantic_coherence(chunks)
        vocab_stats = self._analyze_vocabulary(chunks)

        return {
            'size_statistics': size_stats,
            'boundary_quality': boundary_quality,
            'semantic_coherence': coherence_score,
            'vocabulary_statistics': vocab_stats
        }

    def _analyze_boundary_quality(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze how well chunks respect natural boundaries.

        Ratios are over ALL chunks; empty-text chunks count as bad boundaries.
        """
        sentence_breaks = 0
        word_breaks = 0
        total_chunks = len(chunks)
        sentence_endings = re.compile(r'[.!?]\s*$')

        for chunk in chunks:
            text = chunk['text'].strip()
            if not text:
                continue
            # Check if chunk ends with a sentence boundary
            if sentence_endings.search(text):
                sentence_breaks += 1
            # Check if chunk ends with a word boundary
            if text[-1].isalnum() or text[-1] in '.!?':
                word_breaks += 1

        return {
            'sentence_boundary_ratio': sentence_breaks / total_chunks if total_chunks > 0 else 0,
            'word_boundary_ratio': word_breaks / total_chunks if total_chunks > 0 else 0,
            'clean_breaks': sentence_breaks,
            'total_chunks': total_chunks
        }

    def _calculate_semantic_coherence(self, chunks: List[Dict[str, Any]]) -> float:
        """Simple coherence heuristic: mean Jaccard overlap of adjacent chunks."""
        if len(chunks) < 2:
            return 1.0

        coherence_scores = []
        for i in range(len(chunks) - 1):
            chunk1_words = set(_WORD_RE.findall(chunks[i]['text'].lower()))
            chunk2_words = set(_WORD_RE.findall(chunks[i + 1]['text'].lower()))
            if not chunk1_words or not chunk2_words:
                continue
            # Jaccard similarity as coherence measure
            intersection = len(chunk1_words & chunk2_words)
            union = len(chunk1_words | chunk2_words)
            if union > 0:
                coherence_scores.append(intersection / union)

        return statistics.mean(coherence_scores) if coherence_scores else 0.0

    def _analyze_vocabulary(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze vocabulary distribution across chunks."""
        all_words = []
        chunk_vocab_sizes = []

        for chunk in chunks:
            words = _WORD_RE.findall(chunk['text'].lower())
            all_words.extend(words)
            chunk_vocab_sizes.append(len(set(words)))

        total_vocab = len(set(all_words))
        word_freq = Counter(all_words)

        return {
            'total_vocabulary': total_vocab,
            'avg_chunk_vocabulary': statistics.mean(chunk_vocab_sizes) if chunk_vocab_sizes else 0,
            'vocabulary_diversity': total_vocab / len(all_words) if all_words else 0,
            'most_common_words': word_freq.most_common(10)
        }


class ChunkingOptimizer:
    """Main optimizer that tests different chunking strategies."""

    def __init__(self):
        self.analyzer = ChunkAnalyzer()

    def optimize(self, corpus: DocumentCorpus, config: Dict[str, Any] = None) -> Dict[str, Any]:
        """Test all chunking strategies and recommend the best one."""
        config = config or {}
        strategies = self._create_strategies(config)
        results = {}

        print(f"Testing {len(strategies)} chunking strategies...")

        for strategy in strategies:
            print(f"  Testing {strategy.name}...")
            results[strategy.name] = self._test_strategy(corpus, strategy)

        recommendation = self._recommend_strategy(results)

        return {
            'corpus_info': {
                'document_count': len(corpus.documents),
                'total_size': sum(d['size'] for d in corpus.documents),
                'avg_document_size': statistics.mean([d['size'] for d in corpus.documents])
            },
            'strategy_results': results,
            'recommendation': recommendation,
            # Pass results so samples are cut with the WINNING configuration,
            # not constructor defaults.
            'sample_chunks': self._generate_sample_chunks(
                corpus, recommendation['best_strategy'], results)
        }

    def _create_strategies(self, config: Dict[str, Any]) -> List[ChunkingStrategy]:
        """Create all chunking strategies to test."""
        strategies: List[ChunkingStrategy] = []

        # Fixed-size strategies
        for size in config.get('fixed_sizes', [512, 1000, 1500]):
            for overlap in config.get('overlaps', [50, 100]):
                strategies.append(FixedSizeChunker(size, overlap, 'char'))

        # Sentence-based strategies
        for max_size in config.get('sentence_max_sizes', [800, 1200]):
            strategies.append(SentenceChunker(max_size))

        # Paragraph-based strategies
        for max_size in config.get('paragraph_max_sizes', [1500, 2000]):
            strategies.append(ParagraphChunker(max_size))

        # Semantic strategies
        for max_size in config.get('semantic_max_sizes', [1200, 1800]):
            strategies.append(SemanticChunker(max_size))

        return strategies

    def _test_strategy(self, corpus: DocumentCorpus, strategy: ChunkingStrategy) -> Dict[str, Any]:
        """Test a single chunking strategy across the whole corpus."""
        all_chunks: List[Dict[str, Any]] = []
        document_results = []

        for doc in corpus.documents:
            try:
                chunks = strategy.chunk(doc['content'])
                all_chunks.extend(chunks)
                document_results.append({
                    'path': doc['path'],
                    'chunk_count': len(chunks),
                    'analysis': self.analyzer.analyze_chunks(chunks)
                })
            except Exception as e:
                # Best-effort: a single bad document should not sink the strategy.
                print(f"    Error processing {doc['path']}: {e}")
                continue

        overall_analysis = self.analyzer.analyze_chunks(all_chunks)

        return {
            'strategy_config': strategy.config,
            'total_chunks': len(all_chunks),
            'overall_analysis': overall_analysis,
            'document_results': document_results,
            'performance_score': self._calculate_performance_score(overall_analysis)
        }

    def _calculate_performance_score(self, analysis: Dict[str, Any]) -> float:
        """Combine size consistency, boundary quality and coherence into [0, 1]."""
        if 'error' in analysis:
            return 0.0

        size_stats = analysis['size_statistics']
        boundary_quality = analysis['boundary_quality']
        coherence = analysis['semantic_coherence']

        # Normalize metrics to 0-1 range and combine
        size_consistency = (
            1.0 - min(size_stats['std'] / size_stats['mean'], 1.0)
            if size_stats['mean'] > 0 else 0
        )
        boundary_score = (boundary_quality['sentence_boundary_ratio'] +
                          boundary_quality['word_boundary_ratio']) / 2
        coherence_score = coherence

        # Weighted combination
        return (size_consistency * 0.3 +
                boundary_score * 0.4 +
                coherence_score * 0.3)

    def _recommend_strategy(self, results: Dict[str, Any]) -> Dict[str, Any]:
        """Recommend the best chunking strategy based on analysis."""
        best_strategy = None
        best_score = 0
        strategy_scores = {}

        for strategy_name, result in results.items():
            score = result['performance_score']
            strategy_scores[strategy_name] = score
            if score > best_score:
                best_score = score
                best_strategy = strategy_name

        return {
            'best_strategy': best_strategy,
            'best_score': best_score,
            'all_scores': strategy_scores,
            'reasoning': self._generate_reasoning(
                best_strategy, results[best_strategy] if best_strategy else None)
        }

    def _generate_reasoning(self, strategy_name: str, result: Dict[str, Any]) -> str:
        """Generate human-readable reasoning for the recommendation."""
        if not result:
            return "No valid strategy found."

        analysis = result['overall_analysis']
        size_stats = analysis['size_statistics']
        boundary = analysis['boundary_quality']

        reasoning = f"Recommended '{strategy_name}' because:\n"
        reasoning += f"- Average chunk size: {size_stats['mean']:.0f} characters\n"
        reasoning += f"- Size consistency: {size_stats['std']:.0f} std deviation\n"
        reasoning += f"- Boundary quality: {boundary['sentence_boundary_ratio']:.2%} clean sentence breaks\n"
        reasoning += f"- Semantic coherence: {analysis['semantic_coherence']:.3f}\n"

        return reasoning

    def _generate_sample_chunks(self, corpus: DocumentCorpus, strategy_name: str,
                                results: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """Generate sample chunks using the recommended strategy.

        Fix: when *results* is supplied, the strategy is rebuilt with the
        winning configuration; the original always used constructor defaults,
        so the samples did not reflect the recommendation.
        """
        if not strategy_name or not corpus.documents:
            return []

        cfg: Dict[str, Any] = {}
        if results and strategy_name in results:
            cfg = results[strategy_name].get('strategy_config', {})

        if 'fixed_size' in strategy_name:
            strategy = FixedSizeChunker(cfg.get('chunk_size', 1000),
                                        cfg.get('overlap', 100),
                                        cfg.get('unit', 'char'))
        elif 'sentence' in strategy_name:
            strategy = SentenceChunker(cfg.get('max_size', 1000))
        elif 'paragraph' in strategy_name:
            strategy = ParagraphChunker(cfg.get('max_size', 2000),
                                        cfg.get('min_paragraph_size', 50))
        elif 'semantic' in strategy_name:
            strategy = SemanticChunker(cfg.get('max_size', 1500),
                                       cfg.get('heading_weight', 2.0))
        else:
            return []

        # Return the first 3 chunks of the first document as samples.
        sample_doc = corpus.documents[0]
        return strategy.chunk(sample_doc['content'])[:3]


def main() -> int:
    """Command-line entry point. Returns a process exit code (0 ok, 1 error)."""
    parser = argparse.ArgumentParser(
        description='Analyze documents and recommend optimal chunking strategy')
    parser.add_argument('directory', help='Directory containing text/markdown documents')
    parser.add_argument('--output', '-o', help='Output file for results (JSON format)')
    parser.add_argument('--config', '-c', help='Configuration file (JSON format)')
    parser.add_argument('--extensions', nargs='+', default=['.txt', '.md', '.markdown'],
                        help='File extensions to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')

    args = parser.parse_args()

    # Load configuration (missing/unspecified config silently falls back to defaults)
    config = {}
    if args.config and os.path.exists(args.config):
        with open(args.config, 'r') as f:
            config = json.load(f)

    try:
        # Load corpus
        print(f"Loading documents from {args.directory}...")
        corpus = DocumentCorpus(args.directory, args.extensions)

        # Run optimization
        optimizer = ChunkingOptimizer()
        results = optimizer.optimize(corpus, config)

        # Save results
        if args.output:
            with open(args.output, 'w') as f:
                json.dump(results, f, indent=2)
            print(f"Results saved to {args.output}")

        # Print summary
        print("\n" + "=" * 60)
        print("CHUNKING OPTIMIZATION RESULTS")
        print("=" * 60)

        corpus_info = results['corpus_info']
        print(f"Corpus: {corpus_info['document_count']} documents, "
              f"{corpus_info['total_size']:,} characters")

        recommendation = results['recommendation']
        print(f"\nRecommended Strategy: {recommendation['best_strategy']}")
        print(f"Performance Score: {recommendation['best_score']:.3f}")
        print(f"\nReasoning:\n{recommendation['reasoning']}")

        if args.verbose:
            print("\nAll Strategy Scores:")
            for strategy, score in recommendation['all_scores'].items():
                print(f"  {strategy}: {score:.3f}")

            print("\nSample Chunks:")
            for i, chunk in enumerate(results['sample_chunks'][:2]):
                print(f"\nChunk {i+1} ({chunk['size']} chars):")
                print("-" * 40)
                print(chunk['text'][:200] + "..." if len(chunk['text']) > 200 else chunk['text'])

    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0


if __name__ == '__main__':
    sys.exit(main())