Files
CleanArchitecture-template/.brain/.agent/skills/engineering-advanced-skills/rag-architect/chunking_optimizer.py
2026-03-12 15:17:52 +07:00

787 lines
29 KiB
Python

#!/usr/bin/env python3
"""
Chunking Optimizer - Analyzes document corpus and recommends optimal chunking strategy.
This script analyzes a collection of text/markdown documents and evaluates different
chunking strategies to recommend the optimal approach for the given corpus.
Strategies tested:
- Fixed-size chunking (character-based, or word-based as a token approximation) with overlap
- Sentence-based chunking
- Paragraph-based chunking
- Semantic chunking (heading-aware)
Metrics measured:
- Chunk size distribution (mean, std, min, max)
- Semantic coherence (topic continuity heuristic)
- Boundary quality (sentence break analysis)
No external dependencies - uses only Python standard library.
"""
import argparse
import json
import os
import re
import statistics
from collections import Counter, defaultdict
from math import log, sqrt
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
class DocumentCorpus:
    """Load and hold the text documents that make up the corpus under analysis.

    Files below ``directory`` are discovered recursively. A file is loaded when
    its suffix, compared case-insensitively, is one of ``extensions``; files
    whose content is empty or whitespace-only are skipped.

    Attributes:
        directory: Root directory that was scanned, as a ``Path``.
        extensions: Normalised suffixes used for matching (lower-case, each
            starting with a dot, duplicates removed).
        documents: One dict per loaded file with the keys ``path`` (str),
            ``content`` (str) and ``size`` (character count of ``content``),
            ordered by path.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        ValueError: If no non-empty matching document is found.
    """

    def __init__(self, directory: str, extensions: Optional[List[str]] = None):
        self.directory = Path(directory)
        self.extensions = self._normalize_extensions(
            extensions or ['.txt', '.md', '.markdown']
        )
        self.documents: List[Dict[str, Any]] = []
        self._load_documents()

    @staticmethod
    def _normalize_extensions(extensions: List[str]) -> List[str]:
        """Return ``extensions`` lower-cased, dot-prefixed and de-duplicated.

        File suffixes are lower-cased before comparison, so an entry such as
        ``'MD'`` or ``'.TXT'`` could otherwise never match anything.
        """
        normalized: List[str] = []
        for extension in extensions:
            extension = extension.strip().lower()
            if not extension:
                continue
            if not extension.startswith('.'):
                extension = '.' + extension
            if extension not in normalized:
                normalized.append(extension)
        return normalized

    def _load_documents(self) -> None:
        """Read every matching, non-empty file below ``self.directory``."""
        if not self.directory.exists():
            raise FileNotFoundError(f"Directory not found: {self.directory}")

        wanted = set(self.extensions)
        # Sorted so that document order (and therefore which document is used
        # for sample chunks) does not depend on filesystem enumeration order.
        for file_path in sorted(self.directory.rglob('*')):
            if not file_path.is_file() or file_path.suffix.lower() not in wanted:
                continue
            try:
                # errors='ignore' drops undecodable bytes instead of failing,
                # so only I/O problems can surface here.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except OSError as e:
                print(f"Warning: Could not read {file_path}: {e}")
                continue
            if content.strip():  # Only include non-empty files
                self.documents.append({
                    'path': str(file_path),
                    'content': content,
                    'size': len(content),
                })

        if not self.documents:
            raise ValueError(f"No valid documents found in {self.directory}")

        total_chars = sum(d['size'] for d in self.documents)
        print(f"Loaded {len(self.documents)} documents totaling {total_chars:,} characters")
class ChunkingStrategy:
    """Common interface implemented by every chunking strategy.

    Attributes:
        name: Label identifying the strategy.
        config: The parameters the strategy was created with.
    """

    def __init__(self, name: str, config: Dict[str, Any]):
        self.name, self.config = name, config

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        """Split ``text`` into a list of chunk dictionaries.

        The base class provides no implementation; subclasses override this.

        Raises:
            NotImplementedError: Always, when called on the base class.
        """
        raise NotImplementedError
class FixedSizeChunker(ChunkingStrategy):
    """Split text into fixed-size windows with optional overlap.

    With ``unit='char'`` the window is measured in characters. Any other unit
    is treated as word-based: the text is split on whitespace and the window
    is measured in words (a rough stand-in for tokens).

    Each chunk dict has the keys ``id``, ``text``, ``start``, ``end`` and
    ``size``. ``size`` is always the character length of ``text``; ``start``
    and ``end`` are character offsets for ``unit='char'`` and word indices
    otherwise.

    Args:
        chunk_size: Window length in the chosen unit; must be at least 1.
        overlap: Number of units shared by consecutive windows; must not be
            negative. If it is not smaller than ``chunk_size`` the window
            still advances by one unit per chunk so the loop terminates.
        unit: ``'char'`` for character windows, anything else for words.

    Raises:
        ValueError: If ``chunk_size`` < 1 or ``overlap`` < 0.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 100, unit: str = 'char'):
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")
        if overlap < 0:
            raise ValueError(f"overlap must not be negative, got {overlap}")
        config = {'chunk_size': chunk_size, 'overlap': overlap, 'unit': unit}
        super().__init__(f'fixed_size_{unit}', config)
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.unit = unit

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        """Return the fixed-size chunks of ``text`` for the configured unit."""
        if self.unit == 'char':
            return self._chunk_by_chars(text)
        # word-based approximation
        return self._chunk_by_words(text.split())

    def _windows(self, total: int) -> List[Tuple[int, int]]:
        """Return the ``(start, end)`` windows covering ``total`` units.

        Iteration stops as soon as a window reaches the end of the input.
        Advancing once more would only produce a chunk lying entirely inside
        the previous chunk's overlap region.
        """
        step = max(self.chunk_size - self.overlap, 1)
        windows: List[Tuple[int, int]] = []
        start = 0
        while start < total:
            end = min(start + self.chunk_size, total)
            windows.append((start, end))
            if end >= total:
                break
            start += step
        return windows

    def _chunk_by_chars(self, text: str) -> List[Dict[str, Any]]:
        """Chunk ``text`` by character offsets."""
        chunks = []
        for chunk_id, (start, end) in enumerate(self._windows(len(text))):
            chunk_text = text[start:end]
            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'start': start,
                'end': end,
                'size': len(chunk_text),
            })
        return chunks

    def _chunk_by_words(self, words: List[str]) -> List[Dict[str, Any]]:
        """Chunk a word list; ``start``/``end`` are indices into ``words``."""
        chunks = []
        for chunk_id, (start, end) in enumerate(self._windows(len(words))):
            chunk_text = ' '.join(words[start:end])
            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'start': start,
                'end': end,
                'size': len(chunk_text),
            })
        return chunks
class SentenceChunker(ChunkingStrategy):
    """Group whole sentences into chunks of at most ``max_size`` characters.

    Sentences are detected with a simple heuristic: a run of ``.``, ``!`` or
    ``?`` followed by whitespace ends a sentence. Sentences are joined with a
    single space, and the joining spaces count towards ``max_size``. A single
    sentence longer than ``max_size`` is emitted as its own oversized chunk
    rather than being cut.

    Each chunk dict has the keys ``id``, ``text``, ``start``, ``end``,
    ``size`` and ``sentence_count``. ``start`` is always 0 and ``end`` equals
    ``size``: they describe the chunk text itself, not an offset into the
    source document.

    Args:
        max_size: Target upper bound for the character length of a chunk.
    """

    def __init__(self, max_size: int = 1000):
        config = {'max_size': max_size}
        super().__init__('sentence_based', config)
        self.max_size = max_size
        # Simple sentence boundary detection
        self.sentence_endings = re.compile(r'[.!?]+\s+')

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        """Return sentence-aligned chunks of ``text``."""
        chunks: List[Dict[str, Any]] = []
        current: List[str] = []
        current_size = 0  # length of ' '.join(current)

        for sentence in self._split_sentences(text):
            # +1 accounts for the space that will join the sentence on.
            if current and current_size + 1 + len(sentence) > self.max_size:
                chunks.append(self._make_chunk(len(chunks), current))
                current = []
            if current:
                current_size += 1 + len(sentence)
            else:
                current_size = len(sentence)
            current.append(sentence)

        # Add final chunk
        if current:
            chunks.append(self._make_chunk(len(chunks), current))
        return chunks

    @staticmethod
    def _make_chunk(chunk_id: int, sentences: List[str]) -> Dict[str, Any]:
        """Build the chunk dict for a group of sentences."""
        chunk_text = ' '.join(sentences)
        return {
            'id': chunk_id,
            'text': chunk_text,
            'start': 0,  # Approximate
            'end': len(chunk_text),
            'size': len(chunk_text),
            'sentence_count': len(sentences),
        }

    def _split_sentences(self, text: str) -> List[str]:
        """Split ``text`` into stripped, non-empty sentences.

        Each sentence keeps its terminating punctuation. The text is scanned
        once; the terminator matches are not recomputed per sentence.
        """
        sentences: List[str] = []
        position = 0
        for match in self.sentence_endings.finditer(text):
            # Slice up to the end of the match so the punctuation is kept;
            # strip() removes the whitespace that followed it.
            sentence = text[position:match.end()].strip()
            if sentence:
                sentences.append(sentence)
            position = match.end()

        # Add final part if it exists
        tail = text[position:].strip()
        if tail:
            sentences.append(tail)
        return sentences
class ParagraphChunker(ChunkingStrategy):
    """Group whole paragraphs into chunks of at most ``max_size`` characters.

    Paragraphs are separated by blank lines and joined back with a blank
    line; the separators count towards ``max_size``. Paragraphs shorter than
    ``min_paragraph_size`` are dropped, unless every paragraph is that short,
    in which case all of them are kept so the document still yields chunks.
    A single paragraph longer than ``max_size`` becomes its own oversized
    chunk.

    Each chunk dict has the keys ``id``, ``text``, ``start``, ``end``,
    ``size`` and ``paragraph_count``. ``start`` is always 0 and ``end``
    equals ``size``: they describe the chunk text, not a document offset.

    Args:
        max_size: Target upper bound for the character length of a chunk.
        min_paragraph_size: Paragraphs shorter than this are treated as noise.
    """

    _PARAGRAPH_BREAK = re.compile(r'\n\s*\n')
    _SEPARATOR = '\n\n'

    def __init__(self, max_size: int = 2000, min_paragraph_size: int = 50):
        config = {'max_size': max_size, 'min_paragraph_size': min_paragraph_size}
        super().__init__('paragraph_based', config)
        self.max_size = max_size
        self.min_paragraph_size = min_paragraph_size

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        """Return paragraph-aligned chunks of ``text``."""
        # Split by double newlines (paragraph boundaries)
        paragraphs = [p.strip() for p in self._PARAGRAPH_BREAK.split(text) if p.strip()]

        # Skip very short paragraphs unless they are the only content.
        substantial = [p for p in paragraphs if len(p) >= self.min_paragraph_size]
        kept = substantial or paragraphs

        separator_size = len(self._SEPARATOR)
        chunks: List[Dict[str, Any]] = []
        current: List[str] = []
        current_size = 0  # length of the joined text of ``current``

        for paragraph in kept:
            if current and current_size + separator_size + len(paragraph) > self.max_size:
                chunks.append(self._make_chunk(len(chunks), current))
                current = []
            if current:
                current_size += separator_size + len(paragraph)
            else:
                current_size = len(paragraph)
            current.append(paragraph)

        # Add final chunk
        if current:
            chunks.append(self._make_chunk(len(chunks), current))
        return chunks

    def _make_chunk(self, chunk_id: int, paragraphs: List[str]) -> Dict[str, Any]:
        """Build the chunk dict for a group of paragraphs."""
        chunk_text = self._SEPARATOR.join(paragraphs)
        return {
            'id': chunk_id,
            'text': chunk_text,
            'start': 0,
            'end': len(chunk_text),
            'size': len(chunk_text),
            'paragraph_count': len(paragraphs),
        }
class SemanticChunker(ChunkingStrategy):
    """Heading-aware chunking: split at headings, then by paragraph if needed.

    The text is divided into sections at Markdown headings. A section whose
    heading plus body fits in ``max_size`` becomes one chunk; a larger section
    is split at paragraph boundaries. The first chunk of a section is
    prefixed with the heading, every later chunk with
    ``"<heading> (continued)"``. Text before the first heading belongs to a
    section named ``'Introduction'`` (level 0).

    Each chunk dict has the keys ``id``, ``text``, ``start``, ``end``,
    ``size``, ``heading`` and ``level``. ``size`` is the length of ``text``
    including the heading prefix; ``start`` is always 0 and ``end`` equals
    ``size``.

    Args:
        max_size: Target upper bound for the character length of a chunk. A
            single paragraph longer than this is still emitted whole.
        heading_weight: Stored in ``config`` and on the instance; nothing in
            this class reads it.
    """

    # ATX heading: up to three spaces of indent, one to six '#', then either
    # whitespace and the title or the end of the line ('#tag' is not a heading).
    _ATX_HEADING = re.compile(r'^ {0,3}(#{1,6})(?:[ \t]+(.*))?$')
    # Optional closing run of '#' on an ATX heading, e.g. '## Title ##'.
    _ATX_CLOSING = re.compile(r'(?:^|[ \t]+)#+[ \t]*$')
    # Setext underline: a line made only of '=' or only of '-'.
    _SETEXT_UNDERLINE = re.compile(r'^ {0,3}(=+|-+)[ \t]*$')
    # Opening code fence; a backtick fence may not contain further backticks.
    _FENCE_OPEN = re.compile(r'^ {0,3}(`{3,}(?=[^`]*$)|~{3,})')
    _PARAGRAPH_BREAK = re.compile(r'\n\s*\n')
    # Longest line that is still accepted as a setext heading title.
    _MAX_SETEXT_TITLE = 100

    def __init__(self, max_size: int = 1500, heading_weight: float = 2.0):
        config = {'max_size': max_size, 'heading_weight': heading_weight}
        super().__init__('semantic_heading', config)
        self.max_size = max_size
        self.heading_weight = heading_weight
        # Markdown and plain text heading patterns. Kept as a public
        # attribute for compatibility; section detection below uses the
        # class-level patterns and does not read this list.
        self.heading_patterns = [
            re.compile(r'^#{1,6}\s+(.+)$', re.MULTILINE),  # Markdown headers
            re.compile(r'^(.+)\n[=-]+\s*$', re.MULTILINE),  # Underlined headers
            re.compile(r'^\d+\.\s*(.+)$', re.MULTILINE),  # Numbered sections
        ]

    def chunk(self, text: str) -> List[Dict[str, Any]]:
        """Return heading-aware chunks of ``text`` with consecutive ids."""
        chunks: List[Dict[str, Any]] = []
        for section in self._identify_sections(text):
            # Ids continue across sections, so the next id is the chunk count.
            chunks.extend(self._chunk_section(section, len(chunks)))
        return chunks

    def _identify_sections(self, text: str) -> List[Dict[str, Any]]:
        """Split ``text`` into sections at headings.

        Recognised headings:
          * ATX headings (``# Title`` .. ``###### Title``);
          * setext headings: a non-blank line shorter than 100 characters
            directly followed by a line of only ``=`` (level 1) or only ``-``
            (level 2).

        Lines inside fenced code blocks and a leading ``---`` front-matter
        block are always treated as body text, so ``# comment`` lines in code
        samples do not start new sections.

        Returns:
            A list of dicts with the keys ``heading``, ``content`` and
            ``level``. Sections whose body is empty are omitted.
        """
        sections: List[Dict[str, Any]] = []
        heading, level = 'Introduction', 0
        body: List[str] = []

        lines = text.split('\n')
        first_line = 0
        # Front matter: a '---' first line closed by a later '---' or '...'.
        # Without this, its closing line would look like a setext underline.
        if lines[0].strip() == '---':
            for index in range(1, len(lines)):
                if lines[index].strip() in ('---', '...'):
                    body = lines[:index + 1]
                    first_line = index + 1
                    break

        fence = ''  # opening fence run while inside a fenced code block
        for line in lines[first_line:]:
            stripped = line.strip()

            if fence:
                body.append(line)
                # Closed by a line made only of the fence character that is
                # at least as long as the opening run.
                if stripped.startswith(fence) and not stripped.strip(fence[0]):
                    fence = ''
                continue

            fence_open = self._FENCE_OPEN.match(line)
            if fence_open:
                fence = fence_open.group(1)
                body.append(line)
                continue

            atx = self._ATX_HEADING.match(line)
            if atx:
                self._append_section(sections, heading, level, body)
                title = (atx.group(2) or '').strip()
                heading = self._ATX_CLOSING.sub('', title).strip()
                level = len(atx.group(1))
                body = []
                continue

            underline = self._SETEXT_UNDERLINE.match(line)
            if underline and body:
                # Only the line directly above counts: a rule that follows a
                # blank line is a thematic break, not a heading underline.
                candidate = body[-1].strip()
                if candidate and len(candidate) < self._MAX_SETEXT_TITLE:
                    body.pop()
                    self._append_section(sections, heading, level, body)
                    heading = candidate
                    level = 1 if underline.group(1)[0] == '=' else 2
                    body = []
                    continue

            body.append(line)

        # Add final section
        self._append_section(sections, heading, level, body)
        return sections

    @staticmethod
    def _append_section(sections: List[Dict[str, Any]], heading: str,
                        level: int, body: List[str]) -> None:
        """Append a section to ``sections`` unless its body is blank."""
        content = '\n'.join(body)
        if content.strip():
            sections.append({'heading': heading, 'content': content, 'level': level})

    def _chunk_section(self, section: Dict[str, Any], start_id: int) -> List[Dict[str, Any]]:
        """Chunk a single section.

        Args:
            section: Dict with ``heading``, ``content`` and ``level``.
            start_id: Id to give the first chunk; later chunks count up.

        Returns:
            The section's chunks, or an empty list if the body is blank.
        """
        content = section['content'].strip()
        if not content:
            return []

        heading = section['heading']
        level = section['level']

        def title_for(chunk_id: int) -> str:
            # An empty heading stays empty; otherwise only the first chunk
            # carries the plain heading.
            if not heading or chunk_id == start_id:
                return heading
            return f"{heading} (continued)"

        def prefix_size(chunk_id: int) -> int:
            title = title_for(chunk_id)
            return len(title) + 2 if title else 0  # +2 for the blank line

        def build(chunk_id: int, chunk_body: str) -> Dict[str, Any]:
            title = title_for(chunk_id)
            chunk_text = f"{title}\n\n{chunk_body}" if title else chunk_body
            return {
                'id': chunk_id,
                'text': chunk_text,
                'start': 0,
                'end': len(chunk_text),
                'size': len(chunk_text),
                'heading': title,
                'level': level,
            }

        # If the section is small enough, return it as a single chunk.
        if prefix_size(start_id) + len(content) <= self.max_size:
            return [build(start_id, content)]

        # Split large sections by paragraphs
        paragraphs = [p.strip() for p in self._PARAGRAPH_BREAK.split(content) if p.strip()]
        chunks: List[Dict[str, Any]] = []
        chunk_id = start_id
        current: List[str] = []
        current_size = 0  # length of the chunk text if ``current`` were flushed

        for paragraph in paragraphs:
            if current and current_size + 2 + len(paragraph) > self.max_size:
                chunks.append(build(chunk_id, '\n\n'.join(current)))
                chunk_id += 1
                current = []
            if current:
                current_size += 2 + len(paragraph)  # Account for newlines
            else:
                current_size = prefix_size(chunk_id) + len(paragraph)
            current.append(paragraph)

        # Add final chunk
        if current:
            chunks.append(build(chunk_id, '\n\n'.join(current)))
        return chunks
class ChunkAnalyzer:
    """Compute quality metrics for a list of chunk dictionaries.

    Every chunk must provide a ``'text'`` string and a numeric ``'size'``.
    """

    def __init__(self):
        self.vocabulary = set()
        self.word_freq = Counter()

    def analyze_chunks(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Return size, boundary, coherence and vocabulary metrics.

        Returns:
            ``{'error': 'No chunks to analyze'}`` for an empty list, otherwise
            a dict with the keys ``size_statistics``, ``boundary_quality``,
            ``semantic_coherence`` and ``vocabulary_statistics``.
        """
        if not chunks:
            return {'error': 'No chunks to analyze'}

        sizes = [entry['size'] for entry in chunks]
        # stdev needs at least two samples; a lone chunk has no spread.
        spread = statistics.stdev(sizes) if len(sizes) > 1 else 0

        return {
            'size_statistics': {
                'count': len(chunks),
                'mean': statistics.mean(sizes),
                'median': statistics.median(sizes),
                'std': spread,
                'min': min(sizes),
                'max': max(sizes),
                'total': sum(sizes),
            },
            'boundary_quality': self._analyze_boundary_quality(chunks),
            'semantic_coherence': self._calculate_semantic_coherence(chunks),
            'vocabulary_statistics': self._analyze_vocabulary(chunks),
        }

    def _analyze_boundary_quality(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Measure how often chunks end on a sentence or word boundary.

        Only the stripped chunk text is inspected. A chunk counts as a
        sentence break when it ends in ``.``, ``!`` or ``?``, and as a word
        break when its last character is alphanumeric or one of those marks.
        Blank chunks count towards the total but towards neither tally.
        """
        ends_sentence = re.compile(r'[.!?]\s*$')
        trimmed = (entry['text'].strip() for entry in chunks)
        non_blank = [text for text in trimmed if text]

        sentence_hits = sum(1 for text in non_blank if ends_sentence.search(text))
        word_hits = sum(
            1 for text in non_blank
            if text[-1].isalnum() or text[-1] in '.!?'
        )

        chunk_total = len(chunks)
        has_chunks = chunk_total > 0
        return {
            'sentence_boundary_ratio': sentence_hits / chunk_total if has_chunks else 0,
            'word_boundary_ratio': word_hits / chunk_total if has_chunks else 0,
            'clean_breaks': sentence_hits,
            'total_chunks': chunk_total,
        }

    def _calculate_semantic_coherence(self, chunks: List[Dict[str, Any]]) -> float:
        """Return the mean Jaccard similarity of adjacent chunks' word sets.

        Fewer than two chunks give 1.0. Pairs in which either chunk has no
        words are skipped; if no pair remains the result is 0.0.
        """
        if len(chunks) < 2:
            return 1.0

        # Tokenise each chunk once; every inner chunk belongs to two pairs.
        word_sets = [
            set(re.findall(r'\b\w+\b', entry['text'].lower()))
            for entry in chunks
        ]

        similarities = []
        for left, right in zip(word_sets, word_sets[1:]):
            if left and right:
                similarities.append(len(left & right) / len(left | right))

        if not similarities:
            return 0.0
        return statistics.mean(similarities)

    def _analyze_vocabulary(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Summarise word usage across all chunks.

        Returns:
            A dict with ``total_vocabulary`` (distinct words overall),
            ``avg_chunk_vocabulary`` (mean distinct words per chunk),
            ``vocabulary_diversity`` (distinct / total words) and
            ``most_common_words`` (up to ten ``(word, count)`` pairs).
        """
        tokens_by_chunk = [
            re.findall(r'\b\w+\b', entry['text'].lower())
            for entry in chunks
        ]
        every_token = [token for tokens in tokens_by_chunk for token in tokens]
        distinct_per_chunk = [len(set(tokens)) for tokens in tokens_by_chunk]
        distinct_total = len(set(every_token))

        return {
            'total_vocabulary': distinct_total,
            'avg_chunk_vocabulary': statistics.mean(distinct_per_chunk) if distinct_per_chunk else 0,
            'vocabulary_diversity': distinct_total / len(every_token) if every_token else 0,
            'most_common_words': Counter(every_token).most_common(10),
        }
class ChunkingOptimizer:
    """Run every configured chunking strategy over a corpus and pick the best.

    Results are keyed by a label built from the strategy name *and* its
    parameters (for example
    ``fixed_size_char(chunk_size=512, overlap=50, unit=char)``). The bare
    strategy name is shared by all configurations of one family, so using it
    as the key would keep only the last configuration tested per family.
    """

    def __init__(self):
        self.analyzer = ChunkAnalyzer()
        # Strategy instances of the most recent optimize() run, by result key,
        # so sample chunks can be produced with the exact winning settings.
        self._strategies: Dict[str, Any] = {}

    def optimize(self, corpus: 'DocumentCorpus',
                 config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Test all chunking strategies and recommend the best one.

        Args:
            corpus: Loaded documents; its ``documents`` list is read.
            config: Optional overrides for the parameter grids, see
                ``_create_strategies``.

        Returns:
            A dict with ``corpus_info``, ``strategy_results`` (one entry per
            tested configuration), ``recommendation`` and ``sample_chunks``.
        """
        config = config or {}
        strategies = self._create_strategies(config)
        self._strategies = {}
        results: Dict[str, Any] = {}
        print(f"Testing {len(strategies)} chunking strategies...")
        for strategy in strategies:
            key = self._strategy_key(strategy)
            print(f" Testing {key}...")
            self._strategies[key] = strategy
            results[key] = self._test_strategy(corpus, strategy)

        # Recommend best strategy
        recommendation = self._recommend_strategy(results)
        document_sizes = [d['size'] for d in corpus.documents]
        return {
            'corpus_info': {
                'document_count': len(corpus.documents),
                'total_size': sum(document_sizes),
                'avg_document_size': statistics.mean(document_sizes),
            },
            'strategy_results': results,
            'recommendation': recommendation,
            'sample_chunks': self._generate_sample_chunks(corpus, recommendation['best_strategy']),
        }

    @staticmethod
    def _strategy_key(strategy: Any) -> str:
        """Return a label that is unique per strategy configuration."""
        params = ', '.join(f"{key}={value}" for key, value in strategy.config.items())
        return f"{strategy.name}({params})" if params else strategy.name

    def _create_strategies(self, config: Dict[str, Any]) -> List['ChunkingStrategy']:
        """Create all chunking strategies to test.

        Recognised ``config`` keys (each a list of ints, defaults in
        brackets): ``fixed_sizes`` [512, 1000, 1500], ``overlaps`` [50, 100],
        ``sentence_max_sizes`` [800, 1200], ``paragraph_max_sizes``
        [1500, 2000] and ``semantic_max_sizes`` [1200, 1800].
        """
        strategies: List[Any] = []
        # Fixed-size strategies: every size/overlap combination
        overlaps = config.get('overlaps', [50, 100])
        for size in config.get('fixed_sizes', [512, 1000, 1500]):
            for overlap in overlaps:
                strategies.append(FixedSizeChunker(size, overlap, 'char'))
        # Sentence-based strategies
        for max_size in config.get('sentence_max_sizes', [800, 1200]):
            strategies.append(SentenceChunker(max_size))
        # Paragraph-based strategies
        for max_size in config.get('paragraph_max_sizes', [1500, 2000]):
            strategies.append(ParagraphChunker(max_size))
        # Semantic strategies
        for max_size in config.get('semantic_max_sizes', [1200, 1800]):
            strategies.append(SemanticChunker(max_size))
        return strategies

    def _test_strategy(self, corpus: 'DocumentCorpus',
                       strategy: 'ChunkingStrategy') -> Dict[str, Any]:
        """Chunk every document with ``strategy`` and analyse the result.

        A document that raises while being chunked or analysed is reported
        and skipped so that one bad file does not abort the whole run.
        """
        all_chunks: List[Dict[str, Any]] = []
        document_results = []
        for doc in corpus.documents:
            try:
                chunks = strategy.chunk(doc['content'])
                doc_analysis = self.analyzer.analyze_chunks(chunks)
            except Exception as e:
                print(f" Error processing {doc['path']}: {e}")
                continue
            all_chunks.extend(chunks)
            document_results.append({
                'path': doc['path'],
                'chunk_count': len(chunks),
                'analysis': doc_analysis,
            })

        # Overall analysis.
        # NOTE(review): chunks of all documents are analysed as one sequence,
        # so the coherence metric also compares the last chunk of one
        # document with the first chunk of the next.
        overall_analysis = self.analyzer.analyze_chunks(all_chunks)
        return {
            'strategy_name': strategy.name,
            'strategy_config': strategy.config,
            'total_chunks': len(all_chunks),
            'overall_analysis': overall_analysis,
            'document_results': document_results,
            'performance_score': self._calculate_performance_score(overall_analysis),
        }

    def _calculate_performance_score(self, analysis: Dict[str, Any]) -> float:
        """Combine the metrics of ``analysis`` into one score.

        The score is ``0.3 * size consistency + 0.4 * boundary score +
        0.3 * coherence``, where size consistency is one minus the
        coefficient of variation (capped at 1) and the boundary score is the
        mean of the sentence and word boundary ratios. An analysis that
        carries an ``'error'`` key scores 0.0.
        """
        if 'error' in analysis:
            return 0.0
        size_stats = analysis['size_statistics']
        boundary_quality = analysis['boundary_quality']

        # Normalize metrics to 0-1 range and combine
        if size_stats['mean'] > 0:
            size_consistency = 1.0 - min(size_stats['std'] / size_stats['mean'], 1.0)
        else:
            size_consistency = 0
        boundary_score = (
            boundary_quality['sentence_boundary_ratio']
            + boundary_quality['word_boundary_ratio']
        ) / 2
        coherence_score = analysis['semantic_coherence']

        # Weighted combination
        return size_consistency * 0.3 + boundary_score * 0.4 + coherence_score * 0.3

    def _recommend_strategy(self, results: Dict[str, Any]) -> Dict[str, Any]:
        """Pick the highest-scoring entry of ``results``.

        Only a score above zero can win; if none qualifies,
        ``best_strategy`` is ``None`` and ``best_score`` is 0.
        """
        best_strategy = None
        best_score = 0
        strategy_scores = {}
        for strategy_name, result in results.items():
            score = result['performance_score']
            strategy_scores[strategy_name] = score
            if score > best_score:
                best_score = score
                best_strategy = strategy_name

        best_result = results[best_strategy] if best_strategy else None
        return {
            'best_strategy': best_strategy,
            'best_score': best_score,
            'all_scores': strategy_scores,
            'reasoning': self._generate_reasoning(best_strategy, best_result),
        }

    def _generate_reasoning(self, strategy_name: str, result: Dict[str, Any]) -> str:
        """Generate human-readable reasoning for the recommendation."""
        if not result:
            return "No valid strategy found."
        analysis = result['overall_analysis']
        size_stats = analysis['size_statistics']
        boundary = analysis['boundary_quality']
        return ''.join([
            f"Recommended '{strategy_name}' because:\n",
            f"- Average chunk size: {size_stats['mean']:.0f} characters\n",
            f"- Size consistency: {size_stats['std']:.0f} std deviation\n",
            f"- Boundary quality: {boundary['sentence_boundary_ratio']:.2%} clean sentence breaks\n",
            f"- Semantic coherence: {analysis['semantic_coherence']:.3f}\n",
        ])

    def _generate_sample_chunks(self, corpus: 'DocumentCorpus',
                                strategy_name: str) -> List[Dict[str, Any]]:
        """Return up to three chunks of the first document.

        The strategy instance recorded under ``strategy_name`` by the last
        ``optimize`` run is used, so the samples reflect the recommended
        parameters. For a name that was not recorded, a default-configured
        strategy of the matching family is used instead.
        """
        if not strategy_name or not corpus.documents:
            return []

        strategy = self._strategies.get(strategy_name)
        if strategy is None:
            if 'fixed_size' in strategy_name:
                strategy = FixedSizeChunker()
            elif 'sentence' in strategy_name:
                strategy = SentenceChunker()
            elif 'paragraph' in strategy_name:
                strategy = ParagraphChunker()
            elif 'semantic' in strategy_name:
                strategy = SemanticChunker()
        if not strategy:
            return []

        # Get chunks from first document and return the first 3 as samples
        sample_doc = corpus.documents[0]
        return strategy.chunk(sample_doc['content'])[:3]
def main(argv: Optional[List[str]] = None) -> int:
    """Run the command-line interface.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``; ``None``
            (the default) uses the process arguments.

    Returns:
        0 on success, 1 if anything failed. Failures (including a missing or
        malformed configuration file) are reported on stderr.
    """
    import sys  # local import: only needed here, for the error stream

    parser = argparse.ArgumentParser(description='Analyze documents and recommend optimal chunking strategy')
    parser.add_argument('directory', help='Directory containing text/markdown documents')
    parser.add_argument('--output', '-o', help='Output file for results (JSON format)')
    parser.add_argument('--config', '-c', help='Configuration file (JSON format)')
    parser.add_argument('--extensions', nargs='+', default=['.txt', '.md', '.markdown'],
                        help='File extensions to process')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
    args = parser.parse_args(argv)

    try:
        # Load configuration. A path that was given but does not exist is an
        # error: silently falling back to defaults would hide a typo.
        config: Dict[str, Any] = {}
        if args.config:
            if not os.path.exists(args.config):
                raise FileNotFoundError(f"Configuration file not found: {args.config}")
            with open(args.config, 'r', encoding='utf-8') as f:
                config = json.load(f)
            if not isinstance(config, dict):
                raise ValueError(f"Configuration file must contain a JSON object: {args.config}")

        # Load corpus
        print(f"Loading documents from {args.directory}...")
        corpus = DocumentCorpus(args.directory, args.extensions)

        # Run optimization
        optimizer = ChunkingOptimizer()
        results = optimizer.optimize(corpus, config)

        # Save results
        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2)
            print(f"Results saved to {args.output}")

        # Print summary
        print("\n" + "="*60)
        print("CHUNKING OPTIMIZATION RESULTS")
        print("="*60)
        corpus_info = results['corpus_info']
        print(f"Corpus: {corpus_info['document_count']} documents, {corpus_info['total_size']:,} characters")
        recommendation = results['recommendation']
        print(f"\nRecommended Strategy: {recommendation['best_strategy']}")
        print(f"Performance Score: {recommendation['best_score']:.3f}")
        print(f"\nReasoning:\n{recommendation['reasoning']}")

        if args.verbose:
            print("\nAll Strategy Scores:")
            for strategy, score in recommendation['all_scores'].items():
                print(f" {strategy}: {score:.3f}")
            print("\nSample Chunks:")
            for i, chunk in enumerate(results['sample_chunks'][:2]):
                print(f"\nChunk {i+1} ({chunk['size']} chars):")
                print("-" * 40)
                # Long samples are truncated to 200 characters for display.
                preview = chunk['text']
                if len(preview) > 200:
                    preview = preview[:200] + "..."
                print(preview)
    except Exception as e:
        # Top-level boundary of the CLI: report and signal failure.
        print(f"Error: {e}", file=sys.stderr)
        return 1
    return 0
if __name__ == '__main__':
    # SystemExit instead of exit(): exit() is a helper added by the site
    # module for interactive use and is not available in every interpreter
    # configuration (for example when started with -S).
    raise SystemExit(main())