add brain

This commit is contained in:
2026-03-12 15:17:52 +07:00
parent fd9f558fa1
commit e7821a7a9d
355 changed files with 93784 additions and 24 deletions

View File

@@ -0,0 +1,684 @@
#!/usr/bin/env python3
"""
Tech Debt Scanner
Scans a codebase directory for tech debt signals using AST parsing (Python) and
regex patterns (any language). Detects various forms of technical debt and generates
both JSON inventory and human-readable reports.
Usage:
python debt_scanner.py /path/to/codebase
python debt_scanner.py /path/to/codebase --config config.json
python debt_scanner.py /path/to/codebase --output report.json --format both
"""
import ast
import json
import argparse
import os
import re
import sys
from collections import defaultdict, Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional, Set, Tuple
class DebtScanner:
    """Scan a directory tree for technical-debt signals.

    Python files are analyzed with the ``ast`` module (via
    ``PythonASTAnalyzer``); other recognized languages are checked with
    regex/heuristic patterns.  Findings are collected as "debt items" and
    summarized into a JSON-serializable report.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create a scanner, merging ``config`` over the built-in defaults."""
        self.config = self._load_default_config()
        if config:
            self.config.update(config)
        self.debt_items: List[Dict[str, Any]] = []
        self.stats = defaultdict(int)
        self.file_stats: Dict[str, Dict[str, Any]] = {}
        # Compile regex patterns once up front for performance.
        self._compile_patterns()

    def _load_default_config(self) -> Dict[str, Any]:
        """Return the default configuration for debt detection."""
        return {
            "max_function_length": 50,
            "max_complexity": 10,
            "max_nesting_depth": 4,
            "max_file_size_lines": 500,
            "min_duplicate_lines": 3,
            "ignore_patterns": [
                "*.pyc", "__pycache__", ".git", ".svn", "node_modules",
                "build", "dist", "*.min.js", "*.map"
            ],
            "file_extensions": {
                "python": [".py"],
                "javascript": [".js", ".jsx", ".ts", ".tsx"],
                "java": [".java"],
                "csharp": [".cs"],
                "cpp": [".cpp", ".cc", ".cxx", ".c", ".h", ".hpp"],
                "ruby": [".rb"],
                "php": [".php"],
                "go": [".go"],
                "rust": [".rs"],
                "kotlin": [".kt"]
            },
            "comment_patterns": {
                "todo": r"(?i)(TODO|FIXME|HACK|XXX|BUG)[\s:]*(.+)",
                "commented_code": r"^\s*#.*[=(){}\[\];].*",
                "magic_numbers": r"\b\d{2,}\b",
                "long_strings": r'["\'](.{100,})["\']'
            },
            "severity_weights": {
                "critical": 10,
                "high": 7,
                "medium": 5,
                "low": 2,
                "info": 1
            }
        }

    def _compile_patterns(self):
        """Compile configured regex patterns for better performance."""
        self.comment_regexes = {}
        for name, pattern in self.config["comment_patterns"].items():
            self.comment_regexes[name] = re.compile(pattern)
        # Common code-smell patterns applied to whole-file content.
        self.smell_patterns = {
            "empty_catch": re.compile(r"except[^:]*:\s*pass\s*$", re.MULTILINE),
            "print_debug": re.compile(r"print\s*\([^)]*debug[^)]*\)", re.IGNORECASE),
            "hardcoded_paths": re.compile(r'["\'][/\\][^"\']*[/\\][^"\']*["\']'),
            "sql_injection_risk": re.compile(r'["\'].*%s.*["\'].*execute', re.IGNORECASE),
        }

    def scan_directory(self, directory: str) -> Dict[str, Any]:
        """Scan a directory for tech debt.

        Args:
            directory: Path to the directory to scan.

        Returns:
            Dictionary containing the debt inventory and statistics.

        Raises:
            ValueError: If ``directory`` does not exist.
        """
        directory_path = Path(directory)
        if not directory_path.exists():
            raise ValueError(f"Directory does not exist: {directory}")
        print(f"Scanning directory: {directory}")
        print("=" * 50)
        # Reset state so the scanner instance can be reused.
        self.debt_items = []
        self.stats = defaultdict(int)
        self.file_stats = {}
        # Walk through the directory tree.
        for root, dirs, files in os.walk(directory):
            # Prune ignored directories in place so os.walk skips them.
            dirs[:] = [d for d in dirs if not self._should_ignore(d)]
            for file in files:
                if self._should_ignore(file):
                    continue
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, directory)
                try:
                    self._scan_file(file_path, relative_path)
                except Exception as e:
                    # Best-effort scan: record the failure and keep going.
                    print(f"Error scanning {relative_path}: {e}")
                    self.stats["scan_errors"] += 1
        # Post-process results.
        self._detect_duplicates(directory)
        self._calculate_priorities()
        return self._generate_report(directory)

    def _should_ignore(self, name: str) -> bool:
        """Return True if a file/directory name matches an ignore pattern.

        Patterns containing ``*`` are treated as globs; other patterns match
        by substring (so ``.git`` also ignores ``.gitignore`` — deliberately
        loose).
        """
        for pattern in self.config["ignore_patterns"]:
            if "*" in pattern:
                # Escape regex metacharacters first so e.g. the "." in
                # "*.pyc" is literal (the previous code produced ".*.pyc",
                # where the dot matched any character), then turn the glob
                # "*" into ".*" and require a full match.
                regex = re.escape(pattern).replace(r"\*", ".*")
                if re.fullmatch(regex, name):
                    return True
            elif pattern in name:
                return True
        return False

    def _scan_file(self, file_path: str, relative_path: str):
        """Scan a single file for tech debt and record its statistics."""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                lines = content.splitlines()
        except Exception as e:
            print(f"Cannot read {relative_path}: {e}")
            return
        file_ext = Path(file_path).suffix.lower()
        file_info = {
            "path": relative_path,
            "lines": len(lines),
            "size_kb": os.path.getsize(file_path) / 1024,
            "language": self._detect_language(file_ext),
            "debt_count": 0
        }
        self.stats["files_scanned"] += 1
        self.stats["total_lines"] += len(lines)
        # File-size debt.
        if len(lines) > self.config["max_file_size_lines"]:
            self._add_debt_item(
                "large_file",
                f"File is too large: {len(lines)} lines",
                relative_path,
                "medium",
                {"lines": len(lines), "recommended_max": self.config["max_file_size_lines"]}
            )
            file_info["debt_count"] += 1
        # Language-specific analysis: Python goes through AST parsing,
        # everything else through the generic heuristics.
        if file_info["language"] == "python" and file_ext == ".py":
            self._scan_python_file(relative_path, content, lines)
        else:
            self._scan_generic_file(relative_path, content, lines, file_info["language"])
        # Common patterns for all languages.
        self._scan_common_patterns(relative_path, content, lines)
        self.file_stats[relative_path] = file_info

    def _detect_language(self, file_ext: str) -> str:
        """Detect programming language from a (lowercased) file extension."""
        for lang, extensions in self.config["file_extensions"].items():
            if file_ext in extensions:
                return lang
        return "unknown"

    def _scan_python_file(self, file_path: str, content: str, lines: List[str]):
        """Scan a Python file using AST parsing; a syntax error becomes debt."""
        try:
            tree = ast.parse(content)
            analyzer = PythonASTAnalyzer(self.config)
            debt_items = analyzer.analyze(tree, file_path, lines)
            self.debt_items.extend(debt_items)
            self.stats["python_files"] += 1
        except SyntaxError as e:
            self._add_debt_item(
                "syntax_error",
                f"Python syntax error: {e}",
                file_path,
                "high",
                {"line": e.lineno, "error": str(e)}
            )

    def _scan_generic_file(self, file_path: str, content: str, lines: List[str], language: str):
        """Scan non-Python files with line-based heuristics.

        Flags over-long lines for every language and, for brace-delimited
        languages, approximates nesting depth by tracking a running count of
        unmatched '{' characters.  (The previous implementation re-scanned
        the whole file prefix with ``content.find`` for every line — O(n^2)
        and unreliable — and computed an unused indent level for a Python
        branch that was unreachable, since .py files go to the AST path.)
        """
        # Long lines (any language).
        for i, line in enumerate(lines):
            if len(line) > 120:
                self._add_debt_item(
                    "long_line",
                    f"Line too long: {len(line)} characters",
                    file_path,
                    "low",
                    {"line_number": i + 1, "length": len(line)}
                )
        # Approximate nesting for brace-based languages.  Braces inside
        # strings/comments are counted too — this is a cheap heuristic,
        # not a parser.
        if language in ["javascript", "java", "csharp", "cpp"]:
            max_depth = self.config["max_nesting_depth"]
            depth = 0
            for i, line in enumerate(lines):
                depth += line.count('{') - line.count('}')
                if depth > max_depth:
                    self._add_debt_item(
                        "deep_nesting",
                        f"Deep nesting detected: {depth} levels",
                        file_path,
                        "medium",
                        {"line_number": i + 1, "nesting_level": depth}
                    )

    def _scan_common_patterns(self, file_path: str, content: str, lines: List[str]):
        """Scan for language-agnostic patterns (TODO comments, code smells)."""
        # TODO/FIXME-style comments.  Only the "todo" comment pattern
        # produces a debt item; the other configured comment patterns are
        # currently informational and unused, so we only run the one regex
        # instead of all of them per line.
        todo_regex = self.comment_regexes.get("todo")
        if todo_regex is not None:
            for i, line in enumerate(lines):
                match = todo_regex.search(line)
                if match:
                    self._add_debt_item(
                        "todo_comment",
                        f"TODO/FIXME comment: {match.group(0)}",
                        file_path,
                        "low",
                        {"line_number": i + 1, "comment": match.group(0).strip()}
                    )
        # Code smells detected over the whole file content.
        for smell_name, pattern in self.smell_patterns.items():
            for match in pattern.finditer(content):
                line_num = content[:match.start()].count('\n') + 1
                self._add_debt_item(
                    smell_name,
                    f"Code smell detected: {smell_name}",
                    file_path,
                    "medium",
                    {"line_number": line_num, "pattern": match.group(0)[:100]}
                )

    def _detect_duplicates(self, directory: str):
        """Detect duplicate code blocks across the scanned files.

        Simple exact-match detection: every sliding window of
        ``min_duplicate_lines`` lines is hashed, and windows that appear in
        more than one place are reported.  ``hash()`` is salted per process
        but stable within a single run, which is all this needs.
        """
        line_hashes = defaultdict(list)
        for file_path, file_info in self.file_stats.items():
            try:
                full_path = os.path.join(directory, file_path)
                with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()
                for i in range(len(lines) - self.config["min_duplicate_lines"] + 1):
                    block = ''.join(lines[i:i + self.config["min_duplicate_lines"]])
                    block_hash = hash(block.strip())
                    if len(block.strip()) > 50:  # Only consider substantial blocks
                        line_hashes[block_hash].append((file_path, i + 1, block))
            except Exception:
                # Best-effort: unreadable files are simply skipped here.
                continue
        # Report each occurrence of every duplicated block.
        for block_hash, occurrences in line_hashes.items():
            if len(occurrences) > 1:
                for file_path, line_num, block in occurrences:
                    self._add_debt_item(
                        "duplicate_code",
                        f"Duplicate code block found in {len(occurrences)} files",
                        file_path,
                        "medium",
                        {
                            "line_number": line_num,
                            "duplicate_count": len(occurrences),
                            "other_files": [f[0] for f in occurrences if f[0] != file_path]
                        }
                    )

    def _calculate_priorities(self):
        """Assign a priority score and category to every debt item."""
        severity_weights = self.config["severity_weights"]
        for item in self.debt_items:
            base_score = severity_weights.get(item["severity"], 1)
            # Weight the base severity by debt type.
            type_multipliers = {
                "syntax_error": 2.0,
                "security_risk": 1.8,
                "large_function": 1.5,
                "high_complexity": 1.4,
                "duplicate_code": 1.3,
                "todo_comment": 0.5
            }
            multiplier = type_multipliers.get(item["type"], 1.0)
            item["priority_score"] = int(base_score * multiplier)
            # Bucket the score into a priority category.
            if item["priority_score"] >= 15:
                item["priority"] = "critical"
            elif item["priority_score"] >= 10:
                item["priority"] = "high"
            elif item["priority_score"] >= 5:
                item["priority"] = "medium"
            else:
                item["priority"] = "low"

    def _add_debt_item(self, debt_type: str, description: str, file_path: str,
                       severity: str, metadata: Dict[str, Any]):
        """Append a debt item to the inventory and update counters."""
        item = {
            "id": f"DEBT-{len(self.debt_items) + 1:04d}",
            "type": debt_type,
            "description": description,
            "file_path": file_path,
            "severity": severity,
            "metadata": metadata,
            "detected_date": datetime.now().isoformat(),
            "status": "identified"
        }
        self.debt_items.append(item)
        self.stats[f"debt_{debt_type}"] += 1
        self.stats["total_debt_items"] += 1
        if file_path in self.file_stats:
            self.file_stats[file_path]["debt_count"] += 1

    def _generate_report(self, directory: str) -> Dict[str, Any]:
        """Build the final report dict (metadata, summary, items, stats)."""
        # Sort debt items so the highest-priority items come first.
        self.debt_items.sort(key=lambda x: x.get("priority_score", 0), reverse=True)
        # Summary statistics.
        priority_counts = Counter(item["priority"] for item in self.debt_items)
        type_counts = Counter(item["type"] for item in self.debt_items)
        # Health score (0-100, higher is better), derived from debt density.
        total_files = self.stats.get("files_scanned", 1)
        debt_density = len(self.debt_items) / total_files
        health_score = max(0, 100 - (debt_density * 10))
        report = {
            "scan_metadata": {
                "directory": directory,
                "scan_date": datetime.now().isoformat(),
                "scanner_version": "1.0.0",
                "config": self.config
            },
            "summary": {
                "total_files_scanned": self.stats.get("files_scanned", 0),
                "total_lines_scanned": self.stats.get("total_lines", 0),
                "total_debt_items": len(self.debt_items),
                "health_score": round(health_score, 1),
                "debt_density": round(debt_density, 2),
                "priority_breakdown": dict(priority_counts),
                "type_breakdown": dict(type_counts)
            },
            "debt_items": self.debt_items,
            "file_statistics": self.file_stats,
            "recommendations": self._generate_recommendations()
        }
        return report

    def _generate_recommendations(self) -> List[str]:
        """Generate actionable recommendations based on the findings."""
        recommendations = []
        # Priority-based recommendations.
        high_priority_count = len([item for item in self.debt_items
                                   if item.get("priority") in ["critical", "high"]])
        if high_priority_count > 10:
            recommendations.append(
                f"Address {high_priority_count} high-priority debt items immediately - "
                "they pose significant risk to code quality and maintainability."
            )
        # Type-specific recommendations.
        type_counts = Counter(item["type"] for item in self.debt_items)
        if type_counts.get("large_function", 0) > 5:
            recommendations.append(
                "Consider refactoring large functions into smaller, more focused units. "
                "This will improve readability and testability."
            )
        if type_counts.get("duplicate_code", 0) > 3:
            recommendations.append(
                "Extract duplicate code into reusable functions or modules. "
                "This reduces maintenance burden and potential for inconsistent changes."
            )
        if type_counts.get("todo_comment", 0) > 20:
            recommendations.append(
                "Review and address TODO/FIXME comments. Consider creating proper "
                "tickets for substantial work items."
            )
        # General recommendations.
        total_files = self.stats.get("files_scanned", 1)
        if len(self.debt_items) / total_files > 2:
            recommendations.append(
                "High debt density detected. Consider establishing coding standards "
                "and regular code review processes to prevent debt accumulation."
            )
        if not recommendations:
            recommendations.append("Code quality looks good! Continue current practices.")
        return recommendations
class PythonASTAnalyzer(ast.NodeVisitor):
    """AST visitor that detects Python-specific debt (length, complexity,
    missing docstrings, parameter counts, oversized classes).

    Fix over the previous version: async functions (``ast.AsyncFunctionDef``)
    are now analyzed and counted as class methods; previously they were
    silently skipped.  Parameter counting now includes positional-only and
    keyword-only parameters as well.
    """

    def __init__(self, config: Dict[str, Any]):
        """Store the scanner config (thresholds) and reset per-file state."""
        self.config = config
        self.debt_items: List[Dict[str, Any]] = []
        self.current_file = ""
        self.lines: List[str] = []
        self.function_stack: List[str] = []

    def analyze(self, tree: ast.AST, file_path: str, lines: List[str]) -> List[Dict[str, Any]]:
        """Walk ``tree`` and return the list of debt items found."""
        self.debt_items = []
        self.current_file = file_path
        self.lines = lines
        self.function_stack = []
        self.visit(tree)
        return self.debt_items

    def visit_FunctionDef(self, node: ast.FunctionDef):
        """Analyze a (sync) function definition."""
        self._check_function(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
        """Analyze an async function definition with the same checks."""
        self._check_function(node)

    def _check_function(self, node):
        """Shared checks for sync and async function definitions."""
        self.function_stack.append(node.name)
        # Function length in source lines (end_lineno is available on 3.8+).
        func_length = node.end_lineno - node.lineno + 1
        if func_length > self.config["max_function_length"]:
            self._add_debt(
                "large_function",
                f"Function '{node.name}' is too long: {func_length} lines",
                node.lineno,
                "medium",
                {"function_name": node.name, "length": func_length}
            )
        # Missing docstring.
        if not ast.get_docstring(node):
            self._add_debt(
                "missing_docstring",
                f"Function '{node.name}' missing docstring",
                node.lineno,
                "low",
                {"function_name": node.name}
            )
        # Cyclomatic complexity.
        complexity = self._calculate_complexity(node)
        if complexity > self.config["max_complexity"]:
            self._add_debt(
                "high_complexity",
                f"Function '{node.name}' has high complexity: {complexity}",
                node.lineno,
                "high",
                {"function_name": node.name, "complexity": complexity}
            )
        # Parameter count, including positional-only and keyword-only args
        # (the previous version counted only node.args.args).
        param_count = (len(node.args.args)
                       + len(node.args.posonlyargs)
                       + len(node.args.kwonlyargs))
        if param_count > 5:
            self._add_debt(
                "too_many_parameters",
                f"Function '{node.name}' has too many parameters: {param_count}",
                node.lineno,
                "medium",
                {"function_name": node.name, "parameter_count": param_count}
            )
        self.generic_visit(node)
        self.function_stack.pop()

    def visit_ClassDef(self, node: ast.ClassDef):
        """Analyze a class definition (docstring, method count)."""
        if not ast.get_docstring(node):
            self._add_debt(
                "missing_docstring",
                f"Class '{node.name}' missing docstring",
                node.lineno,
                "low",
                {"class_name": node.name}
            )
        # Count both sync and async methods defined directly in the class body.
        methods = [n for n in node.body
                   if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))]
        if len(methods) > 20:
            self._add_debt(
                "large_class",
                f"Class '{node.name}' has too many methods: {len(methods)}",
                node.lineno,
                "medium",
                {"class_name": node.name, "method_count": len(methods)}
            )
        self.generic_visit(node)

    def _calculate_complexity(self, node) -> int:
        """Return the cyclomatic complexity of a function node.

        Counts branch points: if/while/for/async-for, exception handlers,
        and each extra operand of a boolean expression.
        """
        complexity = 1  # Base complexity: the single entry path.
        for child in ast.walk(node):
            if isinstance(child, (ast.If, ast.While, ast.For, ast.AsyncFor)):
                complexity += 1
            elif isinstance(child, ast.ExceptHandler):
                complexity += 1
            elif isinstance(child, ast.BoolOp):
                complexity += len(child.values) - 1
        return complexity

    def _add_debt(self, debt_type: str, description: str, line_number: int,
                  severity: str, metadata: Dict[str, Any]):
        """Append a debt item for the current file."""
        item = {
            "id": f"DEBT-{len(self.debt_items) + 1:04d}",
            "type": debt_type,
            "description": description,
            "file_path": self.current_file,
            "line_number": line_number,
            "severity": severity,
            "metadata": metadata,
            "detected_date": datetime.now().isoformat(),
            "status": "identified"
        }
        self.debt_items.append(item)
def format_human_readable_report(report: Dict[str, Any]) -> str:
    """Render a scan report as plain text for terminal display.

    Sections: header with scan metadata, summary statistics, priority
    breakdown, the ten highest-priority debt items, and recommendations.
    """
    out: List[str] = []
    emit = out.append

    # Header block.
    meta = report["scan_metadata"]
    emit("=" * 60)
    emit("TECHNICAL DEBT SCAN REPORT")
    emit("=" * 60)
    emit(f"Directory: {meta['directory']}")
    emit(f"Scan Date: {meta['scan_date']}")
    emit(f"Scanner Version: {meta['scanner_version']}")
    emit("")

    # Summary statistics.
    summary = report["summary"]
    emit("SUMMARY")
    emit("-" * 30)
    emit(f"Files Scanned: {summary['total_files_scanned']}")
    emit(f"Lines Scanned: {summary['total_lines_scanned']:,}")
    emit(f"Total Debt Items: {summary['total_debt_items']}")
    emit(f"Health Score: {summary['health_score']}/100")
    emit(f"Debt Density: {summary['debt_density']} items/file")
    emit("")

    # Count of items per priority bucket.
    emit("PRIORITY BREAKDOWN")
    emit("-" * 30)
    for priority, count in summary["priority_breakdown"].items():
        emit(f"{priority.capitalize()}: {count}")
    emit("")

    # Ten highest-priority items (report items are pre-sorted by score).
    emit("TOP DEBT ITEMS")
    emit("-" * 30)
    for rank, item in enumerate(report["debt_items"][:10], 1):
        emit(f"{rank}. [{item['priority'].upper()}] {item['description']}")
        emit(f"   File: {item['file_path']}")
        if 'line_number' in item:
            emit(f"   Line: {item['line_number']}")
        emit("")

    # Actionable recommendations.
    emit("RECOMMENDATIONS")
    emit("-" * 30)
    for rank, rec in enumerate(report["recommendations"], 1):
        emit(f"{rank}. {rec}")
    emit("")

    return "\n".join(out)
def main():
    """CLI entry point: parse arguments, run a scan, and emit the report(s).

    Exits with status 1 if the config file cannot be loaded or the scan
    itself fails.
    """
    parser = argparse.ArgumentParser(description="Scan codebase for technical debt")
    parser.add_argument("directory", help="Directory to scan")
    parser.add_argument("--config", help="Configuration file (JSON)")
    parser.add_argument("--output", help="Output file path")
    parser.add_argument("--format", choices=["json", "text", "both"],
                        default="both", help="Output format")
    args = parser.parse_args()

    # Optional JSON configuration overrides.
    config = None
    if args.config:
        try:
            with open(args.config, 'r') as f:
                config = json.load(f)
        except Exception as e:
            print(f"Error loading config: {e}")
            sys.exit(1)

    # Run the scan.
    scanner = DebtScanner(config)
    try:
        report = scanner.scan_directory(args.directory)
    except Exception as e:
        print(f"Scan failed: {e}")
        sys.exit(1)

    def _emit(payload: str, ext: str, label: str, banner: str):
        """Write payload to the --output path (suffixed with ext if needed),
        or print it to stdout under a banner."""
        if args.output:
            path = args.output if args.output.endswith(ext) else f"{args.output}{ext}"
            with open(path, 'w') as f:
                f.write(payload)
            print(f"{label} report written to: {path}")
        else:
            print(banner)
            print("=" * 50)
            print(payload)

    # Emit the requested format(s).  default=str keeps non-JSON values
    # (e.g. compiled patterns in the embedded config) serializable.
    if args.format in ["json", "both"]:
        _emit(json.dumps(report, indent=2, default=str), ".json", "JSON", "\nJSON REPORT:")
    if args.format in ["text", "both"]:
        _emit(format_human_readable_report(report), ".txt", "Text", "\nTEXT REPORT:")


if __name__ == "__main__":
    main()