CleanArchitecture-template/.brain/.agent/skills/engineering-advanced-skills/tech-debt-tracker/scripts/debt_scanner.py

#!/usr/bin/env python3
"""
Tech Debt Scanner

Scans a codebase directory for tech debt signals using AST parsing (Python) and
regex patterns (any language). Detects various forms of technical debt and generates
both JSON inventory and human-readable reports.

Usage:
    python debt_scanner.py /path/to/codebase
    python debt_scanner.py /path/to/codebase --config config.json
    python debt_scanner.py /path/to/codebase --output report.json --format both
"""

import argparse
import ast
import fnmatch
import json
import os
import re
import sys
from collections import defaultdict, Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional, Set, Tuple


class DebtScanner:
    """Main scanner class for detecting technical debt in codebases."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = self._load_default_config()
        if config:
            self.config.update(config)

        self.debt_items = []
        self.stats = defaultdict(int)
        self.file_stats = {}

        # Compile regex patterns for performance
        self._compile_patterns()

    def _load_default_config(self) -> Dict[str, Any]:
        """Load default configuration for debt detection."""
        return {
            "max_function_length": 50,
            "max_complexity": 10,
            "max_nesting_depth": 4,
            "max_file_size_lines": 500,
            "min_duplicate_lines": 3,
            "ignore_patterns": [
                "*.pyc", "__pycache__", ".git", ".svn", "node_modules",
                "build", "dist", "*.min.js", "*.map"
            ],
            "file_extensions": {
                "python": [".py"],
                "javascript": [".js", ".jsx", ".ts", ".tsx"],
                "java": [".java"],
                "csharp": [".cs"],
                "cpp": [".cpp", ".cc", ".cxx", ".c", ".h", ".hpp"],
                "ruby": [".rb"],
                "php": [".php"],
                "go": [".go"],
                "rust": [".rs"],
                "kotlin": [".kt"]
            },
            "comment_patterns": {
                "todo": r"(?i)(TODO|FIXME|HACK|XXX|BUG)[\s:]*(.+)",
                "commented_code": r"^\s*#.*[=(){}\[\];].*",
                "magic_numbers": r"\b\d{2,}\b",
                "long_strings": r'["\'](.{100,})["\']'
            },
            "severity_weights": {
                "critical": 10,
                "high": 7,
                "medium": 5,
                "low": 2,
                "info": 1
            }
        }

    def _compile_patterns(self):
        """Compile regex patterns for better performance."""
        self.comment_regexes = {}
        for name, pattern in self.config["comment_patterns"].items():
            self.comment_regexes[name] = re.compile(pattern)

        # Common code smells patterns
        self.smell_patterns = {
            "empty_catch": re.compile(r"except[^:]*:\s*pass\s*$", re.MULTILINE),
            "print_debug": re.compile(r"print\s*\([^)]*debug[^)]*\)", re.IGNORECASE),
            "hardcoded_paths": re.compile(r'["\'][/\\][^"\']*[/\\][^"\']*["\']'),
            "sql_injection_risk": re.compile(r'["\'].*%s.*["\'].*execute', re.IGNORECASE),
        }

    def scan_directory(self, directory: str) -> Dict[str, Any]:
        """
        Scan a directory for tech debt.

        Args:
            directory: Path to the directory to scan

        Returns:
            Dictionary containing debt inventory and statistics
        """
        directory_path = Path(directory)
        if not directory_path.exists():
            raise ValueError(f"Directory does not exist: {directory}")

        print(f"Scanning directory: {directory}")
        print("=" * 50)

        # Reset state
        self.debt_items = []
        self.stats = defaultdict(int)
        self.file_stats = {}

        # Walk through directory
        for root, dirs, files in os.walk(directory):
            # Filter out ignored directories
            dirs[:] = [d for d in dirs if not self._should_ignore(d)]

            for file in files:
                if self._should_ignore(file):
                    continue

                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, directory)

                try:
                    self._scan_file(file_path, relative_path)
                except Exception as e:
                    print(f"Error scanning {relative_path}: {e}")
                    self.stats["scan_errors"] += 1

        # Post-process results
        self._detect_duplicates(directory)
        self._calculate_priorities()

        return self._generate_report(directory)

    def _should_ignore(self, name: str) -> bool:
        """Check if file/directory should be ignored."""
        for pattern in self.config["ignore_patterns"]:
            if "*" in pattern:
                if re.match(pattern.replace("*", ".*"), name):
                    return True
            elif pattern in name:
                return True
        return False

    def _scan_file(self, file_path: str, relative_path: str):
        """Scan a single file for tech debt."""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                lines = content.splitlines()
        except Exception as e:
            print(f"Cannot read {relative_path}: {e}")
            return

        file_ext = Path(file_path).suffix.lower()
        file_info = {
            "path": relative_path,
            "lines": len(lines),
            "size_kb": os.path.getsize(file_path) / 1024,
            "language": self._detect_language(file_ext),
            "debt_count": 0
        }

        self.stats["files_scanned"] += 1
        self.stats["total_lines"] += len(lines)

        # File size debt
        if len(lines) > self.config["max_file_size_lines"]:
            self._add_debt_item(
                "large_file",
                f"File is too large: {len(lines)} lines",
                relative_path,
                "medium",
                {"lines": len(lines), "recommended_max": self.config["max_file_size_lines"]}
            )
            file_info["debt_count"] += 1

        # Language-specific analysis
        if file_info["language"] == "python" and file_ext == ".py":
            self._scan_python_file(relative_path, content, lines)
        else:
            self._scan_generic_file(relative_path, content, lines, file_info["language"])

        # Common patterns for all languages
        self._scan_common_patterns(relative_path, content, lines)

        self.file_stats[relative_path] = file_info

    def _detect_language(self, file_ext: str) -> str:
        """Detect programming language from file extension."""
        for lang, extensions in self.config["file_extensions"].items():
            if file_ext in extensions:
                return lang
        return "unknown"

    def _scan_python_file(self, file_path: str, content: str, lines: List[str]):
        """Scan Python files using AST parsing."""
        try:
            tree = ast.parse(content)
            analyzer = PythonASTAnalyzer(self.config)
            debt_items = analyzer.analyze(tree, file_path, lines)
            self.debt_items.extend(debt_items)
            self.stats["python_files"] += 1
        except SyntaxError as e:
            self._add_debt_item(
                "syntax_error",
                f"Python syntax error: {e}",
                file_path,
                "high",
                {"line": e.lineno, "error": str(e)}
            )

    def _scan_generic_file(self, file_path: str, content: str, lines: List[str], language: str):
        """Scan non-Python files using pattern matching."""
        # Detect long lines
        for i, line in enumerate(lines):
            if len(line) > 120:
                self._add_debt_item(
                    "long_line",
                    f"Line too long: {len(line)} characters",
                    file_path,
                    "low",
                    {"line_number": i + 1, "length": len(line)}
                )

        # Detect deep nesting (approximate)
        for i, line in enumerate(lines):
            indent_level = len(line) - len(line.lstrip())
            if language in ["python"]:
                indent_level = indent_level // 4  # Python uses 4-space indents
            elif language in ["javascript", "java", "csharp", "cpp"]:
                # Count braces for brace-based languages
                brace_level = content[:content.find('\n'.join(lines[:i+1]))].count('{') - content[:content.find('\n'.join(lines[:i+1]))].count('}')
                if brace_level > self.config["max_nesting_depth"]:
                    self._add_debt_item(
                        "deep_nesting",
                        f"Deep nesting detected: {brace_level} levels",
                        file_path,
                        "medium",
                        {"line_number": i + 1, "nesting_level": brace_level}
                    )

    def _scan_common_patterns(self, file_path: str, content: str, lines: List[str]):
        """Scan for common patterns across all file types."""
        # TODO/FIXME comments
        for i, line in enumerate(lines):
            for pattern_name, regex in self.comment_regexes.items():
                match = regex.search(line)
                if match:
                    if pattern_name == "todo":
                        self._add_debt_item(
                            "todo_comment",
                            f"TODO/FIXME comment: {match.group(0)}",
                            file_path,
                            "low",
                            {"line_number": i + 1, "comment": match.group(0).strip()}
                        )

        # Code smells
        for smell_name, pattern in self.smell_patterns.items():
            matches = pattern.finditer(content)
            for match in matches:
                line_num = content[:match.start()].count('\n') + 1
                self._add_debt_item(
                    smell_name,
                    f"Code smell detected: {smell_name}",
                    file_path,
                    "medium",
                    {"line_number": line_num, "pattern": match.group(0)[:100]}
                )

    def _detect_duplicates(self, directory: str):
        """Detect duplicate code blocks across files."""
        # Simple duplicate detection based on exact line matches
        line_hashes = defaultdict(list)

        for file_path, file_info in self.file_stats.items():
            try:
                full_path = os.path.join(directory, file_path)
                with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()

                for i in range(len(lines) - self.config["min_duplicate_lines"] + 1):
                    block = ''.join(lines[i:i + self.config["min_duplicate_lines"]])
                    block_hash = hash(block.strip())
                    if len(block.strip()) > 50:  # Only consider substantial blocks
                        line_hashes[block_hash].append((file_path, i + 1, block))
            except Exception:
                continue

        # Report duplicates
        for block_hash, occurrences in line_hashes.items():
            if len(occurrences) > 1:
                for file_path, line_num, block in occurrences:
                    self._add_debt_item(
                        "duplicate_code",
                        f"Duplicate code block found in {len(occurrences)} files",
                        file_path,
                        "medium",
                        {
                            "line_number": line_num,
                            "duplicate_count": len(occurrences),
                            "other_files": [f[0] for f in occurrences if f[0] != file_path]
                        }
                    )

    def _calculate_priorities(self):
        """Calculate priority scores for debt items."""
        severity_weights = self.config["severity_weights"]

        for item in self.debt_items:
            base_score = severity_weights.get(item["severity"], 1)

            # Adjust based on debt type
            type_multipliers = {
                "syntax_error": 2.0,
                "security_risk": 1.8,
                "large_function": 1.5,
                "high_complexity": 1.4,
                "duplicate_code": 1.3,
                "todo_comment": 0.5
            }

            multiplier = type_multipliers.get(item["type"], 1.0)
            item["priority_score"] = int(base_score * multiplier)

            # Set priority category
            if item["priority_score"] >= 15:
                item["priority"] = "critical"
            elif item["priority_score"] >= 10:
                item["priority"] = "high"
            elif item["priority_score"] >= 5:
                item["priority"] = "medium"
            else:
                item["priority"] = "low"

    def _add_debt_item(self, debt_type: str, description: str, file_path: str,
                      severity: str, metadata: Dict[str, Any]):
        """Add a debt item to the inventory."""
        item = {
            "id": f"DEBT-{len(self.debt_items) + 1:04d}",
            "type": debt_type,
            "description": description,
            "file_path": file_path,
            "severity": severity,
            "metadata": metadata,
            "detected_date": datetime.now().isoformat(),
            "status": "identified"
        }

        self.debt_items.append(item)
        self.stats[f"debt_{debt_type}"] += 1
        self.stats["total_debt_items"] += 1

        if file_path in self.file_stats:
            self.file_stats[file_path]["debt_count"] += 1

    def _generate_report(self, directory: str) -> Dict[str, Any]:
        """Generate the final debt report."""
        # Sort debt items by priority score
        self.debt_items.sort(key=lambda x: x.get("priority_score", 0), reverse=True)

        # Calculate summary statistics
        priority_counts = Counter(item["priority"] for item in self.debt_items)
        type_counts = Counter(item["type"] for item in self.debt_items)

        # Calculate health score (0-100, higher is better)
        total_files = self.stats.get("files_scanned", 1)
        debt_density = len(self.debt_items) / total_files
        health_score = max(0, 100 - (debt_density * 10))

        report = {
            "scan_metadata": {
                "directory": directory,
                "scan_date": datetime.now().isoformat(),
                "scanner_version": "1.0.0",
                "config": self.config
            },
            "summary": {
                "total_files_scanned": self.stats.get("files_scanned", 0),
                "total_lines_scanned": self.stats.get("total_lines", 0),
                "total_debt_items": len(self.debt_items),
                "health_score": round(health_score, 1),
                "debt_density": round(debt_density, 2),
                "priority_breakdown": dict(priority_counts),
                "type_breakdown": dict(type_counts)
            },
            "debt_items": self.debt_items,
            "file_statistics": self.file_stats,
            "recommendations": self._generate_recommendations()
        }

        return report

    def _generate_recommendations(self) -> List[str]:
        """Generate actionable recommendations based on findings."""
        recommendations = []

        # Priority-based recommendations
        high_priority_count = len([item for item in self.debt_items
                                  if item.get("priority") in ["critical", "high"]])

        if high_priority_count > 10:
            recommendations.append(
                f"Address {high_priority_count} high-priority debt items immediately - "
                "they pose significant risk to code quality and maintainability."
            )

        # Type-specific recommendations
        type_counts = Counter(item["type"] for item in self.debt_items)

        if type_counts.get("large_function", 0) > 5:
            recommendations.append(
                "Consider refactoring large functions into smaller, more focused units. "
                "This will improve readability and testability."
            )

        if type_counts.get("duplicate_code", 0) > 3:
            recommendations.append(
                "Extract duplicate code into reusable functions or modules. "
                "This reduces maintenance burden and potential for inconsistent changes."
            )

        if type_counts.get("todo_comment", 0) > 20:
            recommendations.append(
                "Review and address TODO/FIXME comments. Consider creating proper "
                "tickets for substantial work items."
            )

        # General recommendations
        total_files = self.stats.get("files_scanned", 1)
        if len(self.debt_items) / total_files > 2:
            recommendations.append(
                "High debt density detected. Consider establishing coding standards "
                "and regular code review processes to prevent debt accumulation."
            )

        if not recommendations:
            recommendations.append("Code quality looks good! Continue current practices.")

        return recommendations


class PythonASTAnalyzer(ast.NodeVisitor):
    """AST analyzer for Python-specific debt detection.

    Visits a parsed module and records debt items for functions (sync and
    async) that are too long, undocumented, too complex or take too many
    parameters, and for classes that are undocumented or have too many
    methods.
    """

    # Thresholds that are not part of the scanner configuration.
    MAX_PARAMETERS = 5
    MAX_METHODS = 20

    def __init__(self, config: Dict[str, Any]):
        """Store the configuration and start with empty analysis state.

        Args:
            config: Mapping providing ``max_function_length`` and
                ``max_complexity``.
        """
        self.config = config
        self.debt_items = []
        self.current_file = ""
        self.lines = []
        self.function_stack = []

    def analyze(self, tree: ast.AST, file_path: str, lines: List[str]) -> List[Dict[str, Any]]:
        """Analyze a Python AST for tech debt.

        Args:
            tree: Parsed module to walk.
            file_path: Path recorded on every produced item.
            lines: Source lines of the file (stored on the instance).

        Returns:
            The debt items found in this tree; IDs are numbered from
            ``DEBT-0001`` within this single call.
        """
        self.debt_items = []
        self.current_file = file_path
        self.lines = lines
        self.function_stack = []

        self.visit(tree)
        return self.debt_items

    def visit_FunctionDef(self, node: ast.FunctionDef):
        """Run the length, docstring, complexity and parameter checks."""
        self.function_stack.append(node.name)

        # Function length; fall back to a single line when the parser did not
        # attach an end position to the node.
        end_line = getattr(node, "end_lineno", None) or node.lineno
        func_length = end_line - node.lineno + 1
        if func_length > self.config["max_function_length"]:
            self._add_debt(
                "large_function",
                f"Function '{node.name}' is too long: {func_length} lines",
                node.lineno,
                "medium",
                {"function_name": node.name, "length": func_length}
            )

        # Check for missing docstring
        if not ast.get_docstring(node):
            self._add_debt(
                "missing_docstring",
                f"Function '{node.name}' missing docstring",
                node.lineno,
                "low",
                {"function_name": node.name}
            )

        # Calculate cyclomatic complexity
        complexity = self._calculate_complexity(node)
        if complexity > self.config["max_complexity"]:
            self._add_debt(
                "high_complexity",
                f"Function '{node.name}' has high complexity: {complexity}",
                node.lineno,
                "high",
                {"function_name": node.name, "complexity": complexity}
            )

        # Count every named parameter: positional-only, regular and
        # keyword-only (*args / **kwargs are not counted).
        arguments = node.args
        param_count = (
            len(getattr(arguments, "posonlyargs", []))
            + len(arguments.args)
            + len(arguments.kwonlyargs)
        )
        if param_count > self.MAX_PARAMETERS:
            self._add_debt(
                "too_many_parameters",
                f"Function '{node.name}' has too many parameters: {param_count}",
                node.lineno,
                "medium",
                {"function_name": node.name, "parameter_count": param_count}
            )

        # Descend so nested functions and classes are analyzed too.
        self.generic_visit(node)
        self.function_stack.pop()

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
        """Apply the same checks to ``async def`` functions."""
        self.visit_FunctionDef(node)

    def visit_ClassDef(self, node: ast.ClassDef):
        """Run the docstring and method-count checks on a class."""
        # Check for missing docstring
        if not ast.get_docstring(node):
            self._add_debt(
                "missing_docstring",
                f"Class '{node.name}' missing docstring",
                node.lineno,
                "low",
                {"class_name": node.name}
            )

        # Methods defined directly in the class body, sync or async.
        methods = [
            member for member in node.body
            if isinstance(member, (ast.FunctionDef, ast.AsyncFunctionDef))
        ]
        if len(methods) > self.MAX_METHODS:
            self._add_debt(
                "large_class",
                f"Class '{node.name}' has too many methods: {len(methods)}",
                node.lineno,
                "medium",
                {"class_name": node.name, "method_count": len(methods)}
            )

        self.generic_visit(node)

    def _calculate_complexity(self, node: ast.FunctionDef) -> int:
        """Estimate the cyclomatic complexity of a function.

        Starts at 1 and adds one for each ``if``/``while``/``for``/
        ``async for``/``except`` and one per extra operand of a boolean
        operation. The walk covers the whole subtree, so branches inside
        nested functions count toward the enclosing function as well.
        """
        complexity = 1  # Base complexity

        for child in ast.walk(node):
            if isinstance(child, (ast.If, ast.While, ast.For, ast.AsyncFor, ast.ExceptHandler)):
                complexity += 1
            elif isinstance(child, ast.BoolOp):
                complexity += len(child.values) - 1

        return complexity

    def _add_debt(self, debt_type: str, description: str, line_number: int,
                 severity: str, metadata: Dict[str, Any]):
        """Append a debt item for the file currently being analyzed."""
        item = {
            "id": f"DEBT-{len(self.debt_items) + 1:04d}",
            "type": debt_type,
            "description": description,
            "file_path": self.current_file,
            "line_number": line_number,
            "severity": severity,
            "metadata": metadata,
            "detected_date": datetime.now().isoformat(),
            "status": "identified"
        }
        self.debt_items.append(item)


def format_human_readable_report(report: Dict[str, Any]) -> str:
    """Render a scan report dictionary as plain text.

    The output has five sections: header, summary, priority breakdown, the
    first ten debt items, and the recommendations.

    Args:
        report: Report with ``scan_metadata``, ``summary``, ``debt_items``
            and ``recommendations`` entries.

    Returns:
        The report lines joined with newlines.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 30
    meta = report["scan_metadata"]
    totals = report["summary"]

    # Header and summary are fixed-shape, so they are laid out in one literal.
    rows = [
        heavy_rule,
        "TECHNICAL DEBT SCAN REPORT",
        heavy_rule,
        f"Directory: {meta['directory']}",
        f"Scan Date: {meta['scan_date']}",
        f"Scanner Version: {meta['scanner_version']}",
        "",
        "SUMMARY",
        light_rule,
        f"Files Scanned: {totals['total_files_scanned']}",
        f"Lines Scanned: {totals['total_lines_scanned']:,}",
        f"Total Debt Items: {totals['total_debt_items']}",
        f"Health Score: {totals['health_score']}/100",
        f"Debt Density: {totals['debt_density']} items/file",
        "",
        "PRIORITY BREAKDOWN",
        light_rule,
    ]
    rows.extend(
        f"{level.capitalize()}: {amount}"
        for level, amount in totals["priority_breakdown"].items()
    )

    # Only the first ten items are listed, in the order the report holds them.
    rows += ["", "TOP DEBT ITEMS", light_rule]
    for rank, entry in enumerate(report["debt_items"][:10], 1):
        rows.append(f"{rank}. [{entry['priority'].upper()}] {entry['description']}")
        rows.append(f"   File: {entry['file_path']}")
        if 'line_number' in entry:
            rows.append(f"   Line: {entry['line_number']}")
        rows.append("")

    rows += ["RECOMMENDATIONS", light_rule]
    for rank, advice in enumerate(report["recommendations"], 1):
        rows += [f"{rank}. {advice}", ""]

    return "\n".join(rows)


def main():
    """Command-line entry point for the debt scanner.

    Parses the arguments, optionally loads a JSON config file, runs the scan
    and emits the report as JSON and/or text, either to stdout or to files
    derived from ``--output`` (``.json`` / ``.txt`` is appended unless the
    given path already ends with that suffix). Exits with status 1 when the
    config cannot be loaded, the scan fails, or a report cannot be written.
    """
    parser = argparse.ArgumentParser(description="Scan codebase for technical debt")
    parser.add_argument("directory", help="Directory to scan")
    parser.add_argument("--config", help="Configuration file (JSON)")
    parser.add_argument("--output", help="Output file path")
    parser.add_argument("--format", choices=["json", "text", "both"],
                       default="both", help="Output format")

    args = parser.parse_args()

    # Load configuration
    config = None
    if args.config:
        try:
            with open(args.config, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers malformed JSON and undecodable bytes.
            print(f"Error loading config: {e}", file=sys.stderr)
            sys.exit(1)
        if config and not isinstance(config, dict):
            print("Error loading config: top-level JSON value must be an object",
                  file=sys.stderr)
            sys.exit(1)

    # Run scan
    scanner = DebtScanner(config)
    try:
        report = scanner.scan_directory(args.directory)
    except Exception as e:
        # Top-level boundary: report any failure and exit non-zero.
        print(f"Scan failed: {e}", file=sys.stderr)
        sys.exit(1)

    def emit(text: str, suffix: str, label: str):
        """Write one rendering to ``--output`` (with suffix) or to stdout."""
        if not args.output:
            print(f"\n{label.upper()} REPORT:")
            print("=" * 50)
            print(text)
            return
        output_path = args.output if args.output.endswith(suffix) else f"{args.output}{suffix}"
        try:
            # Explicit UTF-8: the text report can quote non-ASCII source
            # comments, which the platform default encoding may reject.
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(text)
        except OSError as e:
            print(f"Error writing {output_path}: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"{label} report written to: {output_path}")

    # Output results
    if args.format in ["json", "both"]:
        emit(json.dumps(report, indent=2, default=str), ".json", "JSON")

    if args.format in ["text", "both"]:
        emit(format_human_readable_report(report), ".txt", "Text")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()