Files
CleanArchitecture-template/.brain/.agent/skills/engineering-advanced-skills/interview-system-designer/hiring_calibrator.py
2026-03-12 15:17:52 +07:00

1306 lines
61 KiB
Python

#!/usr/bin/env python3
"""
Hiring Calibrator
Analyzes interview scores from multiple candidates and interviewers to detect bias,
calibration issues, and inconsistent rubric application. Generates calibration reports
with specific recommendations for interviewer coaching and process improvements.
Usage:
python hiring_calibrator.py --input interview_results.json --analysis-type comprehensive
python hiring_calibrator.py --input data.json --competencies technical,leadership --output report.json
python hiring_calibrator.py --input historical_data.json --trend-analysis --period quarterly
"""
import os
import sys
import json
import argparse
import statistics
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Tuple
from collections import defaultdict, Counter
import math
class HiringCalibrator:
"""Analyzes interview data for bias detection and calibration issues."""
def __init__(self):
self.bias_thresholds = self._init_bias_thresholds()
self.calibration_standards = self._init_calibration_standards()
self.demographic_categories = self._init_demographic_categories()
def _init_bias_thresholds(self) -> Dict[str, float]:
"""Initialize statistical thresholds for bias detection."""
return {
"score_variance_threshold": 1.5, # Standard deviations
"pass_rate_difference_threshold": 0.15, # 15% difference
"interviewer_consistency_threshold": 0.8, # Correlation coefficient
"demographic_parity_threshold": 0.10, # 10% difference
"score_inflation_threshold": 0.3, # 30% above historical average
"score_deflation_threshold": 0.3, # 30% below historical average
"minimum_sample_size": 5 # Minimum candidates per analysis
}
def _init_calibration_standards(self) -> Dict[str, Dict]:
"""Initialize expected calibration standards."""
return {
"score_distribution": {
"target_mean": 2.8, # Expected average score (1-4 scale)
"target_std": 0.9, # Expected standard deviation
"expected_distribution": {
"1": 0.10, # 10% score 1 (does not meet)
"2": 0.25, # 25% score 2 (partially meets)
"3": 0.45, # 45% score 3 (meets expectations)
"4": 0.20 # 20% score 4 (exceeds expectations)
}
},
"interviewer_agreement": {
"minimum_correlation": 0.70, # Minimum correlation between interviewers
"maximum_std_deviation": 0.8, # Maximum std dev in scores for same candidate
"agreement_threshold": 0.75 # % of time interviewers should agree within 1 point
},
"pass_rates": {
"junior_level": 0.25, # 25% pass rate for junior roles
"mid_level": 0.20, # 20% pass rate for mid roles
"senior_level": 0.15, # 15% pass rate for senior roles
"staff_level": 0.10, # 10% pass rate for staff+ roles
"leadership": 0.12 # 12% pass rate for leadership roles
}
}
def _init_demographic_categories(self) -> List[str]:
"""Initialize demographic categories to analyze for bias."""
return [
"gender", "ethnicity", "education_level", "previous_company_size",
"years_experience", "university_tier", "geographic_location"
]
    def analyze_hiring_calibration(self, interview_data: List[Dict[str, Any]],
                                   analysis_type: str = "comprehensive",
                                   competencies: Optional[List[str]] = None,
                                   trend_analysis: bool = False,
                                   period: str = "monthly") -> Dict[str, Any]:
        """Run the requested calibration analyses over raw interview records.

        Args:
            interview_data: Raw interview records; invalid ones are dropped
                during preprocessing.
            analysis_type: "comprehensive", "bias", "calibration",
                "interviewer", or "scoring" — selects which sections run.
            competencies: Optional competency names to focus the bias and
                scoring analyses on; None analyzes all competencies.
            trend_analysis: When True, adds a time-based trend section.
            period: Bucket size for trend analysis ("daily", "weekly",
                "monthly", "quarterly").

        Returns:
            Dict of analysis sections plus recommendations and an overall
            calibration health score, or an error dict when fewer valid
            records than the minimum sample size remain after preprocessing.
        """
        # Validate and preprocess data (invalid records are silently dropped).
        processed_data = self._preprocess_interview_data(interview_data)
        # Bail out early: every downstream statistic needs a minimum sample.
        if len(processed_data) < self.bias_thresholds["minimum_sample_size"]:
            return {
                "error": "Insufficient data for analysis",
                "minimum_required": self.bias_thresholds["minimum_sample_size"],
                "actual_samples": len(processed_data)
            }
        # Base result: metadata plus the dataset summary.
        analysis_results = {
            "analysis_type": analysis_type,
            "data_summary": self._generate_data_summary(processed_data),
            "generated_at": datetime.now().isoformat()
        }
        # Each section runs for "comprehensive" or its own specific type.
        if analysis_type in ["comprehensive", "bias"]:
            analysis_results["bias_analysis"] = self._analyze_bias_patterns(processed_data, competencies)
        if analysis_type in ["comprehensive", "calibration"]:
            analysis_results["calibration_analysis"] = self._analyze_calibration_consistency(processed_data, competencies)
        if analysis_type in ["comprehensive", "interviewer"]:
            analysis_results["interviewer_analysis"] = self._analyze_interviewer_bias(processed_data)
        if analysis_type in ["comprehensive", "scoring"]:
            analysis_results["scoring_analysis"] = self._analyze_scoring_patterns(processed_data, competencies)
        if trend_analysis:
            analysis_results["trend_analysis"] = self._analyze_trends_over_time(processed_data, period)
        # Recommendations and the health score are derived from whichever
        # sections were produced above.
        analysis_results["recommendations"] = self._generate_recommendations(analysis_results)
        analysis_results["calibration_health_score"] = self._calculate_health_score(analysis_results)
        return analysis_results
def _preprocess_interview_data(self, raw_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Clean and validate interview data."""
processed_data = []
for record in raw_data:
if self._validate_interview_record(record):
processed_record = self._standardize_record(record)
processed_data.append(processed_record)
return processed_data
def _validate_interview_record(self, record: Dict[str, Any]) -> bool:
"""Validate that an interview record has required fields."""
required_fields = ["candidate_id", "interviewer_id", "scores", "overall_recommendation", "date"]
for field in required_fields:
if field not in record or record[field] is None:
return False
# Validate scores format
if not isinstance(record["scores"], dict):
return False
# Validate score values are numeric and in valid range (1-4)
for competency, score in record["scores"].items():
if not isinstance(score, (int, float)) or not (1 <= score <= 4):
return False
return True
    def _standardize_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of *record* with derived fields added.

        Adds ``average_score``, a binary ``hire_decision``, a parsed ``date``,
        placeholder demographic fields, and a ``normalized_level`` bucket.
        """
        standardized = record.copy()
        # Mean of all competency scores. NOTE(review): an empty scores dict
        # would make mean() raise — validation currently allows it; confirm.
        scores = list(record["scores"].values())
        standardized["average_score"] = statistics.mean(scores)
        # Collapse the recommendation to a binary decision; anything outside
        # the known positive labels counts as a no-hire.
        recommendation = record["overall_recommendation"].lower()
        standardized["hire_decision"] = recommendation in ["hire", "strong hire", "yes"]
        # Parse string dates as ISO-8601; a trailing "Z" is mapped to +00:00
        # because fromisoformat on older Pythons rejects the "Z" suffix.
        if isinstance(record["date"], str):
            try:
                standardized["date"] = datetime.fromisoformat(record["date"].replace("Z", "+00:00"))
            except ValueError:
                # NOTE(review): unparseable dates silently become "now", which
                # can skew trend analysis — confirm this fallback is intended.
                standardized["date"] = datetime.now()
        # Fill missing demographic axes with "unknown" so downstream grouping
        # can detect and skip them.
        for category in self.demographic_categories:
            if category not in standardized:
                standardized[category] = "unknown"
        # Bucket the role title into a normalized seniority level by keyword;
        # anything unmatched defaults to "mid".
        role = record.get("role", "").lower()
        if any(level in role for level in ["junior", "associate", "entry"]):
            standardized["normalized_level"] = "junior"
        elif any(level in role for level in ["senior", "sr"]):
            standardized["normalized_level"] = "senior"
        elif any(level in role for level in ["staff", "principal", "lead"]):
            standardized["normalized_level"] = "staff"
        else:
            standardized["normalized_level"] = "mid"
        return standardized
    def _generate_data_summary(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate summary statistics for the dataset.

        Returns an empty dict for empty input; otherwise counts, the covered
        date range, score statistics, overall hire rate, and role distribution.
        """
        if not data:
            return {}
        total_candidates = len(data)
        unique_interviewers = len(set(record["interviewer_id"] for record in data))
        # Flatten scores / decisions across all records.
        all_scores = []
        all_average_scores = []
        hire_decisions = []
        for record in data:
            all_scores.extend(record["scores"].values())
            all_average_scores.append(record["average_score"])
            hire_decisions.append(record["hire_decision"])
        # Date range covered by the records (preprocessing always sets date).
        dates = [record["date"] for record in data if record["date"]]
        date_range = {
            "start_date": min(dates).isoformat() if dates else None,
            "end_date": max(dates).isoformat() if dates else None,
            "total_days": (max(dates) - min(dates)).days if len(dates) > 1 else 0
        }
        # Role distribution by raw role string.
        roles = [record.get("role", "unknown") for record in data]
        role_distribution = dict(Counter(roles))
        return {
            "total_candidates": total_candidates,
            "unique_interviewers": unique_interviewers,
            "candidates_per_interviewer": round(total_candidates / unique_interviewers, 2),
            "date_range": date_range,
            "score_statistics": {
                # stdev needs at least two samples; fall back to 0 otherwise.
                "mean_individual_scores": round(statistics.mean(all_scores), 2),
                "std_individual_scores": round(statistics.stdev(all_scores) if len(all_scores) > 1 else 0, 2),
                "mean_average_scores": round(statistics.mean(all_average_scores), 2),
                "std_average_scores": round(statistics.stdev(all_average_scores) if len(all_average_scores) > 1 else 0, 2)
            },
            "hire_rate": round(sum(hire_decisions) / len(hire_decisions), 3),
            "role_distribution": role_distribution
        }
    def _analyze_bias_patterns(self, data: List[Dict[str, Any]],
                               target_competencies: Optional[List[str]]) -> Dict[str, Any]:
        """Analyze potential bias patterns in interview decisions.

        Combines demographic, interviewer, and (optionally) per-competency
        bias checks into one report with an aggregate 0-1 bias score.
        """
        bias_analysis = {
            "demographic_bias": {},
            "interviewer_bias": {},
            "competency_bias": {},
            "overall_bias_score": 0
        }
        # Demographic bias: skip axes where every record is "unknown";
        # only axes with detected bias are kept in the report.
        for demographic in self.demographic_categories:
            if all(record.get(demographic) == "unknown" for record in data):
                continue
            demographic_analysis = self._analyze_demographic_bias(data, demographic)
            if demographic_analysis["bias_detected"]:
                bias_analysis["demographic_bias"][demographic] = demographic_analysis
        # Interviewer bias is always analyzed.
        bias_analysis["interviewer_bias"] = self._analyze_interviewer_bias(data)
        # Competency bias only when specific competencies were requested.
        if target_competencies:
            bias_analysis["competency_bias"] = self._analyze_competency_bias(data, target_competencies)
        # Aggregate everything found above into a single 0-1 score.
        bias_analysis["overall_bias_score"] = self._calculate_bias_score(bias_analysis)
        return bias_analysis
    def _analyze_demographic_bias(self, data: List[Dict[str, Any]],
                                  demographic: str) -> Dict[str, Any]:
        """Analyze bias for a specific demographic category.

        Groups candidates by their value for *demographic*, then compares
        hire rates and mean scores across groups with adequate sample sizes.
        """
        # Group records by demographic value, ignoring unknowns.
        demographic_groups = defaultdict(list)
        for record in data:
            demo_value = record.get(demographic, "unknown")
            if demo_value != "unknown":
                demographic_groups[demo_value].append(record)
        if len(demographic_groups) < 2:
            # Nothing to compare against.
            return {"bias_detected": False, "reason": "insufficient_groups"}
        # Per-group statistics, restricted to groups with enough candidates.
        group_stats = {}
        for group, records in demographic_groups.items():
            if len(records) >= self.bias_thresholds["minimum_sample_size"]:
                scores = [r["average_score"] for r in records]
                hire_rate = sum(r["hire_decision"] for r in records) / len(records)
                group_stats[group] = {
                    "count": len(records),
                    "mean_score": statistics.mean(scores),
                    "hire_rate": hire_rate,
                    "std_score": statistics.stdev(scores) if len(scores) > 1 else 0
                }
        if len(group_stats) < 2:
            return {"bias_detected": False, "reason": "insufficient_sample_sizes"}
        # Compare the best- and worst-treated groups. NOTE: this is a simple
        # max-vs-min gap check against thresholds, not a significance test.
        bias_detected = False
        bias_details = {}
        # Hire-rate disparity across groups.
        hire_rates = [stats["hire_rate"] for stats in group_stats.values()]
        max_hire_rate_diff = max(hire_rates) - min(hire_rates)
        if max_hire_rate_diff > self.bias_thresholds["demographic_parity_threshold"]:
            bias_detected = True
            bias_details["hire_rate_disparity"] = {
                "max_difference": round(max_hire_rate_diff, 3),
                "threshold": self.bias_thresholds["demographic_parity_threshold"],
                "group_stats": group_stats
            }
        # Mean-score disparity across groups.
        mean_scores = [stats["mean_score"] for stats in group_stats.values()]
        max_score_diff = max(mean_scores) - min(mean_scores)
        if max_score_diff > 0.5:  # Half point difference threshold
            bias_detected = True
            bias_details["scoring_disparity"] = {
                "max_difference": round(max_score_diff, 3),
                "group_stats": group_stats
            }
        return {
            "bias_detected": bias_detected,
            "demographic": demographic,
            "group_statistics": group_stats,
            "bias_details": bias_details,
            "recommendation": self._generate_demographic_bias_recommendation(demographic, bias_details) if bias_detected else None
        }
def _analyze_interviewer_bias(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Analyze bias patterns across different interviewers."""
interviewer_stats = defaultdict(list)
# Group by interviewer
for record in data:
interviewer_id = record["interviewer_id"]
interviewer_stats[interviewer_id].append(record)
# Calculate statistics per interviewer
interviewer_analysis = {}
for interviewer_id, records in interviewer_stats.items():
if len(records) >= self.bias_thresholds["minimum_sample_size"]:
scores = [r["average_score"] for r in records]
hire_rate = sum(r["hire_decision"] for r in records) / len(records)
interviewer_analysis[interviewer_id] = {
"total_interviews": len(records),
"mean_score": statistics.mean(scores),
"std_score": statistics.stdev(scores) if len(scores) > 1 else 0,
"hire_rate": hire_rate,
"score_inflation": self._detect_score_inflation(scores),
"consistency_score": self._calculate_interviewer_consistency(records)
}
# Identify outlier interviewers
if len(interviewer_analysis) > 1:
overall_mean_score = statistics.mean([stats["mean_score"] for stats in interviewer_analysis.values()])
overall_hire_rate = statistics.mean([stats["hire_rate"] for stats in interviewer_analysis.values()])
outlier_interviewers = {}
for interviewer_id, stats in interviewer_analysis.items():
issues = []
# Check for score inflation/deflation
if stats["mean_score"] > overall_mean_score * (1 + self.bias_thresholds["score_inflation_threshold"]):
issues.append("score_inflation")
elif stats["mean_score"] < overall_mean_score * (1 - self.bias_thresholds["score_deflation_threshold"]):
issues.append("score_deflation")
# Check for hire rate deviation
hire_rate_diff = abs(stats["hire_rate"] - overall_hire_rate)
if hire_rate_diff > self.bias_thresholds["pass_rate_difference_threshold"]:
issues.append("hire_rate_deviation")
# Check for low consistency
if stats["consistency_score"] < self.bias_thresholds["interviewer_consistency_threshold"]:
issues.append("low_consistency")
if issues:
outlier_interviewers[interviewer_id] = {
"issues": issues,
"statistics": stats,
"severity": len(issues) # More issues = higher severity
}
return {
"interviewer_statistics": interviewer_analysis,
"outlier_interviewers": outlier_interviewers if len(interviewer_analysis) > 1 else {},
"overall_consistency": self._calculate_overall_interviewer_consistency(data),
"recommendations": self._generate_interviewer_recommendations(outlier_interviewers if len(interviewer_analysis) > 1 else {})
}
    def _analyze_competency_bias(self, data: List[Dict[str, Any]],
                                 competencies: List[str]) -> Dict[str, Any]:
        """Analyze bias patterns within specific competencies.

        For each requested competency, measures overall score variance and
        flags interviewers whose mean score for it deviates notably from the
        cross-interviewer average.
        """
        competency_analysis = {}
        for competency in competencies:
            # Collect this competency's scores (records may omit it).
            competency_scores = []
            for record in data:
                if competency in record["scores"]:
                    competency_scores.append({
                        "score": record["scores"][competency],
                        "interviewer": record["interviewer_id"],
                        "candidate": record["candidate_id"],
                        "overall_decision": record["hire_decision"]
                    })
            # Skip competencies without enough data points.
            if len(competency_scores) < self.bias_thresholds["minimum_sample_size"]:
                continue
            # Overall spread of scores for this competency.
            scores = [item["score"] for item in competency_scores]
            score_variance = statistics.variance(scores) if len(scores) > 1 else 0
            # Group scores by interviewer.
            interviewer_competency_scores = defaultdict(list)
            for item in competency_scores:
                interviewer_competency_scores[item["interviewer"]].append(item["score"])
            interviewer_variations = {}
            if len(interviewer_competency_scores) > 1:
                # Only interviewers with 3+ scores for this competency count.
                interviewer_means = {interviewer: statistics.mean(scores)
                                     for interviewer, scores in interviewer_competency_scores.items()
                                     if len(scores) >= 3}
                if len(interviewer_means) > 1:
                    mean_of_means = statistics.mean(interviewer_means.values())
                    for interviewer, mean_score in interviewer_means.items():
                        deviation = abs(mean_score - mean_of_means)
                        if deviation > 0.5:  # More than half point deviation
                            interviewer_variations[interviewer] = {
                                "mean_score": round(mean_score, 2),
                                "deviation_from_average": round(deviation, 2),
                                "sample_size": len(interviewer_competency_scores[interviewer])
                            }
            competency_analysis[competency] = {
                "total_scores": len(competency_scores),
                "mean_score": round(statistics.mean(scores), 2),
                "score_variance": round(score_variance, 2),
                "interviewer_variations": interviewer_variations,
                "bias_detected": len(interviewer_variations) > 0
            }
        return competency_analysis
    def _analyze_calibration_consistency(self, data: List[Dict[str, Any]],
                                         target_competencies: Optional[List[str]]) -> Dict[str, Any]:
        """Analyze calibration consistency across interviews.

        Uses candidates interviewed by multiple people to measure how closely
        interviewers agree; falls back to a single-interviewer analysis when
        no candidate has more than one interview record.

        Note: ``target_competencies`` is accepted for interface symmetry with
        the other analyses but is not currently used in this method.
        """
        # Group records by candidate to find multi-interviewer candidates.
        candidate_interviewers = defaultdict(list)
        for record in data:
            candidate_interviewers[record["candidate_id"]].append(record)
        multi_interviewer_candidates = {
            candidate: records for candidate, records in candidate_interviewers.items()
            if len(records) > 1
        }
        if not multi_interviewer_candidates:
            return {
                "error": "No candidates with multiple interviewers found",
                "single_interviewer_analysis": self._analyze_single_interviewer_consistency(data)
            }
        # Agreement statistics per shared candidate.
        agreement_stats = []
        score_correlations = []
        for candidate, records in multi_interviewer_candidates.items():
            candidate_scores = []
            interviewer_pairs = []
            for record in records:
                avg_score = record["average_score"]
                candidate_scores.append(avg_score)
                interviewer_pairs.append(record["interviewer_id"])
            if len(candidate_scores) > 1:
                # Spread of this candidate's scores across interviewers.
                score_std = statistics.stdev(candidate_scores)
                agreement_stats.append(score_std)
                # Did every interviewer land within 1 point of the others?
                score_range = max(candidate_scores) - min(candidate_scores)
                agreement_within_one = score_range <= 1.0
                score_correlations.append({
                    "candidate": candidate,
                    "scores": candidate_scores,
                    "interviewers": interviewer_pairs,
                    "score_std": score_std,
                    "score_range": score_range,
                    "agreement_within_one": agreement_within_one
                })
        # Aggregate calibration metrics across all shared candidates.
        mean_score_std = statistics.mean(agreement_stats) if agreement_stats else 0
        agreement_rate = sum(1 for corr in score_correlations if corr["agreement_within_one"]) / len(score_correlations) if score_correlations else 0
        # Grade quality against the configured agreement standards
        # (excess std-dev outranks a merely low agreement rate).
        calibration_quality = "good"
        if mean_score_std > self.calibration_standards["interviewer_agreement"]["maximum_std_deviation"]:
            calibration_quality = "poor"
        elif agreement_rate < self.calibration_standards["interviewer_agreement"]["agreement_threshold"]:
            calibration_quality = "fair"
        return {
            "multi_interviewer_candidates": len(multi_interviewer_candidates),
            "mean_score_standard_deviation": round(mean_score_std, 3),
            "agreement_within_one_point_rate": round(agreement_rate, 3),
            "calibration_quality": calibration_quality,
            "candidate_agreement_details": score_correlations,
            "target_standards": self.calibration_standards["interviewer_agreement"],
            "recommendations": self._generate_calibration_recommendations(mean_score_std, agreement_rate)
        }
    def _analyze_scoring_patterns(self, data: List[Dict[str, Any]],
                                  target_competencies: Optional[List[str]]) -> Dict[str, Any]:
        """Analyze overall scoring patterns and distributions.

        Compares the observed score distribution against the expected one and
        actual pass rates per level against configured targets. When
        *target_competencies* is given, only those competencies' scores are
        included in the distribution.
        """
        # Collect individual scores and the per-score histogram.
        all_individual_scores = []
        all_average_scores = []
        score_distribution = defaultdict(int)
        for record in data:
            avg_score = record["average_score"]
            all_average_scores.append(avg_score)
            for competency, score in record["scores"].items():
                if not target_competencies or competency in target_competencies:
                    all_individual_scores.append(score)
                    # Histogram buckets by integer score (fractions truncate).
                    score_distribution[str(int(score))] += 1
        # Convert histogram counts to fractions of the total.
        total_scores = sum(score_distribution.values())
        score_percentages = {score: count/total_scores for score, count in score_distribution.items()}
        # Compare actual vs. expected fraction for each rubric score 1-4.
        expected_dist = self.calibration_standards["score_distribution"]["expected_distribution"]
        distribution_analysis = {}
        for score in ["1", "2", "3", "4"]:
            expected_pct = expected_dist.get(score, 0)
            actual_pct = score_percentages.get(score, 0)
            difference = actual_pct - expected_pct
            distribution_analysis[score] = {
                "expected_percentage": expected_pct,
                "actual_percentage": round(actual_pct, 3),
                "difference": round(difference, 3),
                "significant_deviation": abs(difference) > 0.05  # 5% threshold
            }
        # Aggregate scoring statistics vs. configured targets.
        mean_score = statistics.mean(all_individual_scores) if all_individual_scores else 0
        std_score = statistics.stdev(all_individual_scores) if len(all_individual_scores) > 1 else 0
        target_mean = self.calibration_standards["score_distribution"]["target_mean"]
        target_std = self.calibration_standards["score_distribution"]["target_std"]
        # Pass rates per normalized seniority level.
        level_pass_rates = {}
        level_groups = defaultdict(list)
        for record in data:
            level = record.get("normalized_level", "unknown")
            level_groups[level].append(record["hire_decision"])
        for level, decisions in level_groups.items():
            if len(decisions) >= self.bias_thresholds["minimum_sample_size"]:
                pass_rate = sum(decisions) / len(decisions)
                # Targets are keyed "<level>_level"; unmatched levels fall
                # back to a 0.15 default.
                expected_rate = self.calibration_standards["pass_rates"].get(f"{level}_level", 0.15)
                level_pass_rates[level] = {
                    "actual_pass_rate": round(pass_rate, 3),
                    "expected_pass_rate": expected_rate,
                    "difference": round(pass_rate - expected_rate, 3),
                    "sample_size": len(decisions)
                }
        return {
            "score_statistics": {
                "mean_score": round(mean_score, 2),
                "std_score": round(std_score, 2),
                "target_mean": target_mean,
                "target_std": target_std,
                "mean_deviation": round(abs(mean_score - target_mean), 2),
                "std_deviation": round(abs(std_score - target_std), 2)
            },
            "score_distribution": distribution_analysis,
            "level_pass_rates": level_pass_rates,
            "overall_assessment": self._assess_scoring_health(distribution_analysis, mean_score, target_mean)
        }
    def _analyze_trends_over_time(self, data: List[Dict[str, Any]], period: str) -> Dict[str, Any]:
        """Analyze trends in hiring patterns over time.

        Buckets records by *period* ("daily", "weekly", "monthly", or
        "quarterly"), computes per-bucket score and hire-rate metrics, and
        fits a simple least-squares linear trend to each series.
        """
        # Keep only dated records, in chronological order.
        dated_data = [record for record in data if record.get("date")]
        dated_data.sort(key=lambda x: x["date"])
        if len(dated_data) < 10:  # Need minimum data for trend analysis
            return {"error": "Insufficient data for trend analysis", "minimum_required": 10}
        # Bucket records by the requested time period.
        period_groups = defaultdict(list)
        for record in dated_data:
            date = record["date"]
            if period == "weekly":
                period_key = date.strftime("%Y-W%U")
            elif period == "monthly":
                period_key = date.strftime("%Y-%m")
            elif period == "quarterly":
                quarter = (date.month - 1) // 3 + 1
                period_key = f"{date.year}-Q{quarter}"
            else:  # daily (also the fallback for unknown period values)
                period_key = date.strftime("%Y-%m-%d")
            period_groups[period_key].append(record)
        # Metrics per period, skipping sparse buckets.
        period_metrics = {}
        for period_key, records in period_groups.items():
            if len(records) >= 3:  # Minimum for meaningful metrics
                scores = [r["average_score"] for r in records]
                hire_rate = sum(r["hire_decision"] for r in records) / len(records)
                period_metrics[period_key] = {
                    "count": len(records),
                    "mean_score": statistics.mean(scores),
                    "hire_rate": hire_rate,
                    "std_score": statistics.stdev(scores) if len(scores) > 1 else 0
                }
        if len(period_metrics) < 3:
            return {"error": "Insufficient periods for trend analysis"}
        # Build chronological series (all key formats sort chronologically
        # as plain strings).
        sorted_periods = sorted(period_metrics.keys())
        mean_scores = [period_metrics[p]["mean_score"] for p in sorted_periods]
        hire_rates = [period_metrics[p]["hire_rate"] for p in sorted_periods]
        # Least-squares slope per series (see _calculate_linear_trend).
        score_trend = self._calculate_linear_trend(mean_scores)
        hire_rate_trend = self._calculate_linear_trend(hire_rates)
        return {
            "period": period,
            "total_periods": len(period_metrics),
            "period_metrics": period_metrics,
            "trends": {
                "score_trend": {
                    "direction": "increasing" if score_trend > 0.01 else "decreasing" if score_trend < -0.01 else "stable",
                    "slope": round(score_trend, 4),
                    "significance": "significant" if abs(score_trend) > 0.05 else "minor"
                },
                "hire_rate_trend": {
                    "direction": "increasing" if hire_rate_trend > 0.005 else "decreasing" if hire_rate_trend < -0.005 else "stable",
                    "slope": round(hire_rate_trend, 4),
                    "significance": "significant" if abs(hire_rate_trend) > 0.02 else "minor"
                }
            },
            "insights": self._generate_trend_insights(score_trend, hire_rate_trend, period_metrics)
        }
def _calculate_linear_trend(self, values: List[float]) -> float:
"""Calculate simple linear trend slope."""
if len(values) < 2:
return 0
n = len(values)
x = list(range(n))
# Calculate slope using least squares
x_mean = statistics.mean(x)
y_mean = statistics.mean(values)
numerator = sum((x[i] - x_mean) * (values[i] - y_mean) for i in range(n))
denominator = sum((x[i] - x_mean) ** 2 for i in range(n))
return numerator / denominator if denominator != 0 else 0
def _detect_score_inflation(self, scores: List[float]) -> Dict[str, Any]:
"""Detect if an interviewer shows score inflation patterns."""
if len(scores) < 5:
return {"insufficient_data": True}
mean_score = statistics.mean(scores)
std_score = statistics.stdev(scores)
# Check against expected mean (2.8)
expected_mean = self.calibration_standards["score_distribution"]["target_mean"]
deviation = mean_score - expected_mean
# High scores with low variance might indicate inflation
high_scores_low_variance = mean_score > 3.2 and std_score < 0.5
# Check distribution - too many 4s might indicate inflation
score_counts = Counter([int(score) for score in scores])
four_count_ratio = score_counts.get(4, 0) / len(scores)
return {
"mean_score": round(mean_score, 2),
"expected_mean": expected_mean,
"deviation": round(deviation, 2),
"high_scores_low_variance": high_scores_low_variance,
"four_count_ratio": round(four_count_ratio, 2),
"inflation_detected": deviation > 0.3 or high_scores_low_variance or four_count_ratio > 0.4
}
def _calculate_interviewer_consistency(self, records: List[Dict[str, Any]]) -> float:
"""Calculate consistency score for an interviewer."""
if len(records) < 3:
return 0.5 # Neutral score for insufficient data
# Look at variance in scoring
avg_scores = [r["average_score"] for r in records]
score_variance = statistics.variance(avg_scores)
# Look at decision consistency relative to scores
decisions = [r["hire_decision"] for r in records]
scores_of_hires = [r["average_score"] for r in records if r["hire_decision"]]
scores_of_no_hires = [r["average_score"] for r in records if not r["hire_decision"]]
# Good consistency means hires have higher average scores
decision_consistency = 0.5
if scores_of_hires and scores_of_no_hires:
hire_mean = statistics.mean(scores_of_hires)
no_hire_mean = statistics.mean(scores_of_no_hires)
score_gap = hire_mean - no_hire_mean
decision_consistency = min(1.0, max(0.0, score_gap / 2.0)) # Normalize to 0-1
# Combine metrics (lower variance = higher consistency)
variance_consistency = max(0.0, 1.0 - (score_variance / 2.0))
return (decision_consistency + variance_consistency) / 2
def _calculate_overall_interviewer_consistency(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Calculate overall consistency across all interviewers."""
interviewer_consistency_scores = []
interviewer_records = defaultdict(list)
for record in data:
interviewer_records[record["interviewer_id"]].append(record)
for interviewer_id, records in interviewer_records.items():
if len(records) >= 3:
consistency = self._calculate_interviewer_consistency(records)
interviewer_consistency_scores.append(consistency)
if not interviewer_consistency_scores:
return {"error": "Insufficient data per interviewer for consistency analysis"}
return {
"mean_consistency": round(statistics.mean(interviewer_consistency_scores), 3),
"std_consistency": round(statistics.stdev(interviewer_consistency_scores) if len(interviewer_consistency_scores) > 1 else 0, 3),
"min_consistency": round(min(interviewer_consistency_scores), 3),
"max_consistency": round(max(interviewer_consistency_scores), 3),
"interviewers_analyzed": len(interviewer_consistency_scores),
"target_threshold": self.bias_thresholds["interviewer_consistency_threshold"]
}
def _calculate_bias_score(self, bias_analysis: Dict[str, Any]) -> float:
"""Calculate overall bias score (0-1, where 1 is most biased)."""
bias_factors = []
# Demographic bias factors
demographic_bias = bias_analysis.get("demographic_bias", {})
for demo, analysis in demographic_bias.items():
if analysis.get("bias_detected"):
bias_factors.append(0.3) # Each demographic bias adds 0.3
# Interviewer bias factors
interviewer_bias = bias_analysis.get("interviewer_bias", {})
outlier_interviewers = interviewer_bias.get("outlier_interviewers", {})
if outlier_interviewers:
# Scale by severity and number of outliers
total_severity = sum(info["severity"] for info in outlier_interviewers.values())
bias_factors.append(min(0.5, total_severity * 0.1))
# Competency bias factors
competency_bias = bias_analysis.get("competency_bias", {})
for comp, analysis in competency_bias.items():
if analysis.get("bias_detected"):
bias_factors.append(0.2) # Each competency bias adds 0.2
return min(1.0, sum(bias_factors))
    def _calculate_health_score(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Calculate overall calibration health score.

        Aggregates up to four component scores (bias, calibration quality,
        interviewer consistency, scoring patterns) into a weighted average in
        [0, 1] plus a categorical label; components missing from *analysis*
        are excluded and their weight is not counted.
        """
        # Each entry is (component name, score in [0, 1], weight).
        health_factors = []
        # Bias score (lower raw score is better, so invert it).
        bias_analysis = analysis.get("bias_analysis", {})
        bias_score = bias_analysis.get("overall_bias_score", 0)
        bias_health = max(0, 1 - bias_score)
        health_factors.append(("bias", bias_health, 0.3))
        # Calibration quality maps to a fixed score via quality_map.
        calibration_analysis = analysis.get("calibration_analysis", {})
        if "calibration_quality" in calibration_analysis:
            quality_map = {"good": 1.0, "fair": 0.7, "poor": 0.3}
            calibration_health = quality_map.get(calibration_analysis["calibration_quality"], 0.5)
            health_factors.append(("calibration", calibration_health, 0.25))
        # Interviewer consistency is used directly as a 0-1 score.
        interviewer_analysis = analysis.get("interviewer_analysis", {})
        overall_consistency = interviewer_analysis.get("overall_consistency", {})
        if "mean_consistency" in overall_consistency:
            consistency_health = overall_consistency["mean_consistency"]
            health_factors.append(("interviewer_consistency", consistency_health, 0.25))
        # Scoring-pattern health from the categorical assessment.
        scoring_analysis = analysis.get("scoring_analysis", {})
        if "overall_assessment" in scoring_analysis:
            assessment_map = {"healthy": 1.0, "concerning": 0.6, "poor": 0.2}
            scoring_health = assessment_map.get(scoring_analysis["overall_assessment"], 0.5)
            health_factors.append(("scoring_patterns", scoring_health, 0.2))
        # Weighted average over whichever components are present.
        if health_factors:
            weighted_sum = sum(score * weight for _, score, weight in health_factors)
            total_weight = sum(weight for _, _, weight in health_factors)
            overall_score = weighted_sum / total_weight
        else:
            overall_score = 0.5  # Neutral if no data
        # Map the numeric score to a categorical label.
        if overall_score >= 0.8:
            health_category = "excellent"
        elif overall_score >= 0.7:
            health_category = "good"
        elif overall_score >= 0.5:
            health_category = "fair"
        else:
            health_category = "poor"
        return {
            "overall_score": round(overall_score, 3),
            "health_category": health_category,
            "component_scores": {name: round(score, 3) for name, score, _ in health_factors},
            "improvement_priority": self._identify_improvement_priorities(health_factors)
        }
def _identify_improvement_priorities(self, health_factors: List[Tuple[str, float, float]]) -> List[str]:
"""Identify areas that need the most improvement."""
priorities = []
for name, score, weight in health_factors:
impact = (1 - score) * weight # Low scores with high weights = high priority
if impact > 0.15: # Significant impact threshold
priorities.append(name)
# Sort by impact (highest first)
priorities.sort(key=lambda name: next((1 - score) * weight for n, score, weight in health_factors if n == name), reverse=True)
return priorities
def _generate_recommendations(self, analysis: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Generate actionable recommendations based on analysis results."""
recommendations = []
# Bias-related recommendations
bias_analysis = analysis.get("bias_analysis", {})
# Demographic bias recommendations
for demo, demo_analysis in bias_analysis.get("demographic_bias", {}).items():
if demo_analysis.get("bias_detected"):
recommendations.append({
"priority": "high",
"category": "bias_mitigation",
"title": f"Address {demo.replace('_', ' ').title()} Bias",
"description": demo_analysis.get("recommendation", f"Implement bias mitigation strategies for {demo}"),
"actions": [
"Conduct unconscious bias training focused on this demographic",
"Review and standardize interview questions",
"Implement diverse interview panels",
"Monitor hiring metrics by demographic group"
]
})
# Interviewer-specific recommendations
interviewer_analysis = bias_analysis.get("interviewer_bias", {})
outlier_interviewers = interviewer_analysis.get("outlier_interviewers", {})
for interviewer_id, outlier_info in outlier_interviewers.items():
issues = outlier_info["issues"]
priority = "high" if outlier_info["severity"] >= 3 else "medium"
actions = []
if "score_inflation" in issues:
actions.extend([
"Provide calibration training on scoring standards",
"Shadow experienced interviewers for recalibration",
"Review examples of each score level"
])
if "score_deflation" in issues:
actions.extend([
"Review expectations for role level",
"Calibrate against recent successful hires",
"Discuss evaluation criteria with hiring manager"
])
if "hire_rate_deviation" in issues:
actions.extend([
"Review hiring bar standards",
"Participate in calibration sessions",
"Compare decision criteria with team"
])
if "low_consistency" in issues:
actions.extend([
"Practice structured interviewing techniques",
"Use standardized scorecards",
"Document specific examples for each score"
])
recommendations.append({
"priority": priority,
"category": "interviewer_coaching",
"title": f"Coach Interviewer {interviewer_id}",
"description": f"Address issues: {', '.join(issues)}",
"actions": list(set(actions)) # Remove duplicates
})
# Calibration recommendations
calibration_analysis = analysis.get("calibration_analysis", {})
if calibration_analysis.get("calibration_quality") in ["fair", "poor"]:
recommendations.append({
"priority": "high",
"category": "calibration_improvement",
"title": "Improve Interview Calibration",
"description": f"Current calibration quality: {calibration_analysis.get('calibration_quality')}",
"actions": [
"Conduct monthly calibration sessions",
"Create shared examples of good/poor answers",
"Implement mandatory interviewer shadowing",
"Standardize scoring rubrics across all interviewers",
"Review and align on role expectations"
]
})
# Scoring pattern recommendations
scoring_analysis = analysis.get("scoring_analysis", {})
if scoring_analysis.get("overall_assessment") in ["concerning", "poor"]:
recommendations.append({
"priority": "medium",
"category": "scoring_standards",
"title": "Adjust Scoring Standards",
"description": "Scoring patterns deviate significantly from expected distribution",
"actions": [
"Review and communicate target score distributions",
"Provide examples for each score level",
"Monitor pass rates by role level",
"Adjust hiring bar if consistently too high/low"
]
})
# Health score recommendations
health_score = analysis.get("calibration_health_score", {})
priorities = health_score.get("improvement_priority", [])
if "bias" in priorities:
recommendations.append({
"priority": "critical",
"category": "bias_mitigation",
"title": "Implement Comprehensive Bias Mitigation",
"description": "Multiple bias indicators detected across the hiring process",
"actions": [
"Mandatory unconscious bias training for all interviewers",
"Implement structured interview protocols",
"Diversify interview panels",
"Regular bias audits and monitoring",
"Create accountability metrics for fair hiring"
]
})
# Sort by priority
priority_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
recommendations.sort(key=lambda x: priority_order.get(x["priority"], 3))
return recommendations
def _generate_demographic_bias_recommendation(self, demographic: str, bias_details: Dict[str, Any]) -> str:
"""Generate specific recommendation for demographic bias."""
if "hire_rate_disparity" in bias_details:
return f"Significant hire rate disparity detected for {demographic}. Implement structured interviews and diverse panels."
elif "scoring_disparity" in bias_details:
return f"Scoring disparity detected for {demographic}. Provide unconscious bias training and standardize evaluation criteria."
else:
return f"Potential bias detected for {demographic}. Monitor closely and implement bias mitigation strategies."
def _generate_interviewer_recommendations(self, outlier_interviewers: Dict[str, Any]) -> List[str]:
"""Generate recommendations for interviewer issues."""
if not outlier_interviewers:
return ["All interviewers performing within expected ranges"]
recommendations = []
for interviewer, info in outlier_interviewers.items():
issues = info["issues"]
if len(issues) >= 2:
recommendations.append(f"Interviewer {interviewer}: Requires comprehensive recalibration - multiple issues detected")
elif "score_inflation" in issues:
recommendations.append(f"Interviewer {interviewer}: Provide calibration training on scoring standards")
elif "hire_rate_deviation" in issues:
recommendations.append(f"Interviewer {interviewer}: Review hiring bar standards and decision criteria")
return recommendations
def _generate_calibration_recommendations(self, mean_std: float, agreement_rate: float) -> List[str]:
"""Generate calibration improvement recommendations."""
recommendations = []
if mean_std > self.calibration_standards["interviewer_agreement"]["maximum_std_deviation"]:
recommendations.append("High score variance detected - implement regular calibration sessions")
recommendations.append("Create shared examples of scoring standards for each competency")
if agreement_rate < self.calibration_standards["interviewer_agreement"]["agreement_threshold"]:
recommendations.append("Low interviewer agreement rate - standardize interview questions and evaluation criteria")
recommendations.append("Implement mandatory interviewer training on consistent evaluation")
if not recommendations:
recommendations.append("Calibration appears healthy - maintain current practices")
return recommendations
def _assess_scoring_health(self, distribution: Dict[str, Any], mean_score: float, target_mean: float) -> str:
"""Assess overall health of scoring patterns."""
issues = 0
# Check distribution deviations
for score_level, analysis in distribution.items():
if analysis["significant_deviation"]:
issues += 1
# Check mean deviation
if abs(mean_score - target_mean) > 0.3:
issues += 1
if issues == 0:
return "healthy"
elif issues <= 2:
return "concerning"
else:
return "poor"
def _generate_trend_insights(self, score_trend: float, hire_rate_trend: float, period_metrics: Dict[str, Any]) -> List[str]:
"""Generate insights from trend analysis."""
insights = []
if abs(score_trend) > 0.05:
direction = "increasing" if score_trend > 0 else "decreasing"
insights.append(f"Significant {direction} trend in average scores over time")
if score_trend > 0:
insights.append("May indicate score inflation or improving candidate quality")
else:
insights.append("May indicate stricter evaluation or declining candidate quality")
if abs(hire_rate_trend) > 0.02:
direction = "increasing" if hire_rate_trend > 0 else "decreasing"
insights.append(f"Significant {direction} trend in hire rates over time")
if hire_rate_trend > 0:
insights.append("Consider if hiring bar has lowered or candidate pool improved")
else:
insights.append("Consider if hiring bar has raised or candidate pool declined")
# Check for consistency
period_values = list(period_metrics.values())
hire_rates = [p["hire_rate"] for p in period_values]
hire_rate_variance = statistics.variance(hire_rates) if len(hire_rates) > 1 else 0
if hire_rate_variance > 0.01: # High variance in hire rates
insights.append("High variance in hire rates across periods - consider process standardization")
if not insights:
insights.append("Hiring patterns appear stable over time")
return insights
def _analyze_single_interviewer_consistency(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Analyze consistency for single-interviewer candidates."""
# Look at consistency within individual interviewers
interviewer_scores = defaultdict(list)
for record in data:
interviewer_scores[record["interviewer_id"]].extend(record["scores"].values())
consistency_analysis = {}
for interviewer, scores in interviewer_scores.items():
if len(scores) >= 10: # Need sufficient data
consistency_analysis[interviewer] = {
"mean_score": round(statistics.mean(scores), 2),
"std_score": round(statistics.stdev(scores), 2),
"coefficient_of_variation": round(statistics.stdev(scores) / statistics.mean(scores), 2),
"total_scores": len(scores)
}
return consistency_analysis
def format_human_readable(calibration_report: Dict[str, Any]) -> str:
    """Format calibration report in human-readable format.

    Renders each analysis section (data summary, health score, bias,
    calibration consistency, scoring patterns, top recommendations) that is
    present in the report; absent sections are skipped. If the report
    carries an "error" key, only the header and the error are rendered.

    Args:
        calibration_report: Report dict produced by the calibrator.

    Returns:
        Multi-line plain-text report.
    """
    output = []
    # Header
    output.append("HIRING CALIBRATION ANALYSIS REPORT")
    output.append("=" * 60)
    output.append(f"Analysis Type: {calibration_report.get('analysis_type', 'N/A').title()}")
    output.append(f"Generated: {calibration_report.get('generated_at', 'N/A')}")
    # Short-circuit: an errored analysis has no sections worth rendering.
    if "error" in calibration_report:
        output.append(f"\nError: {calibration_report['error']}")
        return "\n".join(output)
    # Data Summary
    data_summary = calibration_report.get("data_summary", {})
    if data_summary:
        output.append("\nDATA SUMMARY")
        output.append("-" * 30)
        output.append(f"Total Candidates: {data_summary.get('total_candidates', 0)}")
        output.append(f"Unique Interviewers: {data_summary.get('unique_interviewers', 0)}")
        output.append(f"Overall Hire Rate: {data_summary.get('hire_rate', 0):.1%}")
        score_stats = data_summary.get("score_statistics", {})
        output.append(f"Average Score: {score_stats.get('mean_average_scores', 0):.2f}")
        output.append(f"Score Std Dev: {score_stats.get('std_average_scores', 0):.2f}")
    # Health Score
    health_score = calibration_report.get("calibration_health_score", {})
    if health_score:
        output.append("\nCALIBRATION HEALTH SCORE")
        output.append("-" * 30)
        output.append(f"Overall Score: {health_score.get('overall_score', 0):.3f}")
        output.append(f"Health Category: {health_score.get('health_category', 'Unknown').title()}")
        if health_score.get("improvement_priority"):
            output.append(f"Priority Areas: {', '.join(health_score['improvement_priority'])}")
    # Bias Analysis
    bias_analysis = calibration_report.get("bias_analysis", {})
    if bias_analysis:
        output.append("\nBIAS ANALYSIS")
        output.append("-" * 30)
        output.append(f"Overall Bias Score: {bias_analysis.get('overall_bias_score', 0):.3f}")
        # Demographic bias
        demographic_bias = bias_analysis.get("demographic_bias", {})
        if demographic_bias:
            output.append("\nDemographic Bias Issues:")
            for demo, analysis in demographic_bias.items():
                # Join the issue names; interpolating .keys() directly would
                # print the dict_keys([...]) repr instead of readable text.
                issue_names = ", ".join(analysis.get("bias_details", {}).keys())
                output.append(f"{demo.replace('_', ' ').title()}: {issue_names}")
        # Interviewer bias
        interviewer_bias = bias_analysis.get("interviewer_bias", {})
        outlier_interviewers = interviewer_bias.get("outlier_interviewers", {})
        if outlier_interviewers:
            output.append("\nOutlier Interviewers:")
            for interviewer, info in outlier_interviewers.items():
                issues = ", ".join(info["issues"])
                output.append(f"{interviewer}: {issues}")
    # Calibration Analysis (skipped if the section itself errored)
    calibration_analysis = calibration_report.get("calibration_analysis", {})
    if calibration_analysis and "error" not in calibration_analysis:
        output.append("\nCALIBRATION CONSISTENCY")
        output.append("-" * 30)
        output.append(f"Quality: {calibration_analysis.get('calibration_quality', 'Unknown').title()}")
        output.append(f"Agreement Rate: {calibration_analysis.get('agreement_within_one_point_rate', 0):.1%}")
        output.append(f"Score Std Dev: {calibration_analysis.get('mean_score_standard_deviation', 0):.3f}")
    # Scoring Analysis
    scoring_analysis = calibration_report.get("scoring_analysis", {})
    if scoring_analysis:
        output.append("\nSCORING PATTERNS")
        output.append("-" * 30)
        output.append(f"Overall Assessment: {scoring_analysis.get('overall_assessment', 'Unknown').title()}")
        score_stats = scoring_analysis.get("score_statistics", {})
        output.append(f"Mean Score: {score_stats.get('mean_score', 0):.2f} (Target: {score_stats.get('target_mean', 0):.2f})")
        # Distribution analysis (1-4 scoring scale)
        distribution = scoring_analysis.get("score_distribution", {})
        if distribution:
            output.append("\nScore Distribution vs Expected:")
            for score in ["1", "2", "3", "4"]:
                if score in distribution:
                    actual = distribution[score]["actual_percentage"]
                    expected = distribution[score]["expected_percentage"]
                    output.append(f"  Score {score}: {actual:.1%} (Expected: {expected:.1%})")
    # Top Recommendations
    recommendations = calibration_report.get("recommendations", [])
    if recommendations:
        output.append("\nTOP RECOMMENDATIONS")
        output.append("-" * 30)
        for i, rec in enumerate(recommendations[:5], 1):  # Show top 5
            output.append(f"{i}. {rec['title']} ({rec['priority'].title()} Priority)")
            output.append(f"   {rec['description']}")
            if rec.get('actions'):
                output.append(f"   Actions: {len(rec['actions'])} specific action items")
    return "\n".join(output)
def main():
    """CLI entry point: parse arguments, run the calibration analysis, and
    write JSON and/or human-readable text reports.

    Exits with status 1 on input-file, JSON-parsing, or analysis errors.
    """
    parser = argparse.ArgumentParser(description="Analyze interview data for bias and calibration issues")
    parser.add_argument("--input", type=str, required=True, help="Input JSON file with interview results data")
    parser.add_argument("--analysis-type", type=str, choices=["comprehensive", "bias", "calibration", "interviewer", "scoring"],
                        default="comprehensive", help="Type of analysis to perform")
    parser.add_argument("--competencies", type=str, help="Comma-separated list of competencies to focus on")
    parser.add_argument("--trend-analysis", action="store_true", help="Perform trend analysis over time")
    parser.add_argument("--period", type=str, choices=["daily", "weekly", "monthly", "quarterly"],
                        default="monthly", help="Time period for trend analysis")
    parser.add_argument("--output", type=str, help="Output file path")
    parser.add_argument("--format", choices=["json", "text", "both"], default="both", help="Output format")
    args = parser.parse_args()
    # Load input data: must be a JSON array of interview records.
    try:
        with open(args.input, 'r') as f:
            interview_data = json.load(f)
        if not isinstance(interview_data, list):
            print("Error: Input data must be a JSON array of interview records")
            sys.exit(1)
    except FileNotFoundError:
        print(f"Error: Input file '{args.input}' not found")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in input file: {e}")
        sys.exit(1)
    except Exception as e:
        # Catch-all for other I/O problems (permissions, encoding, ...).
        print(f"Error reading input file: {e}")
        sys.exit(1)
    # Initialize calibrator and run analysis
    calibrator = HiringCalibrator()
    competencies = args.competencies.split(',') if args.competencies else None
    try:
        # NOTE(review): analyze_hiring_calibration is defined elsewhere in
        # this file; assumed to return a dict, possibly with an "error" key.
        results = calibrator.analyze_hiring_calibration(
            interview_data=interview_data,
            analysis_type=args.analysis_type,
            competencies=competencies,
            trend_analysis=args.trend_analysis,
            period=args.period
        )
        # Derive output paths: with --output, normalize .json/.txt suffixes;
        # otherwise generate a timestamped base filename.
        if args.output:
            output_path = args.output
            json_path = output_path if output_path.endswith('.json') else f"{output_path}.json"
            text_path = output_path.replace('.json', '.txt') if output_path.endswith('.json') else f"{output_path}.txt"
        else:
            base_filename = f"calibration_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
            json_path = f"{base_filename}.json"
            text_path = f"{base_filename}.txt"
        # Write outputs in the requested format(s).
        if args.format in ["json", "both"]:
            with open(json_path, 'w') as f:
                # default=str stringifies non-JSON-native values (e.g. datetimes).
                json.dump(results, f, indent=2, default=str)
            print(f"JSON report written to: {json_path}")
        if args.format in ["text", "both"]:
            with open(text_path, 'w') as f:
                f.write(format_human_readable(results))
            print(f"Text report written to: {text_path}")
        # Print summary to stdout for quick inspection.
        print(f"\nCalibration Analysis Summary:")
        if "error" in results:
            print(f"Error: {results['error']}")
        else:
            health_score = results.get("calibration_health_score", {})
            print(f"Health Score: {health_score.get('overall_score', 0):.3f} ({health_score.get('health_category', 'Unknown').title()})")
            bias_score = results.get("bias_analysis", {}).get("overall_bias_score", 0)
            print(f"Bias Score: {bias_score:.3f} (Lower is better)")
            recommendations = results.get("recommendations", [])
            print(f"Recommendations Generated: {len(recommendations)}")
            if recommendations:
                print(f"Top Priority: {recommendations[0]['title']} ({recommendations[0]['priority'].title()})")
    except Exception as e:
        # Broad by design: any analysis/writing failure exits non-zero.
        print(f"Error during analysis: {e}")
        sys.exit(1)
# Standard script entry-point guard: run the CLI only when executed directly.
if __name__ == "__main__":
    main()