Auto Loan Default Prediction Using SEC EDGAR ABS-EE Data¶
Objective: Build a credit scoring model for auto loan default prediction using publicly available loan-level data from SEC EDGAR Asset-Backed Securities (ABS-EE) filings.
Data Source: Ally Auto Receivables Trust filings (2024), containing loan-level attributes for ~168,000 auto loans including borrower credit scores, loan terms, vehicle information, payment history, and delinquency status.
Approach:
- Acquire loan-level data from SEC EDGAR EFTS API
- Define a rigorous default target variable
- Engineer predictive features from raw loan attributes
- Train and compare three model families: Logistic Regression, Gradient Boosting (XGBoost), and a Neural Network
- Evaluate using credit-scoring metrics: AUC, Kolmogorov-Smirnov statistic, and Gini coefficient
- Conduct fairness analysis across geography and credit tiers
import sys
sys.path.insert(0, "../src")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
from sklearn.calibration import calibration_curve
import xgboost as xgb
import torch
import torch.nn as nn
import json
import pickle
import warnings
warnings.filterwarnings("ignore")
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)
plt.rcParams["figure.dpi"] = 120
from features import load_raw, engineer_target, engineer_features, select_model_features
from train import DefaultNet
1. Data Overview¶
The raw data comes from SEC EDGAR ABS-EE filings -- standardized disclosures required for asset-backed securities. Each filing contains an EX-102 XML attachment with loan-level data for every asset in the trust pool. We pulled three Ally Auto Receivables Trust filings from 2024.
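For orientation, the sketch below shows one way an EX-102 attachment could be parsed into a DataFrame once downloaded. The repeating asset element name ("assets") is an assumption about the ABS-EE schema, and the file path is a placeholder; the actual acquisition logic used by this notebook is load_raw in ../src.

# Sketch only: parse a locally saved EX-102 XML attachment into a DataFrame.
# The filename is a placeholder and the repeating element name ("assets") is an
# assumption about the ABS-EE schema; the notebook's real loader is features.load_raw().
import xml.etree.ElementTree as ET

def parse_ex102_sketch(path):
    root = ET.parse(path).getroot()
    records = []
    for elem in root.iter():
        # Each repeating asset element holds one loan's fields as child tags,
        # e.g. assetNumber, obligorCreditScore, currentDelinquencyStatus.
        if elem.tag.split("}")[-1] == "assets":
            records.append({child.tag.split("}")[-1]: child.text for child in elem})
    return pd.DataFrame(records)

# Example usage (placeholder path):
# one_filing = parse_ex102_sketch("ex102_ally_2024.xml")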
raw = load_raw()
print(f"Total loan records: {len(raw):,}")
print(f"Unique loans (by assetNumber): {raw['assetNumber'].nunique():,}")
print(f"Filings: {raw['filing_date'].nunique()}")
print(f"Reporting periods: {raw['period_ending'].unique().tolist()}")
print(f"\nColumns ({len(raw.columns)}):")
print(raw.dtypes.to_string())
Total loan records: 168,643
Unique loans (by assetNumber): 168,643
Filings: 3
Reporting periods: ['2024-06-30', '2024-05-31', '2024-11-30']

Columns (40):
assetNumber                                  str
reportingPeriodBeginningDate                 str
reportingPeriodEndingDate                    str
originatorName                               str
originationDate                              str
originalLoanAmount                           float64
originalLoanTerm                             int64
loanMaturityDate                             str
originalInterestRatePercentage               float64
originalInterestRateTypeCode                 str
vehicleManufacturerName                      str
vehicleModelName                             str
vehicleNewUsedCode                           str
vehicleModelYear                             str
vehicleTypeCode                              str
vehicleValueAmount                           float64
obligorCreditScore                           float64
obligorCreditScoreType                       str
coObligorIndicator                           str
paymentToIncomePercentage                    float64
obligorGeographicLocation                    str
remainingTermToMaturityNumber                float64
reportingPeriodBeginningLoanBalanceAmount    float64
reportingPeriodActualEndBalanceAmount        float64
reportingPeriodScheduledPaymentAmount        float64
totalActualAmountPaid                        float64
actualInterestCollectedAmount                float64
actualPrincipalCollectedAmount               float64
reportingPeriodInterestRatePercentage        float64
scheduledInterestAmount                      float64
scheduledPrincipalAmount                     float64
currentDelinquencyStatus                     float64
repossessedIndicator                         str
chargedoffPrincipalAmount                    float64
recoveredAmount                              float64
zeroBalanceCode                              str
primaryLoanServicerName                      str
subvented                                    str
filing_date                                  str
period_ending                                str
2. Exploratory Data Analysis¶
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
# Credit score distribution
axes[0, 0].hist(raw["obligorCreditScore"].dropna(), bins=60, color="#4C72B0", edgecolor="white", alpha=0.85)
axes[0, 0].set_xlabel("Credit Score")
axes[0, 0].set_ylabel("Count")
axes[0, 0].set_title("Credit Score Distribution")
axes[0, 0].axvline(x=660, color="red", linestyle="--", alpha=0.7, label="Subprime threshold (660)")
axes[0, 0].legend(fontsize=9)
# Original loan amount
axes[0, 1].hist(raw["originalLoanAmount"].dropna(), bins=60, color="#55A868", edgecolor="white", alpha=0.85)
axes[0, 1].set_xlabel("Original Loan Amount ($)")
axes[0, 1].set_ylabel("Count")
axes[0, 1].set_title("Loan Amount Distribution")
axes[0, 1].xaxis.set_major_formatter(mtick.StrMethodFormatter("${x:,.0f}"))
# Interest rate
axes[0, 2].hist(raw["originalInterestRatePercentage"].dropna() * 100, bins=50, color="#C44E52", edgecolor="white", alpha=0.85)
axes[0, 2].set_xlabel("Interest Rate (%)")
axes[0, 2].set_ylabel("Count")
axes[0, 2].set_title("Interest Rate Distribution")
# Original loan term
term_counts = raw["originalLoanTerm"].value_counts().sort_index()
axes[1, 0].bar(term_counts.index, term_counts.values, color="#8172B2", edgecolor="white", alpha=0.85)
axes[1, 0].set_xlabel("Loan Term (months)")
axes[1, 0].set_ylabel("Count")
axes[1, 0].set_title("Loan Term Distribution")
# Vehicle type
vtype_map = {"1": "Car", "2": "Truck", "3": "SUV", "4": "Motorcycle"}
vtype_counts = raw["vehicleTypeCode"].map(vtype_map).value_counts()
axes[1, 1].bar(vtype_counts.index, vtype_counts.values, color="#CCB974", edgecolor="white", alpha=0.85)
axes[1, 1].set_xlabel("Vehicle Type")
axes[1, 1].set_ylabel("Count")
axes[1, 1].set_title("Vehicle Type Distribution")
# New vs Used
nu_map = {"1": "New", "2": "Used"}
nu_counts = raw["vehicleNewUsedCode"].map(nu_map).value_counts()
axes[1, 2].bar(nu_counts.index, nu_counts.values, color="#64B5CD", edgecolor="white", alpha=0.85)
axes[1, 2].set_xlabel("Vehicle Condition")
axes[1, 2].set_ylabel("Count")
axes[1, 2].set_title("New vs Used")
plt.tight_layout()
plt.savefig("../figures/eda_distributions.png", bbox_inches="tight")
plt.show()
# Geographic distribution - top 15 states
state_counts = raw["obligorGeographicLocation"].value_counts().head(15)
fig, ax = plt.subplots(figsize=(12, 5))
ax.barh(state_counts.index[::-1], state_counts.values[::-1], color="#4C72B0", edgecolor="white")
ax.set_xlabel("Number of Loans")
ax.set_title("Loan Volume by State (Top 15)")
for i, v in enumerate(state_counts.values[::-1]):
    ax.text(v + 200, i, f"{v:,}", va="center", fontsize=9)
plt.tight_layout()
plt.savefig("../figures/geographic_distribution.png", bbox_inches="tight")
plt.show()
import plotly.express as px
import plotly.io as pio
# Aggregate by state
state_stats = raw.copy()
state_stats = engineer_target(state_stats)
state_agg = state_stats.groupby("obligorGeographicLocation").agg(
loan_count=("assetNumber", "count"),
default_rate=("default", "mean"),
avg_credit_score=("obligorCreditScore", "mean"),
avg_loan_amount=("originalLoanAmount", "mean"),
).reset_index()
state_agg.columns = ["state", "loan_count", "default_rate", "avg_credit_score", "avg_loan_amount"]
# Loan volume map
fig1 = px.choropleth(
state_agg, locations="state", locationmode="USA-states",
color="loan_count", color_continuous_scale="Blues",
scope="usa", title="Loan Volume by State",
labels={"loan_count": "Loans"},
)
fig1.update_layout(margin=dict(l=0, r=0, t=40, b=0), width=800, height=450)
fig1.write_image("../figures/map_loan_volume.png", scale=2)
fig1.write_html("../docs/map_loan_volume.html", include_plotlyjs="cdn")
fig1.show()
# Default rate map
fig2 = px.choropleth(
state_agg, locations="state", locationmode="USA-states",
color="default_rate", color_continuous_scale="RdYlGn_r",
scope="usa", title="Default Rate by State",
labels={"default_rate": "Default Rate"},
range_color=[0, state_agg["default_rate"].quantile(0.95)],
)
fig2.update_layout(margin=dict(l=0, r=0, t=40, b=0), width=800, height=450)
fig2.write_image("../figures/map_default_rate.png", scale=2)
fig2.write_html("../docs/map_default_rate.html", include_plotlyjs="cdn")
fig2.show()
3. Target Variable: Defining Default¶
We define default using the standard ABS credit analysis convention. A loan is considered in default if any of the following conditions hold during the reporting period:
$$ \text{Default}_i = \mathbb{1}\left[\text{DPD}_i \geq 60 \;\lor\; \text{Repossessed}_i = \text{true} \;\lor\; \text{ChargeOff}_i > 0\right] $$
where:
- $\text{DPD}_i$ is the number of days past due, derived from the currentDelinquencyStatus field, which reports delinquency in 30-day increments
- A currentDelinquencyStatus of 2 therefore corresponds to 60+ days delinquent, the conventional threshold for "serious delinquency"
- Repossession and charge-off are terminal default events
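A minimal sketch of this target construction, consistent with the definition above (the implementation the notebook actually calls is engineer_target in ../src, so treat this as illustrative):

# Sketch of the default flag implied by the definition above; illustrative only,
# the notebook uses features.engineer_target() from ../src.
def engineer_target_sketch(df):
    df = df.copy()
    # currentDelinquencyStatus counts 30-day increments, so >= 2 means 60+ days past due
    df["is_delinquent_60plus"] = df["currentDelinquencyStatus"].fillna(0) >= 2
    df["is_repossessed"] = df["repossessedIndicator"].astype(str).str.lower().isin(["true", "1"])
    df["is_chargedoff"] = df["chargedoffPrincipalAmount"].fillna(0) > 0
    df["default"] = (
        df["is_delinquent_60plus"] | df["is_repossessed"] | df["is_chargedoff"]
    ).astype(int)
    return df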
df = engineer_target(raw.copy())
print(f"Default rate: {df['default'].mean():.4f} ({df['default'].sum():,} / {len(df):,})")
print(f"\nDefault composition:")
print(f" 60+ DPD: {df['is_delinquent_60plus'].sum():>6,}")
print(f" Repossessed: {df['is_repossessed'].sum():>6,}")
print(f" Charged off: {df['is_chargedoff'].sum():>6,}")
print(f" Total unique: {df['default'].sum():>6,}")
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Default rate
labels = ["Performing", "Default"]
sizes = [1 - df["default"].mean(), df["default"].mean()]
colors = ["#55A868", "#C44E52"]
axes[0].pie(sizes, labels=labels, colors=colors, autopct="%1.1f%%", startangle=90, textprops={"fontsize": 12})
axes[0].set_title("Default Rate")
# Delinquency distribution
delinq = df["currentDelinquencyStatus"].fillna(0).clip(upper=6)
delinq_counts = delinq.value_counts().sort_index()
delinq_labels = {0: "Current", 1: "30 DPD", 2: "60 DPD", 3: "90 DPD", 4: "120 DPD", 5: "150 DPD", 6: "180+ DPD"}
axes[1].bar(
[delinq_labels.get(k, str(k)) for k in delinq_counts.index],
delinq_counts.values,
color=["#55A868"] + ["#4C72B0"] + ["#C44E52"] * 5,
edgecolor="white"
)
axes[1].set_ylabel("Count")
axes[1].set_title("Delinquency Status Distribution")
axes[1].set_yscale("log")
plt.tight_layout()
plt.savefig("../figures/target_distribution.png", bbox_inches="tight")
plt.show()
Default rate: 0.0618 (10,416 / 168,643)

Default composition:
  60+ DPD:       9,392
  Repossessed:     972
  Charged off:     113
  Total unique: 10,416
4. Feature Engineering¶
We construct 22 features from the raw loan attributes. Key engineered features include:
| Feature | Formula | Intuition |
|---|---|---|
| LTV Ratio | $\text{LTV}_i = \frac{\text{LoanAmount}_i}{\text{VehicleValue}_i}$ | Higher LTV = more underwater risk |
| Balance Ratio | $\beta_i = \frac{\text{CurrentBalance}_i}{\text{OriginalAmount}_i}$ | How much principal remains |
| Payment Shortfall | $\max(0,\; \text{Scheduled}_i - \text{ActualPaid}_i)$ | Dollar amount of underpayment |
| Seasoning | $s_i = \frac{\text{OriginalTerm}_i - \text{RemainingTerm}_i}{\text{OriginalTerm}_i}$ | Loan maturity progress |
| Rate Spread | $r_i - \tilde{r}$ | Deviation from median portfolio rate |
| Vehicle Age at Origination | $\text{OrigYear}_i - \text{ModelYear}_i$ | Depreciation exposure |
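The sketch below spells out a handful of these features in terms of the raw column names; the full 22-feature set is built by engineer_features in ../src, so the exact definitions there may differ in detail.

# Sketch of selected engineered features from the table above (illustrative only;
# the notebook uses features.engineer_features() for the full feature set).
def engineer_features_sketch(df):
    df = df.copy()
    df["ltv_ratio"] = df["originalLoanAmount"] / df["vehicleValueAmount"]
    df["balance_ratio"] = df["reportingPeriodActualEndBalanceAmount"] / df["originalLoanAmount"]
    df["payment_shortfall"] = (
        df["reportingPeriodScheduledPaymentAmount"] - df["totalActualAmountPaid"]
    ).clip(lower=0)
    df["seasoning_pct"] = (
        df["originalLoanTerm"] - df["remainingTermToMaturityNumber"]
    ) / df["originalLoanTerm"]
    df["rate_spread"] = (
        df["originalInterestRatePercentage"] - df["originalInterestRatePercentage"].median()
    )
    # Vehicle age at origination: origination calendar year minus model year
    df["vehicle_age_at_orig"] = (
        pd.to_datetime(df["originationDate"]).dt.year
        - pd.to_numeric(df["vehicleModelYear"], errors="coerce")
    )
    return df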
df = engineer_features(df)
model_df, feature_cols, target_col = select_model_features(df)
print(f"Final dataset: {model_df.shape[0]:,} rows x {len(feature_cols)} features")
print(f"Default rate: {model_df[target_col].mean():.4f}")
print(f"\nFeature summary statistics:")
model_df[feature_cols].describe().round(4).T
Final dataset: 163,454 rows x 22 features
Default rate: 0.0617

Feature summary statistics:
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| obligorCreditScore | 163454.0 | 721.9396 | 74.2322 | 193.0000 | 680.0000 | 716.0000 | 769.0000 | 999.0000 |
| originalLoanAmount | 163454.0 | 27579.4240 | 13759.2939 | 4000.0000 | 18428.2800 | 24776.1650 | 33366.5525 | 215736.2400 |
| originalLoanTerm | 163454.0 | 70.6090 | 9.5114 | 13.0000 | 62.0000 | 73.0000 | 76.0000 | 92.0000 |
| originalInterestRatePercentage | 163454.0 | 0.0772 | 0.0270 | 0.0000 | 0.0599 | 0.0724 | 0.0919 | 0.2500 |
| reportingPeriodInterestRatePercentage | 163454.0 | 0.0778 | 0.0237 | 0.0000 | 0.0649 | 0.0744 | 0.0869 | 0.2500 |
| vehicleValueAmount | 163454.0 | 30489.3098 | 15248.3133 | 4750.0000 | 19932.2500 | 26998.0000 | 37275.0000 | 330646.0000 |
| paymentToIncomePercentage | 163454.0 | 0.0698 | 0.0469 | 0.0000 | 0.0375 | 0.0610 | 0.0935 | 0.4736 |
| remainingTermToMaturityNumber | 163454.0 | 38.2417 | 15.3922 | 0.0000 | 31.0000 | 40.0000 | 47.0000 | 83.0000 |
| reportingPeriodBeginningLoanBalanceAmount | 163454.0 | 15983.2430 | 10231.4786 | -4121.2300 | 10343.1200 | 14417.1000 | 18950.5200 | 187199.1500 |
| reportingPeriodActualEndBalanceAmount | 163454.0 | 15360.2716 | 10213.0216 | -8190.6200 | 9772.2700 | 13882.7000 | 18426.9300 | 184673.0100 |
| ltv_ratio | 163454.0 | 0.9320 | 0.1968 | 0.2453 | 0.8167 | 0.9714 | 1.0869 | 1.2050 |
| payment_burden | 163454.0 | 0.0180 | 0.0028 | 0.0000 | 0.0169 | 0.0176 | 0.0187 | 0.0878 |
| balance_ratio | 163454.0 | 0.5709 | 0.2198 | -0.3290 | 0.4825 | 0.6046 | 0.6954 | 0.9970 |
| payment_shortfall | 163454.0 | 26.1457 | 122.7786 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 2639.5100 |
| underpaid | 163454.0 | 0.0709 | 0.2566 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 1.0000 |
| months_elapsed | 163454.0 | 32.9392 | 14.7983 | 1.0000 | 26.0000 | 31.0000 | 38.0000 | 86.0000 |
| seasoning_pct | 163454.0 | 0.4653 | 0.2034 | 0.0137 | 0.3562 | 0.4384 | 0.5410 | 1.0000 |
| rate_spread | 163454.0 | 0.0038 | 0.0270 | -0.0734 | -0.0135 | -0.0010 | 0.0185 | 0.1766 |
| vehicle_age_at_orig | 163454.0 | 2.7459 | 2.8686 | -1.0000 | 0.0000 | 2.0000 | 5.0000 | 21.0000 |
| is_used_vehicle | 163454.0 | 0.6491 | 0.4773 | 0.0000 | 0.0000 | 1.0000 | 1.0000 | 1.0000 |
| has_coborrower | 163454.0 | 0.3608 | 0.4802 | 0.0000 | 0.0000 | 0.0000 | 1.0000 | 1.0000 |
| is_subvented | 163454.0 | 0.2073 | 0.4053 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 1.0000 |
# Correlation of features with default
corr_with_target = model_df[feature_cols + [target_col]].corr()[target_col].drop(target_col).sort_values()
fig, ax = plt.subplots(figsize=(10, 8))
colors = ["#C44E52" if v > 0 else "#4C72B0" for v in corr_with_target.values]
ax.barh(corr_with_target.index, corr_with_target.values, color=colors, edgecolor="white")
ax.set_xlabel("Pearson Correlation with Default")
ax.set_title("Feature Correlation with Default")
ax.axvline(x=0, color="black", linewidth=0.5)
plt.tight_layout()
plt.savefig("../figures/feature_correlation.png", bbox_inches="tight")
plt.show()
# Default rate by credit score bin and vehicle type
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# By credit tier
tier_default = model_df.groupby("fico_bin", observed=True)[target_col].agg(["mean", "count"])
tier_default = tier_default[tier_default["count"] >= 50]
axes[0].bar(tier_default.index.astype(str), tier_default["mean"] * 100, color="#C44E52", edgecolor="white", alpha=0.85)
axes[0].set_xlabel("Credit Tier")
axes[0].set_ylabel("Default Rate (%)")
axes[0].set_title("Default Rate by Credit Tier")
axes[0].tick_params(axis="x", rotation=45)
for i, (idx, row) in enumerate(tier_default.iterrows()):
    axes[0].text(i, row["mean"] * 100 + 0.2, f"n={row['count']:,.0f}", ha="center", fontsize=8)
# By state (top 15 by volume)
top_states = model_df["obligorGeographicLocation"].value_counts().head(15).index
state_default = model_df[model_df["obligorGeographicLocation"].isin(top_states)].groupby("obligorGeographicLocation")[target_col].mean().sort_values(ascending=False)
axes[1].barh(state_default.index[::-1], state_default.values[::-1] * 100, color="#4C72B0", edgecolor="white")
axes[1].set_xlabel("Default Rate (%)")
axes[1].set_title("Default Rate by State (Top 15 by Volume)")
plt.tight_layout()
plt.savefig("../figures/default_by_segment.png", bbox_inches="tight")
plt.show()
5. Model Training and Comparison¶
We split the data 80/20 with stratification on the target, then train three model families:
- Logistic Regression -- the interpretable baseline, standard in credit risk
- XGBoost -- gradient-boosted trees, the industry workhorse for tabular data
- Neural Network -- a 4-layer feedforward network (128-64-32-1) with batch normalization and dropout
All models handle class imbalance: logistic regression via class_weight="balanced", XGBoost via scale_pos_weight, and the neural network via weighted BCE loss with:
$$ w^+ = \frac{N^-}{N^+} $$
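For reference, a sketch of a network matching the 128-64-32-1 architecture described above; the DefaultNet actually loaded below is defined in train.py, so the layer ordering here is an assumption consistent with Appendix A.3.

# Sketch of the 128-64-32-1 feedforward architecture (see Appendix A.3); illustrative only,
# the trained DefaultNet class used below comes from ../src/train.py.
class DefaultNetSketch(nn.Module):
    def __init__(self, n_features):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 128), nn.ReLU(), nn.BatchNorm1d(128), nn.Dropout(0.3),
            nn.Linear(128, 64), nn.ReLU(), nn.BatchNorm1d(64), nn.Dropout(0.2),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 1),  # raw logit; sigmoid is applied at inference time
        )

    def forward(self, x):
        return self.net(x)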
X = model_df[feature_cols]
y = model_df[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Train: {len(X_train):,} | Test: {len(X_test):,}")
print(f"Train default rate: {y_train.mean():.4f} | Test default rate: {y_test.mean():.4f}")
scaler = StandardScaler()
scaler.fit(X_train)
# Load pre-trained models
with open("../models/logistic.pkl", "rb") as f:
lr_bundle = pickle.load(f)
lr_model = lr_bundle["model"]
xgb_model = xgb.XGBClassifier()
xgb_model.load_model("../models/xgboost.json")
nn_model = DefaultNet(len(feature_cols))
nn_state = torch.load("../models/neural_net.pt", weights_only=False)
nn_model.load_state_dict(nn_state["model_state"])
nn_model.eval()
# Generate predictions
X_test_scaled = scaler.transform(X_test)
lr_probs = lr_model.predict_proba(X_test_scaled)[:, 1]
xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
with torch.no_grad():
    nn_probs = torch.sigmoid(nn_model(torch.FloatTensor(np.array(X_test_scaled, dtype=np.float32)))).squeeze().numpy()
print("\nPredictions generated for all three models.")
Train: 130,763 | Test: 32,691
Train default rate: 0.0617 | Test default rate: 0.0617
Predictions generated for all three models.
def ks_statistic(y_true, y_prob):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return max(tpr - fpr)
# Results table
results = []
for name, probs in [("Logistic Regression", lr_probs), ("XGBoost", xgb_probs), ("Neural Network", nn_probs)]:
auc = roc_auc_score(y_test, probs)
ks = ks_statistic(y_test, probs)
gini = 2 * auc - 1
results.append({"Model": name, "AUC": f"{auc:.4f}", "KS Statistic": f"{ks:.4f}", "Gini": f"{gini:.4f}"})
results_df = pd.DataFrame(results)
results_df.style.set_caption("Model Performance Comparison (Test Set)")
| | Model | AUC | KS Statistic | Gini |
|---|---|---|---|---|
| 0 | Logistic Regression | 0.8749 | 0.5902 | 0.7498 |
| 1 | XGBoost | 0.9219 | 0.6935 | 0.8438 |
| 2 | Neural Network | 0.9558 | 0.8022 | 0.9116 |
# ROC Curves
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# ROC
colors = {"Logistic Regression": "#4C72B0", "XGBoost": "#55A868", "Neural Network": "#C44E52"}
for name, probs in [("Logistic Regression", lr_probs), ("XGBoost", xgb_probs), ("Neural Network", nn_probs)]:
fpr, tpr, _ = roc_curve(y_test, probs)
auc = roc_auc_score(y_test, probs)
axes[0].plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})", color=colors[name], linewidth=2)
axes[0].plot([0, 1], [0, 1], "k--", alpha=0.3)
axes[0].set_xlabel("False Positive Rate")
axes[0].set_ylabel("True Positive Rate")
axes[0].set_title("ROC Curves")
axes[0].legend(fontsize=9)
# KS Plot (using XGBoost as the best traditional model)
fpr, tpr, thresholds = roc_curve(y_test, xgb_probs)
ks_val = max(tpr - fpr)
ks_idx = np.argmax(tpr - fpr)
axes[1].plot(np.linspace(0, 1, len(tpr)), tpr, label="Cumulative Default (TPR)", color="#C44E52", linewidth=2)
axes[1].plot(np.linspace(0, 1, len(fpr)), fpr, label="Cumulative Non-Default (FPR)", color="#4C72B0", linewidth=2)
x_ks = np.linspace(0, 1, len(tpr))[ks_idx]
axes[1].vlines(x_ks, fpr[ks_idx], tpr[ks_idx], color="black", linestyle="--", label=f"KS = {ks_val:.3f}")
axes[1].set_xlabel("Score Percentile")
axes[1].set_ylabel("Cumulative Proportion")
axes[1].set_title("KS Plot (XGBoost)")
axes[1].legend(fontsize=9)
# Precision-Recall
for name, probs in [("Logistic Regression", lr_probs), ("XGBoost", xgb_probs), ("Neural Network", nn_probs)]:
precision, recall, _ = precision_recall_curve(y_test, probs)
axes[2].plot(recall, precision, label=name, color=colors[name], linewidth=2)
axes[2].axhline(y=y_test.mean(), color="gray", linestyle="--", alpha=0.5, label=f"Baseline ({y_test.mean():.3f})")
axes[2].set_xlabel("Recall")
axes[2].set_ylabel("Precision")
axes[2].set_title("Precision-Recall Curves")
axes[2].legend(fontsize=9)
plt.tight_layout()
plt.savefig("../figures/model_comparison.png", bbox_inches="tight")
plt.show()
# Feature importance (XGBoost)
importance = pd.Series(xgb_model.feature_importances_, index=feature_cols).sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(10, 8))
importance.plot.barh(ax=ax, color="#55A868", edgecolor="white")
ax.set_xlabel("Feature Importance (Gain)")
ax.set_title("XGBoost Feature Importance")
plt.tight_layout()
plt.savefig("../figures/feature_importance.png", bbox_inches="tight")
plt.show()
# Calibration curves - how well do predicted probabilities match observed frequencies?
fig, ax = plt.subplots(figsize=(8, 7))
for name, probs, color in [
    ("Logistic Regression", lr_probs, "#4C72B0"),
    ("XGBoost", xgb_probs, "#55A868"),
    ("Neural Network", nn_probs, "#C44E52"),
]:
    prob_true, prob_pred = calibration_curve(y_test, probs, n_bins=15, strategy="quantile")
    ax.plot(prob_pred, prob_true, marker="o", label=name, color=color, linewidth=2, markersize=5)
ax.plot([0, 1], [0, 1], "k--", alpha=0.3, label="Perfect calibration")
ax.set_xlabel("Mean Predicted Probability")
ax.set_ylabel("Observed Default Frequency")
ax.set_title("Calibration Curves")
ax.legend()
plt.tight_layout()
plt.savefig("../figures/calibration_curves.png", bbox_inches="tight")
plt.show()
6. Fairness Analysis¶
Lending models must be evaluated for disparate impact across protected groups. While race and ethnicity are not available in ABS-EE data, geography serves as a proxy for demographic composition. We apply the four-fifths rule (EEOC guidelines, adopted in fair lending):
$$ \text{Disparate Impact Ratio} = \frac{\text{Denial Rate}_{\text{group}}}{\text{Denial Rate}_{\text{reference}}} $$
A ratio below 0.8 or above 1.25 relative to the reference group (median state) flags potential disparate impact.
# Geographic fairness using logistic regression (most interpretable model for fairness)
test_df = model_df.loc[X_test.index].copy()
test_df["pred_prob"] = lr_probs
# Per-state metrics
state_counts = test_df["obligorGeographicLocation"].value_counts()
valid_states = state_counts[state_counts >= 50].index
state_metrics = []
for state in valid_states:
    sdf = test_df[test_df["obligorGeographicLocation"] == state]
    denial_rate = (sdf["pred_prob"] >= 0.5).mean()
    actual_default = sdf["default"].mean()
    state_metrics.append({"state": state, "n": len(sdf), "denial_rate": denial_rate, "actual_default": actual_default})
state_df = pd.DataFrame(state_metrics).sort_values("denial_rate", ascending=False)
reference_rate = state_df["denial_rate"].median()
state_df["di_ratio"] = state_df["denial_rate"] / reference_rate
state_df["flagged"] = (state_df["di_ratio"] < 0.8) | (state_df["di_ratio"] > 1.25)
print(f"Reference denial rate (median state): {reference_rate:.4f}")
print(f"States flagged: {state_df['flagged'].sum()} / {len(state_df)}")
print(f"\nFlagged states:")
print(state_df[state_df["flagged"]][["state", "n", "denial_rate", "actual_default", "di_ratio"]].to_string(index=False))
Reference denial rate (median state): 0.1588
States flagged: 14 / 49

Flagged states:
state     n  denial_rate  actual_default  di_ratio
   MS   343     0.282799        0.084548  1.780776
   NM   119     0.235294        0.067227  1.481640
   ND    84     0.226190        0.083333  1.424315
   NV   230     0.217391        0.073913  1.368906
   OK   387     0.214470        0.046512  1.350513
   GA   933     0.204716        0.080386  1.289090
   WV   223     0.201794        0.080717  1.270689
   DE   143     0.125874        0.020979  0.792626
   IA   311     0.122186        0.054662  0.769405
   OR   257     0.120623        0.035019  0.759557
   NJ  1517     0.114700        0.048121  0.722263
   MT   113     0.106195        0.035398  0.668705
   KS   368     0.105978        0.019022  0.667342
   MO  1107     0.105691        0.043360  0.665533
# Visualize DI ratios
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# DI ratio by state
top_20 = state_df.head(20).sort_values("di_ratio")
bar_colors = ["#C44E52" if f else "#4C72B0" for f in top_20["flagged"]]
axes[0].barh(top_20["state"], top_20["di_ratio"], color=bar_colors, edgecolor="white")
axes[0].axvline(x=0.8, color="red", linestyle="--", alpha=0.6, label="Lower bound (0.8)")
axes[0].axvline(x=1.25, color="red", linestyle="--", alpha=0.6, label="Upper bound (1.25)")
axes[0].axvline(x=1.0, color="gray", linestyle="-", alpha=0.3)
axes[0].set_xlabel("Disparate Impact Ratio")
axes[0].set_title("Disparate Impact Ratio by State (Top 20 by volume)")
axes[0].legend(fontsize=9)
# Actual default vs predicted denial rate
axes[1].scatter(state_df["actual_default"], state_df["denial_rate"],
c=state_df["flagged"].map({True: "#C44E52", False: "#4C72B0"}),
s=state_df["n"] / 10, alpha=0.7, edgecolors="white")
axes[1].plot([0, state_df["actual_default"].max()], [0, state_df["actual_default"].max()],
"k--", alpha=0.3, label="Perfect alignment")
axes[1].set_xlabel("Actual Default Rate")
axes[1].set_ylabel("Model Denial Rate")
axes[1].set_title("Actual Default vs Model Denial Rate by State\n(size = loan volume, red = flagged)")
axes[1].legend(fontsize=9)
plt.tight_layout()
plt.savefig("../figures/fairness_analysis.png", bbox_inches="tight")
plt.show()
# Disparate impact map
fig3 = px.choropleth(
state_df, locations="state", locationmode="USA-states",
color="di_ratio", color_continuous_scale="RdBu_r",
color_continuous_midpoint=1.0,
scope="usa", title="Disparate Impact Ratio by State (1.0 = neutral, red = higher denial rate)",
labels={"di_ratio": "DI Ratio"},
range_color=[0.5, 1.8],
)
fig3.update_layout(margin=dict(l=0, r=0, t=40, b=0), width=800, height=450)
fig3.write_image("../figures/map_disparate_impact.png", scale=2)
fig3.write_html("../docs/map_disparate_impact.html", include_plotlyjs="cdn")
fig3.show()
7. Conclusions¶
Key findings:
- The neural network achieves the highest discriminative power (AUC 0.956), but XGBoost (AUC 0.922) offers a strong balance of performance and interpretability for production use.
- The dominant predictive feature is `underpaid` -- whether the borrower paid less than scheduled in the current period. This is behavioral data that would not be available at origination, suggesting a two-model architecture: one for origination scoring (using only static features) and one for ongoing monitoring.
- Geographic fairness analysis flags 14 states under the four-fifths rule, with Mississippi and New Mexico showing the highest disparate impact ratios. This warrants investigation into whether the model is picking up on legitimate risk factors or geographic proxies for protected characteristics.
- Calibration analysis shows all three models tend to overpredict default probability, likely due to the class-weighting strategy. A post-hoc calibration step (Platt scaling or isotonic regression) would improve probability estimates for pricing applications.
Next steps:
- Separate origination-time features from behavioral features for a proper two-stage model
- Incorporate multiple issuers (Ford, Honda, Toyota trusts) for cross-issuer generalization
- Add temporal validation (train on earlier periods, test on later)
- Apply Platt scaling for probability calibration
Appendix A: Mathematical Foundations¶
A.1 Logistic Regression¶
The logistic regression model estimates the probability of default as:
$$ P(Y_i = 1 \mid \mathbf{x}_i) = \sigma(\mathbf{w}^\top \mathbf{x}_i + b) = \frac{1}{1 + e^{-(\mathbf{w}^\top \mathbf{x}_i + b)}} $$
where $\sigma(\cdot)$ is the sigmoid function, $\mathbf{w} \in \mathbb{R}^d$ is the weight vector, and $b$ is the bias term. The model is trained by minimizing the weighted binary cross-entropy loss:
$$ \mathcal{L}(\mathbf{w}, b) = -\frac{1}{N}\sum_{i=1}^{N} w_i \left[ y_i \log \hat{p}_i + (1 - y_i)\log(1 - \hat{p}_i)\right] $$
where $w_i = \frac{N}{2 N^+}$ for positive samples ($y_i = 1$) and $w_i = \frac{N}{2 N^-}$ for negative samples ($y_i = 0$), implementing the balanced class-weight strategy that weights each class inversely to its frequency.
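As a quick check of these weights, scikit-learn's compute_class_weight utility returns the same values that class_weight="balanced" applies internally:

# Illustration: the "balanced" class weights w_k = N / (2 * N_k), computed by hand
# and via scikit-learn's utility (both should agree for the training labels).
from sklearn.utils.class_weight import compute_class_weight

n_pos, n_neg = int(y_train.sum()), int((y_train == 0).sum())
manual = {0: len(y_train) / (2 * n_neg), 1: len(y_train) / (2 * n_pos)}
sk = compute_class_weight(class_weight="balanced", classes=np.array([0, 1]), y=y_train)
print("manual:", manual)
print("sklearn:", dict(zip([0, 1], sk)))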
A.2 Gradient Boosted Trees (XGBoost)¶
XGBoost constructs an additive ensemble of $K$ decision trees:
$$ \hat{y}_i = \sum_{k=1}^{K} f_k(\mathbf{x}_i), \quad f_k \in \mathcal{F} $$
where $\mathcal{F}$ is the space of regression trees. Each tree $f_k$ is added greedily to minimize the regularized objective:
$$ \mathcal{L}^{(k)} = \sum_{i=1}^{N} \ell\left(y_i,\; \hat{y}_i^{(k-1)} + f_k(\mathbf{x}_i)\right) + \Omega(f_k) $$
with the regularization term:
$$ \Omega(f) = \gamma T + \frac{1}{2}\lambda \sum_{j=1}^{T} w_j^2 $$
where $T$ is the number of leaves and $w_j$ are the leaf weights. The split gain for a candidate split is:
$$ \text{Gain} = \frac{1}{2}\left[\frac{G_L^2}{H_L + \lambda} + \frac{G_R^2}{H_R + \lambda} - \frac{(G_L + G_R)^2}{H_L + H_R + \lambda}\right] - \gamma $$
where $G = \sum_{i \in I} g_i$ and $H = \sum_{i \in I} h_i$ are sums of first and second order gradients of the loss.
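A minimal sketch of how an imbalance-aware booster like the one loaded in Section 5 could be configured; the hyperparameters shown are illustrative placeholders, not the settings of the saved model:

# Sketch: fitting an XGBoost classifier with scale_pos_weight = N^- / N^+ to
# up-weight the minority (default) class. Hyperparameters are illustrative only;
# the notebook loads the pre-trained booster from ../models/xgboost.json instead.
spw = float((y_train == 0).sum()) / float((y_train == 1).sum())
xgb_sketch = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=spw,
    eval_metric="auc",
)
# xgb_sketch.fit(X_train, y_train)  # not run here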
A.3 Neural Network Architecture¶
The feedforward network maps input features to a default probability through a sequence of affine transformations and nonlinearities:
$$ \begin{aligned} \mathbf{h}_1 &= \text{Dropout}_{0.3}\left(\text{BN}\left(\text{ReLU}\left(\mathbf{W}_1 \mathbf{x} + \mathbf{b}_1\right)\right)\right) & \mathbf{W}_1 \in \mathbb{R}^{128 \times 22} \\ \mathbf{h}_2 &= \text{Dropout}_{0.2}\left(\text{BN}\left(\text{ReLU}\left(\mathbf{W}_2 \mathbf{h}_1 + \mathbf{b}_2\right)\right)\right) & \mathbf{W}_2 \in \mathbb{R}^{64 \times 128} \\ \mathbf{h}_3 &= \text{ReLU}\left(\mathbf{W}_3 \mathbf{h}_2 + \mathbf{b}_3\right) & \mathbf{W}_3 \in \mathbb{R}^{32 \times 64} \\ \hat{p} &= \sigma\left(\mathbf{w}_4^\top \mathbf{h}_3 + b_4\right) & \mathbf{w}_4 \in \mathbb{R}^{32} \end{aligned} $$
where $\text{BN}$ denotes batch normalization: $\hat{x}_j = \frac{x_j - \mu_\mathcal{B}}{\sqrt{\sigma_\mathcal{B}^2 + \epsilon}} \cdot \gamma_j + \beta_j$, with learned scale $\gamma_j$ and shift $\beta_j$ parameters.
The network is trained with weighted binary cross-entropy loss:
$$ \mathcal{L} = -\frac{1}{N}\sum_{i=1}^{N}\left[w^+ \cdot y_i \log \hat{p}_i + (1 - y_i)\log(1 - \hat{p}_i)\right] $$
where $w^+ = N^-/N^+$ compensates for class imbalance. We optimize with Adam ($\beta_1 = 0.9, \beta_2 = 0.999, \eta = 10^{-3}$) for 30 epochs with batch size 2048.
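In PyTorch this weighting is most directly expressed with BCEWithLogitsLoss and its pos_weight argument; a sketch of the loss and optimizer setup (the actual training loop lives in train.py):

# Sketch: weighted BCE loss with w+ = N^- / N^+ and the Adam settings described above.
# Illustrative only; the real training loop is in ../src/train.py.
pos_weight = torch.tensor([(y_train == 0).sum() / (y_train == 1).sum()], dtype=torch.float32)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(nn_model.parameters(), lr=1e-3, betas=(0.9, 0.999))
# One illustrative step on a mini-batch (xb, yb):
# loss = criterion(nn_model(xb).squeeze(1), yb.float())
# loss.backward(); optimizer.step(); optimizer.zero_grad()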
A.4 Evaluation Metrics¶
Area Under the ROC Curve (AUC): Measures the probability that a randomly chosen defaulter is scored higher than a randomly chosen non-defaulter:
$$ \text{AUC} = P(\hat{p}(X^+) > \hat{p}(X^-)) = \frac{\sum_{i: y_i=1}\sum_{j: y_j=0} \mathbb{1}[\hat{p}_i > \hat{p}_j]}{N^+ \cdot N^-} $$
Kolmogorov-Smirnov Statistic (KS): The maximum vertical distance between the cumulative distribution functions of scores for defaulters and non-defaulters:
$$ \text{KS} = \max_t \left|F_1(t) - F_0(t)\right| = \max_t \left|\text{TPR}(t) - \text{FPR}(t)\right| $$
where $F_1$ is the CDF of scores for the default class and $F_0$ for the non-default class. In credit scoring, KS > 0.4 is generally considered a strong model.
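As a sanity check on the ROC-based computation used in Section 5, the same statistic can be obtained from a two-sample comparison of the score distributions; scipy's ks_2samp computes $\max_t |F_1(t) - F_0(t)|$ directly and should agree up to ties:

# Cross-check: KS from the two score CDFs vs. the max(TPR - FPR) helper from Section 5.
from scipy.stats import ks_2samp

mask = np.asarray(y_test) == 1
ks_cdf = ks_2samp(xgb_probs[mask], xgb_probs[~mask]).statistic
ks_roc = ks_statistic(y_test, xgb_probs)
print(f"KS via two-sample CDFs: {ks_cdf:.4f} | KS via ROC: {ks_roc:.4f}")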
Gini Coefficient: A rescaled version of AUC that maps the [0.5, 1.0] range to [0, 1]:
$$ \text{Gini} = 2 \cdot \text{AUC} - 1 $$
Equivalently, it is twice the area between the Lorenz curve and the line of equality. For example, the XGBoost test AUC of 0.9219 corresponds to a Gini of $2 \times 0.9219 - 1 = 0.8438$. In credit scoring, Gini > 0.6 indicates good discriminative power.
A.5 Disparate Impact Analysis¶
The four-fifths rule, originally from EEOC employment guidelines and widely applied in fair lending, states that a selection procedure has adverse impact if the selection rate for any protected group is less than four-fifths (80%) of the rate for the group with the highest rate:
$$ \text{DI}_{g} = \frac{P(\text{Deny} \mid G = g)}{P(\text{Deny} \mid G = g_{\text{ref}})} \quad \Longrightarrow \quad \text{Flag if } \text{DI}_g < 0.8 \text{ or } \text{DI}_g > 1.25 $$
We use the median state denial rate as the reference group $g_{\text{ref}}$ to identify outlier states on both ends of the distribution. Note that disparate impact is a necessary but not sufficient condition for discrimination -- flagged disparities may be justified by legitimate business necessity if the underlying risk factors are demonstrably predictive and there is no less discriminatory alternative.