Remove redundant docstrings and clean comments in compute_ratings.py

- Simplify function docstrings for load_games, aggregate_team_stats, and others
- Keep key explanatory comments concise within code blocks
- Maintain overall code clarity while reducing verbosity
2025-08-28 15:18:06 -05:00
parent 8cea48457f
commit 6fe14eed59
1 changed file with 13 additions and 160 deletions
--- a/compute_ratings.py
+++ b/compute_ratings.py
@@ -29,20 +29,8 @@ def load_games(
    team_id: str = "names",
    final_status: str | None = None,
 ) -> pd.DataFrame:
    """
    Load input CSV (season_schedule.csv) into a cleaned DataFrame with consistent columns.
    Parameters:
    - inp: CSV path to read
    - team_id: 'names' or 'slugs' to identify teams
    - final_status: if given, filter rows with status matching this (e.g. 'final')
    Returns:
    DataFrame with columns Date, HomeTeam, AwayTeam, HomeRuns, AwayRuns, Margin, Result
    """
    df = pd.read_csv(inp)
    # Choose identifiers
    # Determine team ID columns based on input param
    home_id_col = "home_name" if team_id == "names" else "home_slug"
    away_id_col = "away_name" if team_id == "names" else "away_slug"
    for c in [home_id_col, away_id_col, "home_runs", "away_runs"]:
@@ -50,31 +38,27 @@ def load_games(
            raise ValueError(f"Missing required column: {c}")
    # Optional status filter (helps exclude postponed/canceled)
    # Filter for final_status if provided to exclude e.g. postponed games
    if final_status is not None and "status" in df.columns:
        df = df[df["status"].astype(str).str.lower() == str(final_status).lower()]
-    # Convert run columns to numeric, drop rows with missing runs or teams
+    # Keep only games with numeric scores
    df = df.copy()
    df["home_runs"] = pd.to_numeric(df["home_runs"], errors="coerce")
    df["away_runs"] = pd.to_numeric(df["away_runs"], errors="coerce")
    df = df.dropna(subset=[home_id_col, away_id_col, "home_runs", "away_runs"])
-
+    # Parse datetime (robust to missing either field)
    # Parse datetime by combining date_local and time_local if possible
    date = pd.to_datetime(df.get("date_local", pd.NaT), errors="coerce")
    time = pd.to_datetime(df.get("time_local", pd.NaT), errors="coerce").dt.time
-
+    # Combine when possible
    dt = date
    if "time_local" in df.columns:
-
+        # build datetime only where both present
        # Build datetime where both date and time present
        dt = pd.to_datetime(
            date.dt.strftime("%Y-%m-%d").fillna("") + " " +
            pd.Series(time).astype(str).replace("NaT",""),
            errors="coerce"
        )
    # Construct cleaned DataFrame with fixed column names
    df_out = pd.DataFrame({
        "Date": dt,
        "HomeTeam": df[home_id_col].astype(str),
@@ -82,35 +66,19 @@ def load_games(
        "HomeRuns": df["home_runs"].astype(int),
        "AwayRuns": df["away_runs"].astype(int),
    })
    # Margin is difference in runs (home - away)
    df_out["Margin"] = df_out["HomeRuns"] - df_out["AwayRuns"]
    # Result: 'H' if home win, 'A' if away win, 'T' for tie
    df_out["Result"] = np.where(df_out["HomeRuns"] > df_out["AwayRuns"], "H",
                         np.where(df_out["HomeRuns"] < df_out["AwayRuns"], "A", "T"))
    return df_out.reset_index(drop=True)
 def aggregate_team_stats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate game-level data into team-level season stats: wins, losses, ties, runs scored,
    runs allowed, games played, win percentage, and run differential.
    Parameters:
    - df: DataFrame with game results
    Returns:
    DataFrame indexed by Team with aggregated stats
    """
    # Collect all team names from home and away columns
    teams = pd.Index(sorted(set(df["HomeTeam"]).union(df["AwayTeam"])), name="Team")
    # Initialize stats DataFrame with W/L/T/RS/RA all zero
    stats = pd.DataFrame(index=teams, columns=["W","L","T","RS","RA"], data=0)
    for _, r in df.iterrows():
        h, a = r["HomeTeam"], r["AwayTeam"]
        hr, ar = int(r["HomeRuns"]), int(r["AwayRuns"])
        # Update runs scored and allowed for both teams
        stats.at[h,"RS"] += hr; stats.at[h,"RA"] += ar
        stats.at[a,"RS"] += ar; stats.at[a,"RA"] += hr
        # Update win/loss/tie counts
        if hr > ar:
            stats.at[h,"W"] += 1; stats.at[a,"L"] += 1
        elif hr < ar:
@@ -118,57 +86,22 @@ def aggregate_team_stats(df: pd.DataFrame) -> pd.DataFrame:
        else:
            stats.at[h,"T"] += 1; stats.at[a,"T"] += 1
    stats = stats.astype(int)
    # Games played
    stats["GP"] = stats["W"] + stats["L"] + stats["T"]
    # Win percentage with ties counting as half a win
    stats["WinPct"] = (stats["W"] + 0.5 * stats["T"]) / stats["GP"].replace(0, np.nan)
    # Run differential (runs scored - runs allowed)
    stats["RunDiff"] = stats["RS"] - stats["RA"]
    return stats.reset_index()
 def pythagorean(rs: pd.Series, ra: pd.Series, exp: float) -> pd.Series:
    """
    Compute Pythagorean expectation for winning percentage:
    RS^exp / (RS^exp + RA^exp), handling zero or missing runs.
    Parameters:
    - rs: runs scored
    - ra: runs allowed
    - exp: exponent (typically ~1.83 for baseball)
    Returns:
    Series of expected win percentages
    """
    rs = rs.clip(lower=0); ra = ra.clip(lower=0)
    num = np.power(rs, exp); den = num + np.power(ra, exp)
    with np.errstate(divide="ignore", invalid="ignore"):
-
+        p = np.where(den > 0, num / den, 0.5)
        p = np.where(den > 0, num / den, 0.5)  # handle zero denominator as 0.5 (neutral)
    return pd.Series(p, index=rs.index)
 def estimate_home_field_runs(df: pd.DataFrame) -> float:
    """
    Estimate home-field advantage in runs as the average margin (home_runs - away_runs).
    Useful for adjusting rating systems to neutralize advantage.
    Returns:
    Float average home-field runs advantage.
    """
    return float(df["Margin"].mean()) if len(df) else 0.0
 def massey(df: pd.DataFrame, cap: float, subtract_home: bool) -> tuple[pd.Series, float]:
    """
    Calculate Massey ratings (simple linear system) for teams using margins of victory.
    Optionally caps margins and subtracts estimated home field runs.
    Parameters:
    - df: games DataFrame with HomeTeam, AwayTeam, Margin columns
    - cap: maximum absolute margin value to use (run cap)
    - subtract_home: whether to subtract estimated home field runs advantage
    Returns:
    Tuple of (ratings Series indexed by team, estimated home-run advantage float)
    """
    teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
    idx = {t:i for i,t in enumerate(teams)}
    y = df["Margin"].astype(float).to_numpy()
@@ -178,107 +111,48 @@ def massey(df: pd.DataFrame, cap: float, subtract_home: bool) -> tuple[pd.Series
    if subtract_home:
        y = y - h_est
    G, N = len(df), len(teams)
    # Construct design matrix A with +1 for home, -1 for away per game, plus normalization row
    A = np.zeros((G+1, N), dtype=float)
    for r_i, r in enumerate(df.itertuples(index=False)):
        A[r_i, idx[r.HomeTeam]] = 1.0
        A[r_i, idx[r.AwayTeam]] = -1.0
    # Normalize ratings sum to zero for uniqueness
    A[G, :] = 1.0
    y_ext = np.concatenate([y, [0.0]])
    # Solve least squares for ratings vector
    r_sol, *_ = np.linalg.lstsq(A, y_ext, rcond=None)
    return pd.Series(r_sol, index=teams), (h_est if subtract_home else 0.0)
 def elo_expected(ra: float, rb: float) -> float:
    """
    Compute Elo expected probability (expected score) for player A.
    Parameters:
    - ra: rating of player A
    - rb: rating of player B
    Returns:
    Probability player A wins
    """
    return 1.0 / (1.0 + 10.0 ** (-(ra - rb) / 400.0))
 def elo_once(df: pd.DataFrame, K: float, H: float, mcap: float, init: dict[str,float]) -> dict[str,float]:
    """
    Perform one pass of Elo rating updates across the games in chronological order.
    Parameters:
    - df: DataFrame with games (must have HomeTeam, AwayTeam, HomeRuns, AwayRuns)
    - K: Elo K-factor (adjustment multiplier)
    - H: home field bonus in points
    - mcap: cap for margin of victory factor ln(|margin| + 1)
    - init: dict of initial ratings by team
    Returns:
    Updated dict of Elo ratings after processing games.
    """
    ratings = dict(init)
    for _, r in df.iterrows():
        h, a = r["HomeTeam"], r["AwayTeam"]
        hr, ar = int(r["HomeRuns"]), int(r["AwayRuns"])
        margin = hr - ar
        # Calculate expected win probability for home team (with home advantage added)
        Eh = elo_expected(ratings[h] + H, ratings[a])
        # Actual game result scores (1 for win, 0 for loss, 0.5 tie)
        Sh, Sa = (1.0, 0.0) if hr > ar else ((0.0, 1.0) if hr < ar else (0.5, 0.5))
        # Margin factor based on logarithm of absolute margin plus one
        M = np.log(abs(margin) + 1.0)
        if mcap is not None:
            M = min(M, mcap)
        # Elo rating update, scaled by margin factor and difference between actual and expected score
        ratings[h] += K * M * (Sh - Eh)
        ratings[a] += K * M * ((1.0 - Sh) - (1.0 - Eh))
    return ratings
 def elo(df: pd.DataFrame, K=24.0, H=30.0, mcap=2.0, shuffles=20, seed=42) -> pd.Series:
    """
    Compute Elo ratings averaged over multiple random shuffle orders of games
    to reduce order dependency of sequential Elo updates.
    Parameters:
    - df: games DataFrame sorted by Date
    - K: Elo K-factor
    - H: home field advantage bonus
    - mcap: margin factor cap
    - shuffles: number of random game orders to compute Elo over
    - seed: RNG seed for reproducibility
    Returns:
    Series of Elo ratings indexed by team
    """
    teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
-
+    base = {t: 1500.0 for t in teams}
    base = {t: 1500.0 for t in teams}  # initial Elo ratings
    df0 = df.sort_values(["Date"]).reset_index(drop=True)
    # Elo with original date order (baseline)
    r_first = elo_once(df0, K, H, mcap, base)
    # Initialize RNG
    rng = np.random.default_rng(seed)
    vals = {t: [r_first[t]] for t in teams}
    # Compute Elo over randomized orderings for averaging
    for _ in range(max(0, shuffles-1)):
        idx = np.arange(len(df0)); rng.shuffle(idx)
        r = elo_once(df0.iloc[idx].reset_index(drop=True), K, H, mcap, base)
        for t in teams:
            vals[t].append(r[t])
    # Average ratings across runs for each team
    return pd.Series({t: float(np.mean(vals[t])) for t in teams}).sort_index()
 def zscore(s: pd.Series) -> pd.Series:
    """
    Calculate z-score (standard score) for a pandas Series.
    Parameters:
    - s: input Series
    Returns:
    Series normalized to mean=0 and std=1; zeros if std=0.
    """
    mu, sd = s.mean(), s.std(ddof=0)
    return pd.Series(0.0, index=s.index) if (sd == 0 or np.isnan(sd)) else (s - mu) / sd
@@ -302,68 +176,47 @@ def main(
    elo_shuffles: int = typer.Option(20, help="Random shuffles to average Elo"),
    elo_seed: int = typer.Option(42, help="RNG seed for shuffles")
 ):
    """
    Main entry point:
    Loads input games, computes aggregate stats, Pythagorean expectation,
    Massey ratings, Elo ratings (averaged over shuffles), Strength of Schedule,
    and an overall CompositeRating combining these metrics.
    Outputs a CSV file with rankings and stats.
    """
    team_id = team_id.lower()
-
+    # Load games
    # Load cleaned games DataFrame
    games = load_games(inp, team_id=team_id, final_status=final_status)
-
+    # Aggregates
    # Compute aggregated team-level statistics from games
    team = aggregate_team_stats(games)
    # Calculate Pythagorean expected winning percentage
    team["PythagoreanWinPct"] = pythagorean(team["RS"], team["RA"], pyexp)
-
+    # Ratings
    # Calculate Massey ratings and get estimated home field runs
    massey_r, h_runs = massey(games, cap=massey_cap, subtract_home=not no_massey_home_adj)
-
+    # Strength of schedule
    # Calculate Strength of Schedule as average Massey rating of opponents
    opps = {t: [] for t in massey_r.index}
    for _, r in games.iterrows():
        opps[r["HomeTeam"]].append(r["AwayTeam"])
        opps[r["AwayTeam"]].append(r["HomeTeam"])
    sos_series = pd.Series({t: (float(massey_r[opps[t]].mean()) if opps[t] else 0.0) for t in opps})
    # Compute Elo ratings with multiple shuffles for stability
    elo_r = elo(games, K=elo_k, H=elo_home, mcap=elo_mcap, shuffles=elo_shuffles, seed=elo_seed)
-
+    # Merge
    # Merge all metrics into a single DataFrame
    out_df = team.set_index("Team")
    out_df["MasseyRating"] = massey_r
    out_df["EloRating"] = elo_r
    out_df["StrengthOfSchedule"] = sos_series
-
+    # Composite
    # Composite rating: weighted Z-score combination of Massey, Elo, and Pythagorean
    Z_r, Z_e, Z_p = zscore(out_df["MasseyRating"]), zscore(out_df["EloRating"]), zscore(out_df["PythagoreanWinPct"])
    out_df["CompositeRating"] = 0.45*Z_r + 0.35*Z_e + 0.20*Z_p
    out_df = out_df.reset_index()
    # Select columns and sort teams by CompositeRating descending
    out_df = out_df[[
        "Team","GP","W","L","T","WinPct","RS","RA","RunDiff",
        "PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"
    ]].sort_values("CompositeRating", ascending=False)
-
+    # Round for readability
    # Round numeric columns for neatness
    for c in ["WinPct","PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"]:
        out_df[c] = out_df[c].astype(float).round(5)
    # Write to output CSV
    out_df.to_csv(out, index=False)
    # Output summary info
    print(f"Done. Estimated home-field (runs) used in Massey: {h_runs:.3f}")
    print(f"Teams ranked: {len(out_df)} | Games processed: {len(games)}")
    print(f"Output -> {out}")