diff --git a/compute_ratings.py b/compute_ratings.py index eda23f2..88daa05 100644 --- a/compute_ratings.py +++ b/compute_ratings.py @@ -29,20 +29,8 @@ def load_games( team_id: str = "names", final_status: str | None = None, ) -> pd.DataFrame: - """ - Load input CSV (season_schedule.csv) into a cleaned DataFrame with consistent columns. - - Parameters: - - inp: CSV path to read - - team_id: 'names' or 'slugs' to identify teams - - final_status: if given, filter rows with status matching this (e.g. 'final') - - Returns: - DataFrame with columns Date, HomeTeam, AwayTeam, HomeRuns, AwayRuns, Margin, Result - """ df = pd.read_csv(inp) # Choose identifiers - # Determine team ID columns based on input param home_id_col = "home_name" if team_id == "names" else "home_slug" away_id_col = "away_name" if team_id == "names" else "away_slug" for c in [home_id_col, away_id_col, "home_runs", "away_runs"]: @@ -50,31 +38,27 @@ def load_games( raise ValueError(f"Missing required column: {c}") # Optional status filter (helps exclude postponed/canceled) - # Filter for final_status if provided to exclude e.g. postponed games if final_status is not None and "status" in df.columns: df = df[df["status"].astype(str).str.lower() == str(final_status).lower()] - # Convert run columns to numeric, drop rows with missing runs or teams + # Keep only games with numeric scores df = df.copy() df["home_runs"] = pd.to_numeric(df["home_runs"], errors="coerce") df["away_runs"] = pd.to_numeric(df["away_runs"], errors="coerce") df = df.dropna(subset=[home_id_col, away_id_col, "home_runs", "away_runs"]) - - # Parse datetime by combining date_local and time_local if possible + # Parse datetime (robust to missing either field) date = pd.to_datetime(df.get("date_local", pd.NaT), errors="coerce") time = pd.to_datetime(df.get("time_local", pd.NaT), errors="coerce").dt.time - + # Combine when possible dt = date if "time_local" in df.columns: - - # Build datetime where both date and time present + # build datetime only where both present dt = pd.to_datetime( date.dt.strftime("%Y-%m-%d").fillna("") + " " + pd.Series(time).astype(str).replace("NaT",""), errors="coerce" ) - # Construct cleaned DataFrame with fixed column names df_out = pd.DataFrame({ "Date": dt, "HomeTeam": df[home_id_col].astype(str), @@ -82,35 +66,19 @@ def load_games( "HomeRuns": df["home_runs"].astype(int), "AwayRuns": df["away_runs"].astype(int), }) - # Margin is difference in runs (home - away) df_out["Margin"] = df_out["HomeRuns"] - df_out["AwayRuns"] - # Result: 'H' if home win, 'A' if away win, 'T' for tie df_out["Result"] = np.where(df_out["HomeRuns"] > df_out["AwayRuns"], "H", np.where(df_out["HomeRuns"] < df_out["AwayRuns"], "A", "T")) return df_out.reset_index(drop=True) def aggregate_team_stats(df: pd.DataFrame) -> pd.DataFrame: - """ - Aggregate game-level data into team-level season stats: wins, losses, ties, runs scored, - runs allowed, games played, win percentage, and run differential. - - Parameters: - - df: DataFrame with game results - - Returns: - DataFrame indexed by Team with aggregated stats - """ - # Collect all team names from home and away columns teams = pd.Index(sorted(set(df["HomeTeam"]).union(df["AwayTeam"])), name="Team") - # Initialize stats DataFrame with W/L/T/RS/RA all zero stats = pd.DataFrame(index=teams, columns=["W","L","T","RS","RA"], data=0) for _, r in df.iterrows(): h, a = r["HomeTeam"], r["AwayTeam"] hr, ar = int(r["HomeRuns"]), int(r["AwayRuns"]) - # Update runs scored and allowed for both teams stats.at[h,"RS"] += hr; stats.at[h,"RA"] += ar stats.at[a,"RS"] += ar; stats.at[a,"RA"] += hr - # Update win/loss/tie counts if hr > ar: stats.at[h,"W"] += 1; stats.at[a,"L"] += 1 elif hr < ar: @@ -118,57 +86,22 @@ def aggregate_team_stats(df: pd.DataFrame) -> pd.DataFrame: else: stats.at[h,"T"] += 1; stats.at[a,"T"] += 1 stats = stats.astype(int) - # Games played stats["GP"] = stats["W"] + stats["L"] + stats["T"] - # Win percentage with ties counting as half a win stats["WinPct"] = (stats["W"] + 0.5 * stats["T"]) / stats["GP"].replace(0, np.nan) - # Run differential (runs scored - runs allowed) stats["RunDiff"] = stats["RS"] - stats["RA"] return stats.reset_index() def pythagorean(rs: pd.Series, ra: pd.Series, exp: float) -> pd.Series: - """ - Compute Pythagorean expectation for winning percentage: - RS^exp / (RS^exp + RA^exp), handling zero or missing runs. - - Parameters: - - rs: runs scored - - ra: runs allowed - - exp: exponent (typically ~1.83 for baseball) - - Returns: - Series of expected win percentages - """ rs = rs.clip(lower=0); ra = ra.clip(lower=0) num = np.power(rs, exp); den = num + np.power(ra, exp) with np.errstate(divide="ignore", invalid="ignore"): - - p = np.where(den > 0, num / den, 0.5) # handle zero denominator as 0.5 (neutral) + p = np.where(den > 0, num / den, 0.5) return pd.Series(p, index=rs.index) def estimate_home_field_runs(df: pd.DataFrame) -> float: - """ - Estimate home-field advantage in runs as the average margin (home_runs - away_runs). - Useful for adjusting rating systems to neutralize advantage. - - Returns: - Float average home-field runs advantage. - """ return float(df["Margin"].mean()) if len(df) else 0.0 def massey(df: pd.DataFrame, cap: float, subtract_home: bool) -> tuple[pd.Series, float]: - """ - Calculate Massey ratings (simple linear system) for teams using margins of victory. - Optionally caps margins and subtracts estimated home field runs. - - Parameters: - - df: games DataFrame with HomeTeam, AwayTeam, Margin columns - - cap: maximum absolute margin value to use (run cap) - - subtract_home: whether to subtract estimated home field runs advantage - - Returns: - Tuple of (ratings Series indexed by team, estimated home-run advantage float) - """ teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"])) idx = {t:i for i,t in enumerate(teams)} y = df["Margin"].astype(float).to_numpy() @@ -178,107 +111,48 @@ def massey(df: pd.DataFrame, cap: float, subtract_home: bool) -> tuple[pd.Series if subtract_home: y = y - h_est G, N = len(df), len(teams) - # Construct design matrix A with +1 for home, -1 for away per game, plus normalization row A = np.zeros((G+1, N), dtype=float) for r_i, r in enumerate(df.itertuples(index=False)): A[r_i, idx[r.HomeTeam]] = 1.0 A[r_i, idx[r.AwayTeam]] = -1.0 - # Normalize ratings sum to zero for uniqueness A[G, :] = 1.0 y_ext = np.concatenate([y, [0.0]]) - # Solve least squares for ratings vector r_sol, *_ = np.linalg.lstsq(A, y_ext, rcond=None) return pd.Series(r_sol, index=teams), (h_est if subtract_home else 0.0) def elo_expected(ra: float, rb: float) -> float: - """ - Compute Elo expected probability (expected score) for player A. - - Parameters: - - ra: rating of player A - - rb: rating of player B - - Returns: - Probability player A wins - """ return 1.0 / (1.0 + 10.0 ** (-(ra - rb) / 400.0)) def elo_once(df: pd.DataFrame, K: float, H: float, mcap: float, init: dict[str,float]) -> dict[str,float]: - """ - Perform one pass of Elo rating updates across the games in chronological order. - - Parameters: - - df: DataFrame with games (must have HomeTeam, AwayTeam, HomeRuns, AwayRuns) - - K: Elo K-factor (adjustment multiplier) - - H: home field bonus in points - - mcap: cap for margin of victory factor ln(|margin| + 1) - - init: dict of initial ratings by team - - Returns: - Updated dict of Elo ratings after processing games. - """ ratings = dict(init) for _, r in df.iterrows(): h, a = r["HomeTeam"], r["AwayTeam"] hr, ar = int(r["HomeRuns"]), int(r["AwayRuns"]) margin = hr - ar - # Calculate expected win probability for home team (with home advantage added) Eh = elo_expected(ratings[h] + H, ratings[a]) - # Actual game result scores (1 for win, 0 for loss, 0.5 tie) Sh, Sa = (1.0, 0.0) if hr > ar else ((0.0, 1.0) if hr < ar else (0.5, 0.5)) - # Margin factor based on logarithm of absolute margin plus one M = np.log(abs(margin) + 1.0) if mcap is not None: M = min(M, mcap) - # Elo rating update, scaled by margin factor and difference between actual and expected score ratings[h] += K * M * (Sh - Eh) ratings[a] += K * M * ((1.0 - Sh) - (1.0 - Eh)) return ratings def elo(df: pd.DataFrame, K=24.0, H=30.0, mcap=2.0, shuffles=20, seed=42) -> pd.Series: - """ - Compute Elo ratings averaged over multiple random shuffle orders of games - to reduce order dependency of sequential Elo updates. - - Parameters: - - df: games DataFrame sorted by Date - - K: Elo K-factor - - H: home field advantage bonus - - mcap: margin factor cap - - shuffles: number of random game orders to compute Elo over - - seed: RNG seed for reproducibility - - Returns: - Series of Elo ratings indexed by team - """ teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"])) - - base = {t: 1500.0 for t in teams} # initial Elo ratings + base = {t: 1500.0 for t in teams} df0 = df.sort_values(["Date"]).reset_index(drop=True) - # Elo with original date order (baseline) r_first = elo_once(df0, K, H, mcap, base) - # Initialize RNG rng = np.random.default_rng(seed) vals = {t: [r_first[t]] for t in teams} - # Compute Elo over randomized orderings for averaging for _ in range(max(0, shuffles-1)): idx = np.arange(len(df0)); rng.shuffle(idx) r = elo_once(df0.iloc[idx].reset_index(drop=True), K, H, mcap, base) for t in teams: vals[t].append(r[t]) - # Average ratings across runs for each team return pd.Series({t: float(np.mean(vals[t])) for t in teams}).sort_index() def zscore(s: pd.Series) -> pd.Series: - """ - Calculate z-score (standard score) for a pandas Series. - - Parameters: - - s: input Series - - Returns: - Series normalized to mean=0 and std=1; zeros if std=0. - """ mu, sd = s.mean(), s.std(ddof=0) return pd.Series(0.0, index=s.index) if (sd == 0 or np.isnan(sd)) else (s - mu) / sd @@ -302,68 +176,47 @@ def main( elo_shuffles: int = typer.Option(20, help="Random shuffles to average Elo"), elo_seed: int = typer.Option(42, help="RNG seed for shuffles") ): - """ - Main entry point: - - Loads input games, computes aggregate stats, Pythagorean expectation, - Massey ratings, Elo ratings (averaged over shuffles), Strength of Schedule, - and an overall CompositeRating combining these metrics. - - Outputs a CSV file with rankings and stats. - """ team_id = team_id.lower() - - # Load cleaned games DataFrame + # Load games games = load_games(inp, team_id=team_id, final_status=final_status) - - # Compute aggregated team-level statistics from games + # Aggregates team = aggregate_team_stats(games) - # Calculate Pythagorean expected winning percentage team["PythagoreanWinPct"] = pythagorean(team["RS"], team["RA"], pyexp) - - # Calculate Massey ratings and get estimated home field runs + # Ratings massey_r, h_runs = massey(games, cap=massey_cap, subtract_home=not no_massey_home_adj) - - # Calculate Strength of Schedule as average Massey rating of opponents + # Strength of schedule opps = {t: [] for t in massey_r.index} for _, r in games.iterrows(): opps[r["HomeTeam"]].append(r["AwayTeam"]) opps[r["AwayTeam"]].append(r["HomeTeam"]) sos_series = pd.Series({t: (float(massey_r[opps[t]].mean()) if opps[t] else 0.0) for t in opps}) - # Compute Elo ratings with multiple shuffles for stability elo_r = elo(games, K=elo_k, H=elo_home, mcap=elo_mcap, shuffles=elo_shuffles, seed=elo_seed) - - # Merge all metrics into a single DataFrame + # Merge out_df = team.set_index("Team") out_df["MasseyRating"] = massey_r out_df["EloRating"] = elo_r out_df["StrengthOfSchedule"] = sos_series - - # Composite rating: weighted Z-score combination of Massey, Elo, and Pythagorean + # Composite Z_r, Z_e, Z_p = zscore(out_df["MasseyRating"]), zscore(out_df["EloRating"]), zscore(out_df["PythagoreanWinPct"]) out_df["CompositeRating"] = 0.45*Z_r + 0.35*Z_e + 0.20*Z_p out_df = out_df.reset_index() - # Select columns and sort teams by CompositeRating descending out_df = out_df[[ "Team","GP","W","L","T","WinPct","RS","RA","RunDiff", "PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating" ]].sort_values("CompositeRating", ascending=False) - - # Round numeric columns for neatness + # Round for readability for c in ["WinPct","PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"]: out_df[c] = out_df[c].astype(float).round(5) - # Write to output CSV out_df.to_csv(out, index=False) - # Output summary info print(f"Done. Estimated home-field (runs) used in Massey: {h_runs:.3f}") print(f"Teams ranked: {len(out_df)} | Games processed: {len(games)}") print(f"Output -> {out}")