From 03f87c205b51006614809c21bb1f6ea7bb91e8d9 Mon Sep 17 00:00:00 2001
From: Anthony Correa
Date: Thu, 28 Aug 2025 15:16:32 -0500
Subject: [PATCH] Improve schedule scraping and rating computation with better
 docs

- Add detailed docstrings and comments to build_season_schedule.py
  scraping/parsing functions
- Enhance compute_ratings.py with clearer parameter docs and refined data
  handling
- Improve Elo calculation stability and add composite rating output with
  explained metrics
---
 build_season_schedule.py | 116 +++++++++++++++++--------
 compute_ratings.py       | 179 +++++++++++++++++++++++++++++++++++----
 2 files changed, 246 insertions(+), 49 deletions(-)

diff --git a/build_season_schedule.py b/build_season_schedule.py
index 6ddad97..2acaf43 100644
--- a/build_season_schedule.py
+++ b/build_season_schedule.py
@@ -41,24 +41,31 @@ logging.basicConfig(

 # ----------------- constants -----------------
 UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36"
-HEADERS = {"User-Agent": UA}
-PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}"
-GAME_BASE = "https://www.csyba.com/game/show/{gid}"
+HEADERS = {"User-Agent": UA}  # HTTP headers with a custom User-Agent for all requests
+PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}"  # base URL for a team-instance printable schedule
+GAME_BASE = "https://www.csyba.com/game/show/{gid}"  # base URL for a game detail page

+# Regular expressions for parsing scores, game links, and time strings
 SCORE_RE = re.compile(r"\b(\d+)\s*[–-]\s*(\d+)\b")
 GAME_LINK_RE = re.compile(r"/game/show/(\d+)")
 TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I)

 # ----------------- helpers -----------------
 def clean(x: str) -> str:
+    """Collapse runs of whitespace and strip the input string."""
     return re.sub(r"\s+", " ", (x or "")).strip()

 def slugify(s: str) -> str:
+    """Convert a string to a lowercase, hyphen-separated slug."""
     s = s.lower()
     s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
     return s

 def norm_name(s: str) -> str:
+    """
+    Normalize a team name for loose matching: lowercase it, strip punctuation,
+    and drop common filler words such as 'the', 'club', and 'team'.
+    """
     s = s.lower()
     s = re.sub(r"[^a-z0-9 ]+", " ", s)
     s = re.sub(r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b", " ", s)
@@ -67,6 +74,7 @@

 @dataclass(frozen=True)
 class TeamRec:
+    """Team record carrying the identifiers needed for lookups and URLs."""
     name: str
     slug: str
     team_id: str
@@ -74,7 +82,10 @@ class TeamRec:
     subseason_id: str

 def load_teams(teams_path: str):
-    """Load mapping tables from teams.json you provided."""
+    """
+    Load team mapping data from a JSON file.
+    Returns dictionaries keyed by instance_id, slug, and normalized name for lookups.
+    """
     with open(teams_path, "r", encoding="utf-8") as f:
         arr = json.load(f)
     by_instance: Dict[str, TeamRec] = {}
@@ -94,7 +105,11 @@ def load_teams(teams_path: str):
     return by_instance, by_slug, by_norm

 def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:
-    """Match opponent using slug first, then normalized name, then loose containment."""
+    """
+    Attempt to match an opponent name to a known team record.
+    Tries an exact slug match first, then an exact normalized-name match,
+    then loose containment matching on normalized names.
+    """
     s = slugify(opponent_text)
     if s in by_slug:
         return by_slug[s]
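+
+    # Illustrative sketch of the matching cascade (the team name below is made
+    # up for the example, not taken from teams.json):
+    #   best_match_team("River City Hawks", by_slug, by_norm)
+    #     1. slug lookup:       slugify(...) -> "river-city-hawks" in by_slug?
+    #     2. normalized lookup: norm_name(...) -> "river city hawks" in by_norm?
+    #     3. containment:       fall back to substring matches between normalized names.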
@@ -108,8 +123,8 @@ def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:

 def runs_from_team_pov(result_flag: str, s_a: str, s_b: str):
     """
-    Team-instance pages are TEAM-FIRST. s_a is THIS team's runs, s_b is opponent runs.
-    We don't reorder; we only validate with W/L/T if needed.
+    Parse runs for this team (s_a) and its opponent (s_b), assuming team-first order.
+    The scores are never reordered; the W/L/T result_flag is only used for validation.
     """
     if not (s_a.isdigit() and s_b.isdigit()):
         return None, None
@@ -122,6 +137,10 @@ def runs_from_team_pov(result_flag: str, s_a: str, s_b: str):

 # ----------------- HTTP utils -----------------
 def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]:
+    """
+    Fetch a URL and return a parsed BeautifulSoup document.
+    Reuses the provided requests.Session when given.
+    """
     try:
         sess = session or requests.Session()
         r = sess.get(url, headers=HEADERS, timeout=timeout)
@@ -133,7 +152,10 @@ def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int

 # ----------------- scraping -----------------
 def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]:
-    """Parse one team-instance printable schedule page into perspective rows."""
+    """
+    Download and parse a team-instance printable schedule page,
+    extracting a list of game dictionaries from that team's perspective.
+    """
     url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({
         "schedule_type": "index",
         "subseason": subseason_id,
     })
         return []

     games = []
+    # Skip the header row; iterate over game rows
     for row_idx, tr in enumerate(table.select("tr")[1:], start=1):
         tds = tr.select("td")
         if len(tds) < 5:
             continue

-        # Cells: Date | Result | Opponent | Location | Status
+        # Extract text from each relevant cell:
+        # Date | Result | Opponent | Location | Status
         date_txt = clean(tds[0].get_text(" "))
         result_txt = clean(tds[1].get_text(" "))
         opp_txt = clean(tds[2].get_text(" "))
         loc_txt = clean(tds[3].get_text(" "))
         status_txt = clean(tds[4].get_text(" "))

-        # Date → ISO
+        # Parse the date into ISO format (YYYY-MM-DD) when possible
         try:
             date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
         except Exception:
-            date_iso = date_txt
+            date_iso = date_txt  # leave raw if parsing fails

-        # Pull a game_id if present (from any link in the row)
+        # Find the game ID from any /game/show/ link in the row, if present
         game_id = ""
         for a in tr.select("a[href]"):
             m = GAME_LINK_RE.search(a.get("href", ""))
             if m:
                 game_id = m.group(1)
                 break

-        # Extract W/L/T (Result cell)
+        # Extract the W/L/T indicator from the Result cell
         m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
         result_flag = m_res.group(1).upper() if m_res else ""

-        # Extract score from Result cell; if missing, also try Opponent cell
+        # Extract numeric scores from the Result cell, falling back to the Opponent cell
         m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt)
         s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "")

-        # Opponent + home/away flag
+        # Determine whether the game is away from the '@' prefix in the opponent cell
         is_away = opp_txt.startswith("@")
         opponent_name = opp_txt.lstrip("@").strip()

-        # Compute team/opp runs (TEAM-FIRST orientation)
+        # Convert scores to integers with team-first orientation
         team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b)

         logging.debug(
@@ -214,10 +238,9 @@

 def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
     """
-    Fetch the game's local start time from the /game/show/ page.
-    Looks inside the tab with id 'tab_boxscores_content' but also
-    falls back to scanning the page for common time patterns.
-    Returns a zero-padded 24h 'HH:MM' string or None if unavailable.
+    Fetch the start time of a game from its detail page.
+    Looks inside the boxscores tab, falling back to scanning page text for time patterns.
+    Returns a zero-padded 24-hour 'HH:MM' string, or None if no time is found.
     """
     if not game_id:
         return None
@@ -226,13 +249,13 @@ def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
     if not soup:
         return None

-    # Prefer the boxscores tab content
+    # Prefer the boxscores tab content when searching for a time string
     box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content")
     text = ""
     if box:
         text = " ".join(box.stripped_strings)
     else:
-        # Fall back to page-wide text (but avoid pulling too much)
+        # Fall back to the main page text, capped in length to limit processing
        main = soup.select_one("div.page") or soup
        text = " ".join((main.get_text(" ", strip=True) or "")[:4000].split())

@@ -244,21 +267,19 @@ def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
         hhmm = m.group(1)
         ampm = (m.group(2) or "").lower().replace(".", "")
         try:
-            # Normalize to 24h HH:MM
+            # Normalize the time to 24-hour HH:MM
             from datetime import datetime
             if ampm:
                 dt = datetime.strptime(f"{hhmm} {ampm.upper()}", "%I:%M %p")
             else:
-                # already 24h-ish
                 dt = datetime.strptime(hhmm, "%H:%M")
             return dt.strftime("%H:%M")
         except Exception:
-            # Be forgiving (e.g., "6:00pm" without space)
+            # Be forgiving and retry with the time and am/pm joined without a space
             try:
                 from datetime import datetime
-                hhmm2 = hhmm
                 if ampm:
-                    dt = datetime.strptime(f"{hhmm2}{ampm}", "%I:%M%p")
+                    dt = datetime.strptime(f"{hhmm}{ampm}", "%I:%M%p")
                     return dt.strftime("%H:%M")
             except Exception:
                 logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}")
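+
+            # Quick sketch of the normalization above (values are illustrative):
+            #   hhmm="6:00",  ampm="pm" -> strptime("6:00 PM", "%I:%M %p") -> "18:00"
+            #   hhmm="18:00", ampm=""   -> strptime("18:00", "%H:%M")      -> "18:00"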
@@ -272,25 +293,34 @@ def main(
     fetch_time: bool = typer.Option(False, help="Fetch game time from /game/show/"),
     sleep: float = typer.Option(0.35, help="Delay between requests (seconds)")
 ):
+    """
+    Scrape schedules for all teams, merge the per-team perspectives,
+    deduplicate entries (primarily by game_id), and write a consolidated CSV.
+    Optionally fetches the start time of each game.
+    """
+    # Load team data and lookup indexes
     by_instance, by_slug, by_norm = load_teams(teams)
     instance_ids = sorted(by_instance.keys())

+    # Requests session with custom headers
     session = requests.Session()
     session.headers.update(HEADERS)

-    # Scrape all teams
+    # Scrape every team-instance printable schedule
     raw: List[dict] = []
     for i, iid in enumerate(instance_ids, 1):
         logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}")
         raw.extend(parse_printable(iid, subseason, session=session))
         time.sleep(sleep)  # be polite

+    # Helper lookups for team records
     def rec_from_instance(iid: str) -> Optional[TeamRec]:
         return by_instance.get(iid)

     def match_opponent(text: str) -> Optional[TeamRec]:
         return best_match_team(text, by_slug, by_norm)

+    # Dedupe buckets keyed by game_id, with composite fallback keys
     buckets: Dict[str, dict] = {}
     fallback_rows = 0
@@ -313,6 +343,7 @@
             key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}"
             fallback_rows += 1

+        # Store this team's perspective of the game
         perspective = {
             "team": team_rec,
             "opp": opp_rec,  # may be None
@@ -336,42 +367,54 @@
         logging.info(f"Used fallback dedupe for {fallback_rows} rows without game_id.")

     out_rows = []
-    time_cache: Dict[str, Optional[str]] = {}
+    time_cache: Dict[str, Optional[str]] = {}  # cache game times to avoid re-fetching

+    # Merge perspectives and produce consolidated rows
     for key, bucket in buckets.items():
         p = bucket["persp"]
         date = p[0]["date"]
         game_id = bucket.get("game_id", "")

+        # Identify the home and away perspectives when available
         p_home = next((x for x in p if x["is_away"] is False), None)
         p_away = next((x for x in p if x["is_away"] is True), None)

+        # Home is the team that is not away; otherwise fall back to the away team's opponent
         home_team = (p_home["team"] if p_home else (p_away["opp"] if p_away else None))
         away_team = (p_away["team"] if p_away else (p_home["opp"] if p_home else None))

         def pack_team(rec: Optional[TeamRec], fallback_slug: str):
+            """Pack a team record into a tuple, or fall back to slug-derived defaults."""
             if rec:
                 return rec.slug, rec.instance_id, rec.team_id, rec.name
             return fallback_slug, "", "", fallback_slug.replace("-", " ").title()

+        # Prefer runs from the home perspective
         home_runs = away_runs = None
         if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int):
             home_runs = p_home["team_runs"]
             away_runs = p_home["opp_runs"]
+        # Otherwise take the away perspective with the runs swapped
         elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int):
             away_runs = p_away["team_runs"]
             home_runs = p_away["opp_runs"]

+        # If runs are still missing, infer them from the first perspective via is_away
         if (home_runs is None or away_runs is None) and p:
             one = p[0]
             if isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int):
                 if one["is_away"]:
-                    away_runs = one["team_runs"]; home_runs = one["opp_runs"]
-                    away_team = one["team"]; home_team = one["opp"] if one["opp"] else home_team
+                    away_runs = one["team_runs"]
+                    home_runs = one["opp_runs"]
+                    away_team = one["team"]
+                    home_team = one["opp"] if one["opp"] else home_team
                 else:
-                    home_runs = one["team_runs"]; away_runs = one["opp_runs"]
-                    home_team = one["team"]; away_team = one["opp"] if one["opp"] else away_team
+                    home_runs = one["team_runs"]
+                    away_runs = one["opp_runs"]
+                    home_team = one["team"]
+                    away_team = one["opp"] if one["opp"] else away_team

+        # Fallback guesses for the home and away slugs when team data is missing
         guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"]
                                else p_away["opp"].slug if p_away and p_away["opp"]
                                else p[0]["pair"][0])
@@ -382,6 +425,7 @@ def main(
         home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback)
         away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback)

+        # Determine winner and loser slugs from the runs
         winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = ""
         if isinstance(home_runs, int) and isinstance(away_runs, int):
             if home_runs > away_runs:
@@ -391,10 +435,12 @@
                 winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id
                 loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id

+        # Consolidate location and status from the home or away perspective
         loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "")
         status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "")
         source_urls = sorted({x["source_url"] for x in p})

+        # Optionally fetch the game start time
         time_local = ""
         if fetch_time and game_id:
             if game_id in time_cache:
@@ -403,6 +449,7 @@
                 logging.debug(f"TIME: fetching game {game_id}")
                 tval = fetch_game_time(game_id, session=session)
                 time_cache[game_id] = tval
+                # If no time was found, wait a little longer before the next request, to be polite
                 if tval is None:
                     time.sleep(min(sleep * 2, 1.0))
             if tval:
@@ -413,6 +460,7 @@
             f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}"
         )

+        # Append the consolidated game record for CSV output
         out_rows.append({
             "date_local": date,
             "time_local": time_local,
@@ -431,6 +479,7 @@
         logging.warning("No games produced.")
         return

+    # CSV output columns
     fieldnames = [
         "date_local","time_local",
         "home_slug","home_instance","home_id","home_name",
         "away_slug","away_instance","away_id","away_name",
         "home_runs","away_runs",
         "winner_slug","winner_instance","winner_id",
         "loser_slug","loser_instance","loser_id",
         "location","status","game_id","source_urls",
     ]
+    # Write the consolidated game data to CSV
     with open(out, "w", newline="", encoding="utf-8") as f:
         w = csv.DictWriter(f, fieldnames=fieldnames)
         w.writeheader()

diff --git a/compute_ratings.py b/compute_ratings.py
index f35731d..4615a42 100644
--- a/compute_ratings.py
+++ b/compute_ratings.py
@@ -29,36 +29,53 @@ def load_games(
     team_id: str = "names",
     final_status: str | None = None,
 ) -> pd.DataFrame:
+    """
+    Load the input CSV (season_schedule.csv) into a cleaned DataFrame with consistent columns.
+
+    Parameters:
+    - inp: path of the CSV to read
+    - team_id: 'names' or 'slugs', selecting how teams are identified
+    - final_status: if given, keep only rows whose status matches (e.g. 'final')
+
+    Returns:
+    DataFrame with columns Date, HomeTeam, AwayTeam, HomeRuns, AwayRuns, Margin, Result
+    """
     df = pd.read_csv(inp)

-    # Choose identifiers
+    # Determine the team identifier columns from the team_id parameter
     home_id_col = "home_name" if team_id == "names" else "home_slug"
     away_id_col = "away_name" if team_id == "names" else "away_slug"
     for c in [home_id_col, away_id_col, "home_runs", "away_runs"]:
         if c not in df.columns:
             raise ValueError(f"Missing required column: {c}")

-    # Optional status filter (helps exclude postponed/canceled)
+    # Filter on final_status if provided, e.g. to exclude postponed games
     if final_status is not None and "status" in df.columns:
         df = df[df["status"].astype(str).str.lower() == str(final_status).lower()]

-    # Keep only games with numeric scores
+    # Convert run columns to numeric and drop rows missing runs or team identifiers
     df = df.copy()
     df["home_runs"] = pd.to_numeric(df["home_runs"], errors="coerce")
     df["away_runs"] = pd.to_numeric(df["away_runs"], errors="coerce")
     df = df.dropna(subset=[home_id_col, away_id_col, "home_runs", "away_runs"])

-    # Parse datetime (robust to missing either field)
+    # Parse the datetime by combining date_local and time_local when possible
     date = pd.to_datetime(df.get("date_local", pd.NaT), errors="coerce")
     time = pd.to_datetime(df.get("time_local", pd.NaT), errors="coerce").dt.time

-    # Combine when possible
+    dt = date
     if "time_local" in df.columns:
-        # build datetime only where both present
+        # Build the full datetime where both date and time are present
         dt = pd.to_datetime(
             date.dt.strftime("%Y-%m-%d").fillna("") + " " +
             pd.Series(time).astype(str).replace("NaT",""),
             errors="coerce"
         )

+    # Construct a cleaned DataFrame with fixed column names
     df_out = pd.DataFrame({
         "Date": dt,
         "HomeTeam": df[home_id_col].astype(str),
@@ -66,19 +83,35 @@ def load_games(
         "HomeRuns": df["home_runs"].astype(int),
         "AwayRuns": df["away_runs"].astype(int),
     })
+    # Margin is the run difference (home - away)
     df_out["Margin"] = df_out["HomeRuns"] - df_out["AwayRuns"]
+    # Result: 'H' for a home win, 'A' for an away win, 'T' for a tie
     df_out["Result"] = np.where(df_out["HomeRuns"] > df_out["AwayRuns"], "H",
                          np.where(df_out["HomeRuns"] < df_out["AwayRuns"], "A", "T"))
     return df_out.reset_index(drop=True)

 def aggregate_team_stats(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Aggregate game-level data into team-level season stats: wins, losses, ties,
+    runs scored, runs allowed, games played, win percentage, and run differential.
+
+    Parameters:
+    - df: DataFrame of game results
+
+    Returns:
+    DataFrame indexed by Team with aggregated stats
+    """
+    # Collect all team names from the home and away columns
     teams = pd.Index(sorted(set(df["HomeTeam"]).union(df["AwayTeam"])), name="Team")
+    # Initialize the stats DataFrame with W/L/T/RS/RA all zero
     stats = pd.DataFrame(index=teams, columns=["W","L","T","RS","RA"], data=0)
     for _, r in df.iterrows():
         h, a = r["HomeTeam"], r["AwayTeam"]
         hr, ar = int(r["HomeRuns"]), int(r["AwayRuns"])
+        # Update runs scored and allowed for both teams
         stats.at[h,"RS"] += hr; stats.at[h,"RA"] += ar
         stats.at[a,"RS"] += ar; stats.at[a,"RA"] += hr
+        # Update win/loss/tie counts
         if hr > ar:
             stats.at[h,"W"] += 1; stats.at[a,"L"] += 1
         elif hr < ar:
@@ -86,22 +119,57 @@ def aggregate_team_stats(df: pd.DataFrame) -> pd.DataFrame:
         else:
             stats.at[h,"T"] += 1; stats.at[a,"T"] += 1
     stats = stats.astype(int)
+    # Games played
     stats["GP"] = stats["W"] + stats["L"] + stats["T"]
+    # Win percentage, counting ties as half a win
     stats["WinPct"] = (stats["W"] + 0.5 * stats["T"]) / stats["GP"].replace(0, np.nan)
+    # Run differential (runs scored minus runs allowed)
     stats["RunDiff"] = stats["RS"] - stats["RA"]
     return stats.reset_index()
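+
+# Worked example of the win-percentage convention above (numbers are illustrative):
+#   a team at 5 W, 3 L, 2 T has GP = 10 and WinPct = (5 + 0.5 * 2) / 10 = 0.600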

 def pythagorean(rs: pd.Series, ra: pd.Series, exp: float) -> pd.Series:
+    """
+    Compute the Pythagorean expectation for winning percentage,
+    RS^exp / (RS^exp + RA^exp), handling zero or missing runs.
+
+    Parameters:
+    - rs: runs scored
+    - ra: runs allowed
+    - exp: exponent (typically ~1.83 for baseball)
+
+    Returns:
+    Series of expected win percentages
+    """
     rs = rs.clip(lower=0); ra = ra.clip(lower=0)
     num = np.power(rs, exp); den = num + np.power(ra, exp)
     with np.errstate(divide="ignore", invalid="ignore"):
-        p = np.where(den > 0, num / den, 0.5)
+        p = np.where(den > 0, num / den, 0.5)  # treat a zero denominator as neutral (0.5)
     return pd.Series(p, index=rs.index)

 def estimate_home_field_runs(df: pd.DataFrame) -> float:
+    """
+    Estimate the home-field advantage in runs as the average margin (home runs - away runs).
+    Useful for neutralizing that advantage in the rating systems.
+
+    Returns:
+    Average home-field run advantage as a float.
+    """
     return float(df["Margin"].mean()) if len(df) else 0.0

 def massey(df: pd.DataFrame, cap: float, subtract_home: bool) -> tuple[pd.Series, float]:
+    """
+    Compute Massey ratings (a simple linear system) from margins of victory.
+    Optionally caps margins and subtracts the estimated home-field run advantage.
+
+    Parameters:
+    - df: games DataFrame with HomeTeam, AwayTeam, Margin columns
+    - cap: maximum absolute margin to use (run cap)
+    - subtract_home: whether to subtract the estimated home-field run advantage
+
+    Returns:
+    Tuple of (ratings Series indexed by team, estimated home-field advantage as a float)
+    """
     teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
     idx = {t:i for i,t in enumerate(teams)}
     y = df["Margin"].astype(float).to_numpy()
@@ -111,48 +179,107 @@ def massey(df: pd.DataFrame, cap: float, subtract_home: bool) -> tuple[pd.Series
     if subtract_home:
         y = y - h_est
     G, N = len(df), len(teams)
+    # Build the design matrix A: +1 for the home team and -1 for the away team
+    # in each game row, plus a final normalization row
     A = np.zeros((G+1, N), dtype=float)
     for r_i, r in enumerate(df.itertuples(index=False)):
         A[r_i, idx[r.HomeTeam]] = 1.0
         A[r_i, idx[r.AwayTeam]] = -1.0
+    # Pin the ratings to sum to zero so the solution is unique
     A[G, :] = 1.0
     y_ext = np.concatenate([y, [0.0]])
+    # Solve the least-squares system for the ratings vector
     r_sol, *_ = np.linalg.lstsq(A, y_ext, rcond=None)
     return pd.Series(r_sol, index=teams), (h_est if subtract_home else 0.0)
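+
+# Tiny worked example of the linear system (teams and margins are made up,
+# home adjustment ignored): A beats B by 3 at home, B beats C by 1 at home.
+#   rows:  r_A - r_B = 3,  r_B - r_C = 1,  and  r_A + r_B + r_C = 0
+#   least squares gives the exact solution r_A = 7/3, r_B = -2/3, r_C = -5/3:
+#   the rating differences reproduce the margins, and the ratings sum to zero.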

 def elo_expected(ra: float, rb: float) -> float:
+    """
+    Compute the Elo expected score for player A.
+
+    Parameters:
+    - ra: rating of player A
+    - rb: rating of player B
+
+    Returns:
+    Expected score for player A (win probability, with a tie counting as half)
+    """
     return 1.0 / (1.0 + 10.0 ** (-(ra - rb) / 400.0))

 def elo_once(df: pd.DataFrame, K: float, H: float, mcap: float, init: dict[str,float]) -> dict[str,float]:
+    """
+    Run one pass of Elo rating updates over the games in the order given.
+
+    Parameters:
+    - df: DataFrame of games (must have HomeTeam, AwayTeam, HomeRuns, AwayRuns)
+    - K: Elo K-factor (update multiplier)
+    - H: home-field bonus in rating points
+    - mcap: cap on the margin-of-victory factor ln(|margin| + 1)
+    - init: dict of initial ratings by team
+
+    Returns:
+    Updated dict of Elo ratings after processing the games.
+    """
     ratings = dict(init)
     for _, r in df.iterrows():
         h, a = r["HomeTeam"], r["AwayTeam"]
         hr, ar = int(r["HomeRuns"]), int(r["AwayRuns"])
         margin = hr - ar
+        # Expected score for the home team, with the home bonus applied
         Eh = elo_expected(ratings[h] + H, ratings[a])
+        # Actual scores: 1 for a win, 0 for a loss, 0.5 each for a tie
         Sh, Sa = (1.0, 0.0) if hr > ar else ((0.0, 1.0) if hr < ar else (0.5, 0.5))
+        # Margin factor: logarithm of the absolute margin plus one, optionally capped
         M = np.log(abs(margin) + 1.0)
         if mcap is not None:
             M = min(M, mcap)
+        # Update both ratings, scaled by the margin factor and the
+        # difference between actual and expected score
         ratings[h] += K * M * (Sh - Eh)
         ratings[a] += K * M * ((1.0 - Sh) - (1.0 - Eh))
     return ratings

 def elo(df: pd.DataFrame, K=24.0, H=30.0, mcap=2.0, shuffles=20, seed=42) -> pd.Series:
+    """
+    Compute Elo ratings averaged over multiple random orderings of the games
+    to reduce the order dependence of sequential Elo updates.
+
+    Parameters:
+    - df: games DataFrame sorted by Date
+    - K: Elo K-factor
+    - H: home-field bonus in rating points
+    - mcap: cap on the margin factor
+    - shuffles: number of game orderings to average over
+    - seed: RNG seed for reproducibility
+
+    Returns:
+    Series of Elo ratings indexed by team
+    """
     teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
-    base = {t: 1500.0 for t in teams}
+    base = {t: 1500.0 for t in teams}  # initial Elo ratings
     df0 = df.sort_values(["Date"]).reset_index(drop=True)
+    # Baseline Elo pass in original date order
     r_first = elo_once(df0, K, H, mcap, base)
+    # RNG for the shuffled passes
     rng = np.random.default_rng(seed)
     vals = {t: [r_first[t]] for t in teams}
+    # Run Elo over randomized orderings and collect the results
     for _ in range(max(0, shuffles-1)):
         idx = np.arange(len(df0)); rng.shuffle(idx)
         r = elo_once(df0.iloc[idx].reset_index(drop=True), K, H, mcap, base)
         for t in teams:
             vals[t].append(r[t])
+    # Average the ratings across runs for each team
     return pd.Series({t: float(np.mean(vals[t])) for t in teams}).sort_index()
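+
+# Numeric sketch of one update (values are illustrative): with both teams at 1500
+# and H=30, the home side expects 1/(1 + 10**(-30/400)) ~ 0.543; if it wins 5-2,
+# M = ln(3 + 1) ~ 1.386 (under mcap=2.0), so the home team gains
+# K*M*(1 - 0.543) ~ 15.2 points for K=24, and the away team loses the same amount.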

 def zscore(s: pd.Series) -> pd.Series:
+    """
+    Calculate the z-score (standard score) of a pandas Series.
+
+    Parameters:
+    - s: input Series
+
+    Returns:
+    Series normalized to mean 0 and standard deviation 1; all zeros if the std is 0.
+    """
     mu, sd = s.mean(), s.std(ddof=0)
     return pd.Series(0.0, index=s.index) if (sd == 0 or np.isnan(sd)) else (s - mu) / sd

@@ -176,51 +303,71 @@ def main(
     elo_shuffles: int = typer.Option(20, help="Random shuffles to average Elo"),
     elo_seed: int = typer.Option(42, help="RNG seed for shuffles")
 ):
+    """
+    Main entry point.
+
+    Loads the input games, computes aggregate stats, Pythagorean expectation,
+    Massey ratings, Elo ratings (averaged over shuffles), strength of schedule,
+    and an overall CompositeRating combining these metrics.
+
+    Writes a CSV file with rankings and stats.
+    """
     team_id = team_id.lower()

+    # Load the cleaned games DataFrame
     games = load_games(inp, team_id=team_id, final_status=final_status)

+    # Compute aggregated team-level statistics from the games
     team = aggregate_team_stats(games)
+    # Pythagorean expected winning percentage
     team["PythagoreanWinPct"] = pythagorean(team["RS"], team["RA"], pyexp)

+    # Massey ratings, plus the estimated home-field run advantage
     massey_r, h_runs = massey(games, cap=massey_cap, subtract_home=not no_massey_home_adj)

+    # Strength of schedule: the average Massey rating of each team's opponents
     opps = {t: [] for t in massey_r.index}
     for _, r in games.iterrows():
         opps[r["HomeTeam"]].append(r["AwayTeam"])
         opps[r["AwayTeam"]].append(r["HomeTeam"])
     sos_series = pd.Series({t: (float(massey_r[opps[t]].mean()) if opps[t] else 0.0) for t in opps})

+    # Elo ratings, averaged over multiple shuffles for stability
     elo_r = elo(games, K=elo_k, H=elo_home, mcap=elo_mcap, shuffles=elo_shuffles, seed=elo_seed)

+    # Merge all metrics into a single DataFrame
     out_df = team.set_index("Team")
     out_df["MasseyRating"] = massey_r
     out_df["EloRating"] = elo_r
     out_df["StrengthOfSchedule"] = sos_series

+    # Composite rating: a weighted z-score blend of Massey, Elo, and Pythagorean
     Z_r, Z_e, Z_p = zscore(out_df["MasseyRating"]), zscore(out_df["EloRating"]), zscore(out_df["PythagoreanWinPct"])
     out_df["CompositeRating"] = 0.45*Z_r + 0.35*Z_e + 0.20*Z_p
     out_df = out_df.reset_index()

+    # Select the output columns and sort teams by CompositeRating, descending
     out_df = out_df[[
         "Team","GP","W","L","T","WinPct","RS","RA","RunDiff",
         "PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"
     ]].sort_values("CompositeRating", ascending=False)

-    # Round for readability
+    # Round numeric columns for readability
     for c in ["WinPct","PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"]:
         out_df[c] = out_df[c].astype(float).round(5)

+    # Write the output CSV
     out_df.to_csv(out, index=False)

+    # Print a summary
     print(f"Done. Estimated home-field (runs) used in Massey: {h_runs:.3f}")
     print(f"Teams ranked: {len(out_df)} | Games processed: {len(games)}")
     print(f"Output -> {out}")

 if __name__ == "__main__":
     typer.run(main)
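+
+# Example invocation (illustrative; only --elo-shuffles and --elo-seed appear in
+# this diff, derived from the parameters above via Typer's kebab-case convention,
+# and the values are arbitrary):
+#   python compute_ratings.py --elo-shuffles 50 --elo-seed 7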