csyba/build_season_schedule.py
#!/usr/bin/env python3
# build_season_schedule.py
#
# Build a deduped season schedule from SportsEngine team-instance printable pages.
# - Assumes team-instance schedule pages are TEAM-FIRST for scores.
# - Determines home/away using the '@' marker on the opponent cell.
# - Deduplicates primarily by game_id (from /game/show/<id> links), otherwise by a fallback key.
# - Optionally fetches each game's time from the /game/show/<id> page ("tab_boxscores_content").
#
# Usage:
# pip install requests beautifulsoup4 python-dateutil typer
# python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv
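# With optional per-game time lookup (adds one /game/show/<id> request per game):
# python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv --fetch-time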
#
# Example teams.json (array):
# [
# {"teamName":"Carol Stream Cheaties","team_id":"8944347","team_slug":"carol-stream-cheaties","subseason_id":"942425","instance_id":"10119604"},
# ...
# ]
import csv
import json
import logging
import re
import time
from dataclasses import dataclass
from typing import Dict, List, Optional
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtp
import typer
# ----------------- logging -----------------
logging.basicConfig(
level=logging.INFO, # change to DEBUG for verbose tracing
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%H:%M:%S",
)
# ----------------- constants -----------------
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36"
HEADERS = {"User-Agent": UA} # HTTP headers with custom User-Agent for requests
PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}" # base URL for team-instance printable schedule
GAME_BASE = "https://www.csyba.com/game/show/{gid}" # base URL for game detail page
# Regular expressions for parsing scores, game links, and time strings
SCORE_RE = re.compile(r"\b(\d+)\s*[-]\s*(\d+)\b")
GAME_LINK_RE = re.compile(r"/game/show/(\d+)")
TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I)
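# Illustrative matches for the patterns above (sample strings only, not real page content):
#   SCORE_RE:     "7 - 4"                 -> groups ("7", "4")
#   GAME_LINK_RE: "/game/show/12345678"   -> group "12345678"
#   TIME_RE:      "5:30 PM" -> ("5:30", "PM");  "18:45" -> ("18:45", None)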
# ----------------- helpers -----------------
def clean(x: str) -> str:
"""Normalize whitespace and strip input string."""
return re.sub(r"\s+", " ", (x or "")).strip()
def slugify(s: str) -> str:
"""Convert string to lowercase slug with words separated by hyphens."""
s = s.lower()
s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
return s
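# For example (sample strings, shown only to illustrate the helpers above):
#   clean("  Carol   Stream ")        -> "Carol Stream"
#   slugify("Carol Stream Cheaties!") -> "carol-stream-cheaties"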
def norm_name(s: str) -> str:
"""
Normalize team names by lowercasing, removing common words like 'the', 'club',
and stripping punctuation, to help with loose matching.
"""
s = s.lower()
s = re.sub(r"[^a-z0-9 ]+", " ", s)
s = re.sub(r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b", " ", s)
s = re.sub(r"\s+", " ", s).strip()
return s
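# For example (hypothetical team names, shown only to illustrate the normalization):
#   norm_name("The Wheaton Rams Baseball Club") -> "wheaton rams"
#   norm_name("Carol Stream Cheaties")          -> "carol stream cheaties"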
@dataclass(frozen=True)
class TeamRec:
"""Data class representing a team record with identifying information."""
name: str
slug: str
team_id: str
instance_id: str
subseason_id: str
def load_teams(teams_path: str):
"""
Load team mapping data from JSON file.
Returns dictionaries keyed by instance_id, slug, and normalized names for lookups.
"""
with open(teams_path, "r", encoding="utf-8") as f:
arr = json.load(f)
by_instance: Dict[str, TeamRec] = {}
by_slug: Dict[str, TeamRec] = {}
by_norm: Dict[str, TeamRec] = {}
for t in arr:
rec = TeamRec(
name=str(t["teamName"]),
slug=str(t["team_slug"]),
team_id=str(t["team_id"]),
instance_id=str(t["instance_id"]),
subseason_id=str(t["subseason_id"]),
)
by_instance[rec.instance_id] = rec
by_slug[rec.slug] = rec
by_norm[norm_name(rec.name)] = rec
return by_instance, by_slug, by_norm
def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:
"""
Attempt to match the opponent team name to a known team record.
Tries slug first, then normalized name exact match,
then loose containment matching on normalized names.
"""
s = slugify(opponent_text)
if s in by_slug:
return by_slug[s]
n = norm_name(opponent_text)
if n in by_norm:
return by_norm[n]
for key, rec in by_norm.items():
if key in n or n in key:
return rec
return None
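# For example, given a teams.json entry named "Carol Stream Cheaties" (slug "carol-stream-cheaties"),
# an opponent cell of "Carol Stream Cheaties" resolves via the slug lookup, while a looser label
# such as "Carol Stream" only resolves through the containment fallback on normalized names.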
def runs_from_team_pov(result_flag: str, s_a: str, s_b: str):
"""
Parse runs scored by the team and its opponent, assuming team-first order.
Log a debug note if the scores disagree with result_flag (W/L/T), but keep them as-is.
"""
if not (s_a.isdigit() and s_b.isdigit()):
return None, None
a, b = int(s_a), int(s_b)
if result_flag == "W" and a <= b:
logging.debug(f"Result=W but team_runs<=opp_runs ({a}-{b}); keeping as-is (team-first).")
if result_flag == "L" and a >= b:
logging.debug(f"Result=L but team_runs>=opp_runs ({a}-{b}); keeping as-is (team-first).")
return a, b
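# For example (scores are team-first, matching the printable page):
#   runs_from_team_pov("W", "7", "4") -> (7, 4)
#   runs_from_team_pov("L", "2", "9") -> (2, 9)
#   runs_from_team_pov("W", "", "")   -> (None, None)   # missing or non-numeric scores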
# ----------------- HTTP utils -----------------
def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]:
"""
Fetch a URL and return a BeautifulSoup parsed document.
Uses a shared requests.Session if provided.
"""
try:
sess = session or requests.Session()
r = sess.get(url, headers=HEADERS, timeout=timeout)
r.raise_for_status()
return BeautifulSoup(r.text, "html.parser")
except Exception as e:
logging.error(f"GET failed {url}: {e}")
return None
# ----------------- scraping -----------------
def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]:
"""
Download and parse the team-instance printable schedule page,
extracting a list of game dictionaries from the perspective of that team.
"""
url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({
"schedule_type": "index",
"subseason": subseason_id,
})
soup = get_soup(url, session=session)
if not soup:
return []
table = soup.select_one("table")
if not table:
logging.warning(f"No table found for team_instance={instance_id}")
return []
games = []
# Skip header row; iterate over game rows
for row_idx, tr in enumerate(table.select("tr")[1:], start=1):
tds = tr.select("td")
if len(tds) < 5:
continue
# Extract text from each relevant cell:
# Date | Result | Opponent | Location | Status
date_txt = clean(tds[0].get_text(" "))
result_txt = clean(tds[1].get_text(" "))
opp_txt = clean(tds[2].get_text(" "))
loc_txt = clean(tds[3].get_text(" "))
status_txt = clean(tds[4].get_text(" "))
# Parse date into ISO format (YYYY-MM-DD) if possible
try:
date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
except Exception:
date_iso = date_txt # leave raw if parsing fails
# Find game ID from any game/show links in the row, if present
game_id = ""
for a in tr.select("a[href]"):
m = GAME_LINK_RE.search(a.get("href", ""))
if m:
game_id = m.group(1)
break
# Extract W/L/T indicator from Result cell
m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
result_flag = m_res.group(1).upper() if m_res else ""
# Extract numeric scores from Result or Opponent cell
m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt)
s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "")
# Determine if game is away based on '@' prefix in opponent cell
is_away = opp_txt.startswith("@")
opponent_name = opp_txt.lstrip("@").strip()
# Convert scores to integers with team-first orientation
team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b)
logging.debug(
f"PARSER: inst={instance_id} row={row_idx} date={date_iso} "
f"res={result_flag} scores=({s_a}-{s_b}) away={is_away} "
f"→ team_runs={team_runs}, opp_runs={opp_runs}"
)
games.append({
"team_instance": instance_id,
"game_id": game_id, # may be empty
"date": date_iso,
"result": result_flag, # W/L/T from THIS TEAM's perspective
"team_runs": team_runs,
"opp_runs": opp_runs,
"opponent_name": opponent_name,
"is_away": is_away,
"location": loc_txt,
"status": status_txt,
"source_url": url,
})
logging.info(f"Team {instance_id}: parsed {len(games)} rows")
return games
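# Each parsed row is a dict shaped roughly like this (values are illustrative, not real data):
#   {"team_instance": "10119604", "game_id": "9876543", "date": "2025-06-14", "result": "W",
#    "team_runs": 7, "opp_runs": 4, "opponent_name": "Carol Stream Cheaties", "is_away": True,
#    "location": "Field 2", "status": "Final", "source_url": "https://www.csyba.com/..."}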
def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
"""
Fetch the start time of a game from its detail page.
Looks inside the boxscores tab or scans text for time patterns.
Returns a 24-hour formatted 'HH:MM' string or None if not found.
"""
if not game_id:
return None
url = GAME_BASE.format(gid=game_id)
soup = get_soup(url, session=session, timeout=30)
if not soup:
return None
# Prefer boxscores tab content to search for time string
box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content")
text = ""
if box:
text = " ".join(box.stripped_strings)
else:
# Fall back to main page text with length limit to prevent excessive text processing
main = soup.select_one("div.page") or soup
text = " ".join((main.get_text(" ", strip=True) or "")[:4000].split())
m = TIME_RE.search(text)
if not m:
logging.debug(f"TIME: no time found in game {game_id}")
return None
hhmm = m.group(1)
ampm = (m.group(2) or "").lower().replace(".", "")
try:
# Normalize time to 24h format
from datetime import datetime
if ampm:
dt = datetime.strptime(f"{hhmm} {ampm.upper()}", "%I:%M %p")
else:
dt = datetime.strptime(hhmm, "%H:%M")
return dt.strftime("%H:%M")
except Exception:
# Try forgiving parse if combined time/ampm without space
try:
from datetime import datetime
if ampm:
dt = datetime.strptime(f"{hhmm}{ampm}", "%I:%M%p")
return dt.strftime("%H:%M")
except Exception:
logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}")
return None
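# For example, "5:30 PM" or "5:30 p.m." on a game page normalizes to "17:30", a bare "18:45"
# is returned as-is, and None is returned when no time-like text is found.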
# ----------------- build & merge -----------------
def main(
subseason: str = typer.Option(..., help="Subseason ID, e.g. 942425"),
teams: str = typer.Option(..., help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)"),
out: str = typer.Option("season_schedule.csv", help="Output CSV path"),
fetch_time: bool = typer.Option(False, help="Fetch game time from /game/show/<id>"),
sleep: float = typer.Option(0.35, help="Delay between requests (seconds)")
):
"""
Main function to scrape schedules for all teams, merge them,
deduplicate entries (primary by game_id), and output a consolidated CSV.
Optionally fetches start times per game.
"""
# Load teams data and indexes
by_instance, by_slug, by_norm = load_teams(teams)
instance_ids = sorted(by_instance.keys())
# Requests session with custom headers
session = requests.Session()
session.headers.update(HEADERS)
# Scrape all team instance printable schedules
raw: List[dict] = []
for i, iid in enumerate(instance_ids, 1):
logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}")
raw.extend(parse_printable(iid, subseason, session=session))
time.sleep(sleep) # be polite
# Helper lookups for team records
def rec_from_instance(iid: str) -> Optional[TeamRec]:
return by_instance.get(iid)
def match_opponent(text: str) -> Optional[TeamRec]:
return best_match_team(text, by_slug, by_norm)
# Deduplicate into buckets keyed by game_id, falling back to a composite key when no game_id is present
buckets: Dict[str, dict] = {}
fallback_rows = 0
for row in raw:
team_rec = rec_from_instance(row["team_instance"])
if not team_rec:
logging.warning(f"Unknown instance {row['team_instance']}; skipping")
continue
opp_rec = match_opponent(row["opponent_name"])
opp_slug = opp_rec.slug if opp_rec else slugify(row["opponent_name"])
pair = tuple(sorted([team_rec.slug, opp_slug]))
if row["game_id"]:
key = f"id:{row['game_id']}"
else:
runs_sig = ""
if isinstance(row["team_runs"], int) and isinstance(row["opp_runs"], int):
runs_sig = f"{row['team_runs']}-{row['opp_runs']}"
key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}"
fallback_rows += 1
# Store this team's perspective of the game
perspective = {
"team": team_rec,
"opp": opp_rec, # may be None
"is_away": row["is_away"],
"team_runs": row["team_runs"],
"opp_runs": row["opp_runs"],
"location": row["location"],
"status": row["status"],
"source_url": row["source_url"],
"pair": pair,
"date": row["date"],
"game_id": row["game_id"],
}
if key not in buckets:
buckets[key] = {"persp": [perspective], "game_id": row["game_id"]}
else:
buckets[key]["persp"].append(perspective)
if fallback_rows:
logging.info(f"Used fallback dedupe for {fallback_rows} rows without game_id.")
out_rows = []
time_cache: Dict[str, Optional[str]] = {} # cache game times to avoid re-fetching
# Merge perspectives and produce consolidated rows
for key, bucket in buckets.items():
p = bucket["persp"]
date = p[0]["date"]
game_id = bucket.get("game_id", "")
# Try to identify home and away perspectives
p_home = next((x for x in p if x["is_away"] is False), None)
p_away = next((x for x in p if x["is_away"] is True), None)
# Home is the team whose row is not marked away; otherwise fall back to the away row's opponent
home_team = (p_home["team"] if p_home else (p_away["opp"] if p_away else None))
away_team = (p_away["team"] if p_away else (p_home["opp"] if p_home else None))
def pack_team(rec: Optional[TeamRec], fallback_slug: str):
"""Pack team record to tuple or fallback to slug-based default values."""
if rec:
return rec.slug, rec.instance_id, rec.team_id, rec.name
return fallback_slug, "", "", fallback_slug.replace("-", " ").title()
# Attempt to get runs from home perspective
home_runs = away_runs = None
if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int):
home_runs = p_home["team_runs"]
away_runs = p_home["opp_runs"]
# Otherwise try away perspective with reversed runs
elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int):
away_runs = p_away["team_runs"]
home_runs = p_away["opp_runs"]
# If runs still missing, guess from first perspective, adjusting for is_away
if (home_runs is None or away_runs is None) and p:
one = p[0]
if isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int):
if one["is_away"]:
away_runs = one["team_runs"]
home_runs = one["opp_runs"]
away_team = one["team"]
home_team = one["opp"] if one["opp"] else home_team
else:
home_runs = one["team_runs"]
away_runs = one["opp_runs"]
home_team = one["team"]
away_team = one["opp"] if one["opp"] else away_team
# Fallback guesses for home and away slugs if team data missing
guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"] else
p_away["opp"].slug if p_away and p_away["opp"] else
p[0]["pair"][0])
guess_away_fallback = (p_away["team"].slug if p_away and p_away["team"] else
p_home["opp"].slug if p_home and p_home["opp"] else
p[0]["pair"][1])
home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback)
away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback)
# Determine winner and loser slugs based on runs
winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = ""
if isinstance(home_runs, int) and isinstance(away_runs, int):
if home_runs > away_runs:
winner_slug, winner_inst, winner_id = home_slug, home_inst, home_id
loser_slug, loser_inst, loser_id = away_slug, away_inst, away_id
elif away_runs > home_runs:
winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id
loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id
# Consolidate location and status from home or away perspectives
loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "")
status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "")
source_urls = sorted({x["source_url"] for x in p})
# Optionally fetch game start time
time_local = ""
if fetch_time and game_id:
if game_id in time_cache:
tval = time_cache[game_id]
else:
logging.debug(f"TIME: fetching game {game_id}")
tval = fetch_game_time(game_id, session=session)
time_cache[game_id] = tval
# If no time was found, back off a bit longer before the next request to stay polite
if tval is None:
time.sleep(min(sleep * 2, 1.0))
if tval:
time_local = tval
logging.debug(
f"MERGE: {date} {home_slug}({home_runs}) vs {away_slug}({away_runs}) "
f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}"
)
# Append consolidated game record for CSV output
out_rows.append({
"date_local": date,
"time_local": time_local,
"home_slug": home_slug, "home_instance": home_inst, "home_id": home_id, "home_name": home_name,
"away_slug": away_slug, "away_instance": away_inst, "away_id": away_id, "away_name": away_name,
"home_runs": "" if home_runs is None else home_runs,
"away_runs": "" if away_runs is None else away_runs,
"winner_slug": winner_slug, "winner_instance": winner_inst, "winner_id": winner_id,
"loser_slug": loser_slug, "loser_instance": loser_inst, "loser_id": loser_id,
"location": loc, "status": status,
"game_id": game_id,
"source_urls": " ".join(source_urls),
})
if not out_rows:
logging.warning("No games produced.")
return
# Define CSV output columns
fieldnames = [
"date_local","time_local",
"home_slug","home_instance","home_id","home_name",
"away_slug","away_instance","away_id","away_name",
"home_runs","away_runs",
"winner_slug","winner_instance","winner_id",
"loser_slug","loser_instance","loser_id",
"location","status","game_id","source_urls",
]
# Write consolidated game data to CSV
with open(out, "w", newline="", encoding="utf-8") as f:
w = csv.DictWriter(f, fieldnames=fieldnames)
w.writeheader()
for r in out_rows:
w.writerow(r)
logging.info(f"Wrote {len(out_rows)} games → {out}")
if __name__ == "__main__":
typer.run(main)