Files
csyba/csyba.py
2025-08-27 11:23:48 -05:00

93 lines
3.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests, re, time, csv, logging
from bs4 import BeautifulSoup
from dateutil import parser as dtp
# --- Logging setup ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)

# Minimal browser-like User-Agent sent with every request.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Subseason id appended to every schedule URL.
SUBSEASON_ID = "942425"

# Team-instance ids whose printable schedules get fetched.
TEAM_INSTANCES = [
    "10119604", "10119605", "10119601", "10119603", "10119599",
    "10185021", "10119607", "10219990", "10119600", "10119602",
    "10119611", "10119616", "10119612", "10148204", "10147713",
    "10119617", "10178191", "10119608", "10119615", "10119614",
    "10168648", "10168644", "10168645", "10168646", "10168649",
]
def clean(x):
    """Collapse runs of whitespace in *x* to single spaces and trim the ends.

    Falsy input (None, empty string) is treated as the empty string.
    """
    text = x or ""
    return re.sub(r"\s+", " ", text).strip()
def fetch_team_schedule(iid):
    """Fetch and parse the printable schedule page for one team instance.

    Args:
        iid: Team-instance id (string) interpolated into the schedule URL.

    Returns:
        A list of dicts, one per game row, with keys: team_instance, date,
        result, score, home_score, away_score, opponent, is_away, location,
        status, source_url. Returns an empty list if the HTTP fetch fails
        (best-effort: one bad team must not abort the whole run).
    """
    url = f"https://www.csyba.com/schedule/print/team_instance/{iid}?schedule_type=index&subseason={SUBSEASON_ID}"
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
    except Exception as e:
        logging.error(f"Failed to fetch team {iid}: {e}")
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    games = []
    for tr in soup.select("table tr")[1:]:  # skip header row
        tds = tr.select("td")
        if len(tds) < 5:
            continue  # not a game row (spacer/section rows)
        date_txt, result_txt, opp_txt, loc_txt, status_txt = [
            clean(td.get_text(" ")) for td in tds[:5]
        ]
        # Normalize the date to ISO 8601; fall back to the raw text.
        # dateutil raises ParserError (a ValueError subclass) or
        # OverflowError on bad/out-of-range dates — the original bare
        # `except:` also swallowed KeyboardInterrupt/SystemExit.
        try:
            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except (ValueError, OverflowError):
            date_iso = date_txt
        # Extract W/L/T and the score from the result cell.
        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
        result = m_res.group(1).upper() if m_res else ""
        m_score = re.search(r"(\d+)\s*[-]\s*(\d+)", result_txt)
        hs, as_ = (m_score.group(1), m_score.group(2)) if m_score else ("", "")
        # A leading "@" marks an away game.
        away_flag = opp_txt.startswith("@")
        opponent = opp_txt.lstrip("@").strip()
        games.append({
            "team_instance": iid,
            "date": date_iso,
            "result": result,
            "score": f"{hs}-{as_}" if hs else "",
            "home_score": hs,
            "away_score": as_,
            "opponent": opponent,
            "is_away": away_flag,
            "location": loc_txt,
            "status": status_txt,
            "source_url": url
        })
    logging.info(f"Team {iid}: parsed {len(games)} games")
    return games
def main():
    """Fetch every team's schedule, dedupe shared games, and write a CSV."""
    all_games = []
    for i, iid in enumerate(TEAM_INSTANCES, start=1):
        logging.info(f"[{i}/{len(TEAM_INSTANCES)}] Fetching schedule for team {iid}")
        all_games.extend(fetch_team_schedule(iid))
        time.sleep(0.5)  # throttle: be polite to the server
    # Deduplicate: each game appears on both participants' schedules.
    # Key = (date, unordered pair of opponent name + team_instance id, score);
    # first occurrence wins.
    unique = {}
    for g in all_games:
        key = (g["date"], tuple(sorted([g["opponent"], g["team_instance"]])), g["score"])
        if key not in unique:
            unique[key] = g
    deduped_games = list(unique.values())
    out_file = "season_games.csv"
    # Fixed column list: the original used deduped_games[0].keys(), which
    # raised IndexError when every fetch failed; now an empty run still
    # produces a valid CSV with a header row.
    fieldnames = [
        "team_instance", "date", "result", "score", "home_score",
        "away_score", "opponent", "is_away", "location", "status",
        "source_url",
    ]
    with open(out_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(deduped_games)
    logging.info(f"Finished. {len(all_games)} raw rows → {len(deduped_games)} unique games saved to {out_file}")

if __name__ == "__main__":
    main()