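"""Scrape CSYBA team schedules and export a deduplicated CSV of games.

For each team instance ID in TEAM_INSTANCES, this script fetches the printable
schedule page for subseason 942425, parses date, result, score, opponent,
location, and status from the schedule table, dedupes the merged rows, and
writes them to season_games.csv.
"""
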
import csv
import logging
import re
import time

import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtp

# --- Logging setup ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)

HEADERS = {"User-Agent": "Mozilla/5.0"}
SUBSEASON_ID = "942425"  # subseason query parameter used in every schedule URL

# Team instance IDs whose schedules will be scraped.
TEAM_INSTANCES = [
    "10119604", "10119605", "10119601", "10119603", "10119599", "10185021", "10119607",
    "10219990", "10119600", "10119602", "10119611", "10119616", "10119612", "10148204",
    "10147713", "10119617", "10178191", "10119608", "10119615", "10119614", "10168648",
    "10168644", "10168645", "10168646", "10168649",
]

def clean(x):
    """Collapse whitespace runs to single spaces; treat None as empty."""
    return re.sub(r"\s+", " ", x or "").strip()

def fetch_team_schedule(iid):
    """Fetch one team's printable schedule page and parse its game rows."""
    url = (
        f"https://www.csyba.com/schedule/print/team_instance/{iid}"
        f"?schedule_type=index&subseason={SUBSEASON_ID}"
    )
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Failed to fetch team {iid}: {e}")
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    games = []
    for tr in soup.select("table tr")[1:]:  # skip the header row
        tds = tr.select("td")
        if len(tds) < 5:
            continue
        date_txt, result_txt, opp_txt, loc_txt, status_txt = [
            clean(td.get_text(" ")) for td in tds[:5]
        ]
        # Parse the date into ISO format; keep the raw text if parsing fails.
        try:
            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except (ValueError, OverflowError):
            date_iso = date_txt
        # Extract the W/L/T letter and the score; the first and second numbers
        # are recorded as home_score and away_score respectively.
        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
        result = m_res.group(1).upper() if m_res else ""
        m_score = re.search(r"(\d+)\s*[-–]\s*(\d+)", result_txt)
        hs, as_ = (m_score.group(1), m_score.group(2)) if m_score else ("", "")
        away_flag = opp_txt.startswith("@")  # "@" prefix marks an away game
        opponent = opp_txt.lstrip("@").strip()
        games.append({
            "team_instance": iid,
            "date": date_iso,
            "result": result,
            "score": f"{hs}-{as_}" if hs else "",
            "home_score": hs,
            "away_score": as_,
            "opponent": opponent,
            "is_away": away_flag,
            "location": loc_txt,
            "status": status_txt,
            "source_url": url,
        })
    logging.info(f"Team {iid}: parsed {len(games)} games")
    return games

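# Ad-hoc check of a single schedule (a sketch, not part of the pipeline; it
# performs a live request, and the ID shown is just the first TEAM_INSTANCES
# entry):
#
#   games = fetch_team_schedule("10119604")
#   print(len(games), games[:1])
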
def main():
    all_games = []
    for i, iid in enumerate(TEAM_INSTANCES, start=1):
        logging.info(f"[{i}/{len(TEAM_INSTANCES)}] Fetching schedule for team {iid}")
        all_games.extend(fetch_team_schedule(iid))
        time.sleep(0.5)  # brief pause between requests to go easy on the server

    # Deduplicate: key = (date, sorted (opponent, team_instance), score).
    # Note: opponent is a display name while team_instance is a numeric ID, so
    # this mainly collapses repeated rows from the same page; matching the same
    # game across two teams' pages would need an ID-to-name mapping.
    unique = {}
    for g in all_games:
        key = (g["date"], tuple(sorted([g["opponent"], g["team_instance"]])), g["score"])
        if key not in unique:
            unique[key] = g
    deduped_games = list(unique.values())

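    # Illustration with made-up values: a row fetched twice maps to the key
    # ("2024-06-01", ("10119604", "Hawks"), "10-5") both times and is kept once.
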
    out_file = "season_games.csv"
    if not deduped_games:
        logging.warning("No games parsed; nothing to write")
        return
    with open(out_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=deduped_games[0].keys())
        writer.writeheader()
        writer.writerows(deduped_games)

    logging.info(f"Finished. {len(all_games)} raw rows → {len(deduped_games)} unique games saved to {out_file}")

if __name__ == "__main__":
    main()
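
# To spot-check the output afterwards (a minimal sketch; assumes the run above
# produced season_games.csv):
#
#   import csv
#   with open("season_games.csv", newline="", encoding="utf-8") as f:
#       rows = list(csv.DictReader(f))
#   print(len(rows), "rows; first:", rows[0] if rows else None)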