|
| 1 | +import re |
1 | 2 | import requests
|
2 | 3 | from bs4 import BeautifulSoup
|
3 | 4 | from src.utils.constants import *
|
4 | 5 |
|
# A "name word" is a run of letters, optionally joined by apostrophes or
# hyphens (e.g. "O'Brien", "Mary-Kate"). Digits and underscores are excluded
# so trailing numbers are never mistaken for part of a name.
_NAME_WORD = r"[^\W\d_]+(?:['\u2019-][^\W\d_]+)*"
_LAST_FIRST_RE = re.compile(rf"^([^,]+),\s*({_NAME_WORD})")
_FIRST_LAST_RE = re.compile(rf"^({_NAME_WORD})\s+({_NAME_WORD})")


def clean_name(name):
    """Strip extra information from a player name, keeping two name parts.

    Two layouts are recognised at the start of the string:

    * ``"Last, First ..."`` -> ``"Last, First"`` (everything before the
      first comma is kept verbatim, followed by one name word);
    * ``"First Last ..."`` -> ``"First Last"`` (the first two name words).

    A name word may contain apostrophes and hyphens, so ``"Sean O'Brien (3)"``
    yields ``"Sean O'Brien"`` rather than being cut at the apostrophe.

    If neither layout matches (e.g. a single word such as ``"Unassisted"``),
    a fallback removes a parenthesised suffix and anything from the first
    digit run onwards, then trims surrounding whitespace.

    Args:
        name: Raw name text.

    Returns:
        The cleaned name; an empty string stays empty.
    """
    if ',' in name:
        match = _LAST_FIRST_RE.match(name)
        if match:
            return f"{match.group(1)}, {match.group(2)}"
    else:
        match = _FIRST_LAST_RE.match(name)
        if match:
            return f"{match.group(1)} {match.group(2)}"

    # Fallback: drop "(...)" and everything after it, then drop anything
    # starting at the first run of digits.
    cleaned = re.sub(r'\s*\([^)]*\).*$', '', name)
    cleaned = re.sub(r'\s*\d+.*$', '', cleaned)
    return cleaned.strip()
| 23 | + |
def fetch_page(url, timeout=30):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout ``requests.get`` can block indefinitely on an
            unresponsive server.

    Returns:
        A ``BeautifulSoup`` object built from the response text using the
        ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout elapses.
    """
    # The HTTP status code is not checked: error pages are parsed like any
    # other response, exactly as before.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')
|
@@ -150,16 +169,25 @@ def lacrosse_summary(box_score_section):
|
150 | 169 | scoring_rows = scoring_table.find(TAG_TBODY)
|
151 | 170 | if scoring_rows:
|
152 | 171 | for row in scoring_rows.find_all(TAG_TR):
|
| 172 | + team = row.find_all(TAG_TD)[1].find(TAG_IMG)[ATTR_ALT] |
153 | 173 | period = row.find_all(TAG_TD)[2].text.strip()
|
154 | 174 | time = row.find_all(TAG_TD)[3].text.strip()
|
155 |
| - scorer = row.find_all(TAG_TD)[4].text.strip() |
156 |
| - assist = row.find_all(TAG_TD)[5].text.strip() |
| 175 | + scorer = clean_name(row.find_all(TAG_TD)[4].text.strip()) |
| 176 | + assist = clean_name(row.find_all(TAG_TD)[5].text.strip()) |
157 | 177 | opp_score = row.find_all(TAG_TD)[7].text.strip()
|
158 | 178 | cor_score = row.find_all(TAG_TD)[6].text.strip()
|
| 179 | + |
| 180 | + if assist and assist != "Unassisted": |
| 181 | + desc = f"Scored by {scorer}, assisted by {assist}" |
| 182 | + else: |
| 183 | + desc = f"Scored by {scorer}" |
| 184 | + |
159 | 185 | summary.append({
|
| 186 | + 'team': team, |
160 | 187 | 'period': period,
|
161 | 188 | 'time': time,
|
162 | 189 | 'scorer': scorer,
|
| 190 | + 'description': desc, |
163 | 191 | 'assist': assist,
|
164 | 192 | 'cor_score': cor_score,
|
165 | 193 | 'opp_score': opp_score,
|
|
0 commit comments