Spaces:
Sleeping
Sleeping
Jon Solow
committed on
Commit
·
54505df
1
Parent(s):
a033f2f
Refactor player_news parsing to go player by player rather than parsing separate lists
Browse files
src/queries/nbcsports/player_news.py
CHANGED
@@ -7,39 +7,33 @@ from typing import Mapping
|
|
7 |
NEWS_URL = "https://www.nbcsports.com/fantasy/football/player-news"
|
8 |
|
9 |
|
10 |
-
def
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
return
|
15 |
|
16 |
|
17 |
-
def
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
22 |
|
23 |
|
24 |
def get_nfl_player_news(page_number: int = 1) -> pd.DataFrame:
|
25 |
url = f"{NEWS_URL}?p={page_number}"
|
26 |
request_page = requests.get(url)
|
27 |
soup = BeautifulSoup(request_page.content)
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
datetime_div_list = soup.find_all("div", {"class": "PlayerNewsPost-date"})
|
34 |
-
assert datetime_div_list
|
35 |
-
datetime_list = [x["data-date"] for x in datetime_div_list]
|
36 |
-
assert (
|
37 |
-
len(player_names_list) == len(team_abbr_list) == len(position_list) == len(headline_list) == len(analysis_list)
|
38 |
-
)
|
39 |
-
df = pd.DataFrame(
|
40 |
-
zip(datetime_list, player_names_list, team_abbr_list, position_list, headline_list, analysis_list),
|
41 |
-
columns=["Date/Time", "Name", "Team", "Position", "Headline", "Analysis"],
|
42 |
-
)
|
43 |
df["Date/Time"] = pd.to_datetime(df["Date/Time"])
|
44 |
return df
|
45 |
|
|
|
7 |
NEWS_URL = "https://www.nbcsports.com/fantasy/football/player-news"
|
8 |
|
9 |
|
10 |
def find_soup_text_with_default(soup, element: str, find_search_map: Mapping[str, str]):
    """Return the stripped text of the first matching element, or "" when absent.

    Args:
        soup: a BeautifulSoup tag/soup object supporting ``.find``.
        element: tag name to search for (e.g. "div", "span").
        find_search_map: attribute filters passed to ``soup.find``
            (e.g. ``{"class": "PlayerNewsPost-name"}``).

    Returns:
        The element's text with surrounding whitespace removed, or an
        empty string if no matching element exists.
    """
    match = soup.find(element, find_search_map)
    return match.text.strip() if match else ""
15 |
|
16 |
|
17 |
def parse_player_div(player_div):
    """Parse a single ``PlayerNewsPost`` div into a flat record dict.

    Every text field falls back to "" when its element is missing (via
    ``find_soup_text_with_default``); the date is likewise None-safe so a
    post without a date div yields None instead of raising AttributeError.

    Args:
        player_div: a BeautifulSoup tag for one news post.

    Returns:
        dict with keys "Date/Time", "Name", "Team", "Position",
        "Headline", "Analysis" — the column schema consumed by
        ``get_nfl_player_news``.
    """
    # Guard the date lookup: .find() returns None when the div is absent,
    # and None.get(...) would crash. Consistent with the defaulting used
    # for the text fields below.
    date_div = player_div.find("div", {"class": "PlayerNewsPost-date"})
    return {
        # "data-date" carries the machine-readable timestamp string.
        "Date/Time": date_div.get("data-date") if date_div else None,
        "Name": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-name"}),
        "Team": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-team-abbr"}).upper(),
        "Position": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-position"}).title(),
        "Headline": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-headline"}),
        "Analysis": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-analysis"}),
    }
|
26 |
|
27 |
|
28 |
def get_nfl_player_news(page_number: int = 1) -> pd.DataFrame:
    """Fetch one page of NBC Sports NFL player news as a DataFrame.

    Args:
        page_number: 1-based page index appended as the ``p`` query param.

    Returns:
        DataFrame with columns "Date/Time" (parsed to datetime64), "Name",
        "Team", "Position", "Headline", "Analysis" — one row per news post.
        Returns an empty DataFrame when the page contains no posts
        (e.g. past the last page), keeping the best-effort contract.
    """
    url = f"{NEWS_URL}?p={page_number}"
    # timeout prevents an unresponsive server from hanging the caller forever.
    # NOTE(review): HTTP error pages (4xx/5xx) fall through to the empty-
    # DataFrame path below rather than raising — confirm that is intended.
    request_page = requests.get(url, timeout=30)
    # Explicit parser: avoids bs4's GuessedAtParserWarning and makes parsing
    # deterministic across environments.
    soup = BeautifulSoup(request_page.content, "html.parser")
    player_div_list = soup.find_all("div", {"class": "PlayerNewsPost"})
    if not player_div_list:
        return pd.DataFrame()
    df = pd.DataFrame([parse_player_div(d) for d in player_div_list])
    df["Date/Time"] = pd.to_datetime(df["Date/Time"])
    return df
39 |
|