Jon Solow committed on
Commit
54505df
·
1 Parent(s): a033f2f

Refactor player_news parsing to go player by player rather than parsing separate lists

Browse files
Files changed (1) hide show
  1. src/queries/nbcsports/player_news.py +19 -25
src/queries/nbcsports/player_news.py CHANGED
@@ -7,39 +7,33 @@ from typing import Mapping
7
  NEWS_URL = "https://www.nbcsports.com/fantasy/football/player-news"
8
 
9
 
10
- def get_text_from_find_all(soup, element: str, find_search_map: Mapping[str, str]):
11
- find_list = soup.find_all(element, find_search_map)
12
- assert find_list
13
- text_list = [x.text.strip() for x in find_list]
14
- return text_list
15
 
16
 
17
- def get_team_names(soup):
18
- player_team_div_list = soup.find_all("div", "PlayerNewsPost-team")
19
- player_team_abbr_span_list = [x.find("span", {"class": "PlayerNewsPost-team-abbr"}) for x in player_team_div_list]
20
- player_team_abbr_list = [x.text.upper() if x else "" for x in player_team_abbr_span_list]
21
- return player_team_abbr_list
 
 
 
 
22
 
23
 
24
  def get_nfl_player_news(page_number: int = 1) -> pd.DataFrame:
25
  url = f"{NEWS_URL}?p={page_number}"
26
  request_page = requests.get(url)
27
  soup = BeautifulSoup(request_page.content)
28
- player_names_list = get_text_from_find_all(soup, "div", {"class": "PlayerNewsPost-name"})
29
- team_abbr_list = get_team_names(soup)
30
- position_list = get_text_from_find_all(soup, "span", {"class": "PlayerNewsPost-position"})
31
- headline_list = get_text_from_find_all(soup, "div", {"class": "PlayerNewsPost-headline"})
32
- analysis_list = get_text_from_find_all(soup, "div", {"class": "PlayerNewsPost-analysis"})
33
- datetime_div_list = soup.find_all("div", {"class": "PlayerNewsPost-date"})
34
- assert datetime_div_list
35
- datetime_list = [x["data-date"] for x in datetime_div_list]
36
- assert (
37
- len(player_names_list) == len(team_abbr_list) == len(position_list) == len(headline_list) == len(analysis_list)
38
- )
39
- df = pd.DataFrame(
40
- zip(datetime_list, player_names_list, team_abbr_list, position_list, headline_list, analysis_list),
41
- columns=["Date/Time", "Name", "Team", "Position", "Headline", "Analysis"],
42
- )
43
  df["Date/Time"] = pd.to_datetime(df["Date/Time"])
44
  return df
45
 
 
7
  NEWS_URL = "https://www.nbcsports.com/fantasy/football/player-news"
8
 
9
 
10
def find_soup_text_with_default(soup, element: str, find_search_map: Mapping[str, str]):
    """Return the stripped text of the first element matching the search map, or "" if absent."""
    match = soup.find(element, find_search_map)
    return match.text.strip() if match else ""
15
 
16
 
17
def parse_player_div(player_div):
    """Parse a single "PlayerNewsPost" container div into a flat record.

    Args:
        player_div: BeautifulSoup Tag for one player-news post.

    Returns:
        dict with keys Date/Time, Name, Team, Position, Headline, Analysis.
        Missing text sub-elements yield ""; a missing date div yields None
        (pd.to_datetime downstream turns None into NaT).
    """
    # The timestamp lives in a data-date attribute, not in element text,
    # so it cannot go through find_soup_text_with_default.
    date_div = player_div.find("div", {"class": "PlayerNewsPost-date"})
    return {
        # Guard: chaining .get() off find() raised AttributeError when the
        # date div was absent from a post.
        "Date/Time": date_div.get("data-date") if date_div else None,
        "Name": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-name"}),
        "Team": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-team-abbr"}).upper(),
        "Position": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-position"}).title(),
        "Headline": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-headline"}),
        "Analysis": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-analysis"}),
    }
26
 
27
 
28
def get_nfl_player_news(page_number: int = 1) -> pd.DataFrame:
    """Fetch and parse one page of NBC Sports NFL player news.

    Args:
        page_number: 1-based page index of the news listing.

    Returns:
        DataFrame with columns Date/Time, Name, Team, Position, Headline,
        Analysis. When no posts are found, an empty DataFrame with the same
        columns is returned so callers can still select them.
    """
    url = f"{NEWS_URL}?p={page_number}"
    request_page = requests.get(url)
    # Pin the parser explicitly: omitting it makes bs4 guess
    # (GuessedAtParserWarning) and can change results across environments.
    soup = BeautifulSoup(request_page.content, "html.parser")
    player_div_list = soup.find_all("div", {"class": "PlayerNewsPost"})
    columns = ["Date/Time", "Name", "Team", "Position", "Headline", "Analysis"]
    if not player_div_list:
        # Keep the schema stable on empty pages (plain pd.DataFrame() had
        # no columns, breaking df["Date/Time"] access downstream).
        return pd.DataFrame(columns=columns)
    df = pd.DataFrame([parse_player_div(d) for d in player_div_list])
    df["Date/Time"] = pd.to_datetime(df["Date/Time"])
    return df
39