Jon Solow committed on
Commit
54505df
·
1 Parent(s): a033f2f

Refactor player_news parsing to go player by player rather than parsing separate lists

Browse files
Files changed (1) hide show
  1. src/queries/nbcsports/player_news.py +19 -25
src/queries/nbcsports/player_news.py CHANGED
@@ -7,39 +7,33 @@ from typing import Mapping
7
  NEWS_URL = "https://www.nbcsports.com/fantasy/football/player-news"
8
 
9
 
10
- def get_text_from_find_all(soup, element: str, find_search_map: Mapping[str, str]):
11
- find_list = soup.find_all(element, find_search_map)
12
- assert find_list
13
- text_list = [x.text.strip() for x in find_list]
14
- return text_list
15
 
16
 
17
- def get_team_names(soup):
18
- player_team_div_list = soup.find_all("div", "PlayerNewsPost-team")
19
- player_team_abbr_span_list = [x.find("span", {"class": "PlayerNewsPost-team-abbr"}) for x in player_team_div_list]
20
- player_team_abbr_list = [x.text.upper() if x else "" for x in player_team_abbr_span_list]
21
- return player_team_abbr_list
 
 
 
 
22
 
23
 
24
  def get_nfl_player_news(page_number: int = 1) -> pd.DataFrame:
25
  url = f"{NEWS_URL}?p={page_number}"
26
  request_page = requests.get(url)
27
  soup = BeautifulSoup(request_page.content)
28
- player_names_list = get_text_from_find_all(soup, "div", {"class": "PlayerNewsPost-name"})
29
- team_abbr_list = get_team_names(soup)
30
- position_list = get_text_from_find_all(soup, "span", {"class": "PlayerNewsPost-position"})
31
- headline_list = get_text_from_find_all(soup, "div", {"class": "PlayerNewsPost-headline"})
32
- analysis_list = get_text_from_find_all(soup, "div", {"class": "PlayerNewsPost-analysis"})
33
- datetime_div_list = soup.find_all("div", {"class": "PlayerNewsPost-date"})
34
- assert datetime_div_list
35
- datetime_list = [x["data-date"] for x in datetime_div_list]
36
- assert (
37
- len(player_names_list) == len(team_abbr_list) == len(position_list) == len(headline_list) == len(analysis_list)
38
- )
39
- df = pd.DataFrame(
40
- zip(datetime_list, player_names_list, team_abbr_list, position_list, headline_list, analysis_list),
41
- columns=["Date/Time", "Name", "Team", "Position", "Headline", "Analysis"],
42
- )
43
  df["Date/Time"] = pd.to_datetime(df["Date/Time"])
44
  return df
45
 
 
7
  NEWS_URL = "https://www.nbcsports.com/fantasy/football/player-news"
8
 
9
 
10
def find_soup_text_with_default(soup, element: str, find_search_map: Mapping[str, str]):
    """Return the stripped text of the first element matching the search map, or "" if absent."""
    match = soup.find(element, find_search_map)
    return match.text.strip() if match else ""
15
 
16
 
17
def parse_player_div(player_div):
    """Parse a single "PlayerNewsPost" container div into a flat record.

    Args:
        player_div: BeautifulSoup Tag for one player-news post.

    Returns:
        dict with keys Date/Time, Name, Team, Position, Headline, Analysis.
        Missing text sub-elements yield ""; a missing date div yields None
        (pd.to_datetime downstream turns None into NaT).
    """
    # The timestamp lives in a data-date attribute, not in element text,
    # so it cannot go through find_soup_text_with_default.
    date_div = player_div.find("div", {"class": "PlayerNewsPost-date"})
    return {
        # Guard: chaining .get() off find() raised AttributeError when the
        # date div was absent from a post.
        "Date/Time": date_div.get("data-date") if date_div else None,
        "Name": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-name"}),
        "Team": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-team-abbr"}).upper(),
        "Position": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-position"}).title(),
        "Headline": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-headline"}),
        "Analysis": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-analysis"}),
    }
26
 
27
 
28
def get_nfl_player_news(page_number: int = 1) -> pd.DataFrame:
    """Fetch and parse one page of NBC Sports NFL player news.

    Args:
        page_number: 1-based page index of the news listing.

    Returns:
        DataFrame with columns Date/Time, Name, Team, Position, Headline,
        Analysis. When no posts are found, an empty DataFrame with the same
        columns is returned so callers can still select them.
    """
    url = f"{NEWS_URL}?p={page_number}"
    request_page = requests.get(url)
    # Pin the parser explicitly: omitting it makes bs4 guess
    # (GuessedAtParserWarning) and can change results across environments.
    soup = BeautifulSoup(request_page.content, "html.parser")
    player_div_list = soup.find_all("div", {"class": "PlayerNewsPost"})
    columns = ["Date/Time", "Name", "Team", "Position", "Headline", "Analysis"]
    if not player_div_list:
        # Keep the schema stable on empty pages (plain pd.DataFrame() had
        # no columns, breaking df["Date/Time"] access downstream).
        return pd.DataFrame(columns=columns)
    df = pd.DataFrame([parse_player_div(d) for d in player_div_list])
    df["Date/Time"] = pd.to_datetime(df["Date/Time"])
    return df
39