Jon Solow
Apply ruff format
a29184a
raw
history blame
2.05 kB
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import requests
from typing import Mapping
# Base URL of the NBC Sports fantasy-football player-news listing page;
# pagination is done via the `?p=<n>` query parameter (see get_nfl_player_news).
NEWS_URL = "https://www.nbcsports.com/fantasy/football/player-news"
def find_soup_text_with_default(
    soup, element: str, find_search_map: Mapping[str, str], default: str = ""
) -> str:
    """Return the stripped text of the first matching element, or *default*.

    Args:
        soup: A BeautifulSoup node (anything exposing ``.find``).
        element: Tag name to search for (e.g. ``"div"``).
        find_search_map: Attribute filters passed to ``soup.find``
            (e.g. ``{"class": "PlayerNewsPost-name"}``).
        default: Value returned when no matching element is found.
            Defaults to ``""`` for backward compatibility.

    Returns:
        The element's text with surrounding whitespace stripped, or *default*
        when the element is absent.
    """
    find_result = soup.find(element, find_search_map)
    if not find_result:
        return default
    return find_result.text.strip()
def parse_player_div(player_div) -> dict:
    """Extract one news item from a ``PlayerNewsPost`` div.

    Args:
        player_div: BeautifulSoup node for a single ``div.PlayerNewsPost``.

    Returns:
        Dict with keys ``Date/Time``, ``Name``, ``Team``, ``Position``,
        ``Headline``, ``Analysis``. Missing text fields default to ``""``;
        a missing date div yields ``None`` (which pandas parses to ``NaT``).
    """
    # Bug fix: the original chained .find(...).get("data-date") and raised
    # AttributeError when the date div was absent; look it up defensively,
    # consistent with how every other field is handled.
    date_div = player_div.find("div", {"class": "PlayerNewsPost-date"})
    return {
        "Date/Time": date_div.get("data-date") if date_div else None,
        "Name": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-name"}),
        "Team": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-team-abbr"}).upper(),
        "Position": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-position"}).title(),
        "Headline": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-headline"}),
        "Analysis": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-analysis"}),
    }
def get_nfl_player_news(page_number: int = 1) -> pd.DataFrame:
    """Fetch and parse one page of NFL player news from NBC Sports.

    Args:
        page_number: 1-based page index of the news listing.

    Returns:
        DataFrame with one row per news post (columns per
        ``parse_player_div``), with ``Date/Time`` parsed to datetimes.
        An empty DataFrame when the page contains no news posts.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    url = f"{NEWS_URL}?p={page_number}"
    # Bug fix: a missing timeout lets requests.get block indefinitely.
    request_page = requests.get(url, timeout=30)
    # Specify the parser explicitly: bare BeautifulSoup(content) emits
    # GuessedAtParserWarning and picks whichever parser happens to be installed.
    soup = BeautifulSoup(request_page.content, "html.parser")
    player_div_list = soup.find_all("div", {"class": "PlayerNewsPost"})
    if not player_div_list:
        return pd.DataFrame()
    parsed_player_list = [parse_player_div(d) for d in player_div_list]
    df = pd.DataFrame(parsed_player_list)
    df["Date/Time"] = pd.to_datetime(df["Date/Time"])
    return df
def get_player_news_window_hours(hours: int = 1) -> pd.DataFrame:
    """Collect news pages until posts older than the given window appear.

    Walks pages 1..19 (matching the original ``page < max_pages`` cap) and
    stops once a page contains a post dated before ``now - hours``. The
    returned frame may still include a few posts just outside the window
    (the whole boundary page is kept).

    Args:
        hours: Size of the look-back window in hours.

    Returns:
        Concatenated DataFrame of all fetched pages; empty DataFrame when
        the very first page has no news.
    """
    end_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(hours=hours)
    page = 1
    max_pages = 20
    date_reached = False
    df_list = []
    while page < max_pages and not date_reached:
        last_news = get_nfl_player_news(page)
        # Bug fix: min() over an empty column raised ValueError when a page
        # had no news; stop paging instead.
        if last_news.empty:
            break
        df_list.append(last_news)
        date_reached = min(last_news["Date/Time"]) < end_date
        page += 1
    # Bug fix: pd.concat([]) raises ValueError; return an empty frame instead.
    if not df_list:
        return pd.DataFrame()
    return pd.concat(df_list)