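"""Scrape NFL player news from the NBC Sports fantasy football news feed.

Each PlayerNewsPost entry is parsed into a flat record and collected into
pandas DataFrames, either one page at a time or over a recent time window.
"""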
import datetime
from typing import Mapping

import pandas as pd
import requests
from bs4 import BeautifulSoup

NEWS_URL = "https://www.nbcsports.com/fantasy/football/player-news"


def find_soup_text_with_default(soup, element: str, find_search_map: Mapping[str, str]) -> str:
    """Return the stripped text of the first matching element, or "" if absent."""
    find_result = soup.find(element, find_search_map)
    if not find_result:
        return ""
    return find_result.text.strip()


def parse_player_div(player_div) -> dict:
    """Extract one news post's fields from its PlayerNewsPost div."""
    # The timestamp lives in a data-date attribute; guard against a missing div
    # so a malformed post cannot raise an AttributeError.
    date_div = player_div.find("div", {"class": "PlayerNewsPost-date"})
    return {
        "Date/Time": date_div.get("data-date") if date_div else None,
        "Name": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-name"}),
        "Team": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-team-abbr"}).upper(),
        "Position": find_soup_text_with_default(player_div, "span", {"class": "PlayerNewsPost-position"}).title(),
        "Headline": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-headline"}),
        "Analysis": find_soup_text_with_default(player_div, "div", {"class": "PlayerNewsPost-analysis"}),
    }


def get_nfl_player_news(page_number: int = 1) -> pd.DataFrame:
    """Scrape one page of player news; returns an empty DataFrame when the page has no posts."""
    url = f"{NEWS_URL}?p={page_number}"
    request_page = requests.get(url, timeout=30)
    request_page.raise_for_status()
    soup = BeautifulSoup(request_page.content, "html.parser")
    player_div_list = soup.find_all("div", {"class": "PlayerNewsPost"})
    if not player_div_list:
        return pd.DataFrame()
    parsed_player_list = [parse_player_div(d) for d in player_div_list]
    df = pd.DataFrame(parsed_player_list)
    # Parse as UTC so timestamps compare cleanly against timezone-aware cutoffs.
    df["Date/Time"] = pd.to_datetime(df["Date/Time"], utc=True)
    return df


def get_player_news_window_hours(hours: int = 1) -> pd.DataFrame:
    """Collect player news posted within the last `hours` hours, paging until the cutoff."""
    cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(hours=hours)
    page = 1
    max_pages = 20  # safety limit so a parsing failure cannot page forever
    date_reached = False
    df_list = []
    while page <= max_pages and not date_reached:
        last_news = get_nfl_player_news(page)
        if last_news.empty:
            break  # ran out of results before reaching the cutoff
        df_list.append(last_news)
        date_reached = last_news["Date/Time"].min() < cutoff
        page += 1
    if not df_list:
        return pd.DataFrame()
    df = pd.concat(df_list, ignore_index=True)
    # The last page fetched usually overshoots the window; trim to the cutoff.
    return df[df["Date/Time"] >= cutoff]
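

# Example usage (a minimal sketch; assumes live network access to NEWS_URL and
# that the page markup still matches the selectors above):
if __name__ == "__main__":
    # Pull everything posted in the last 6 hours and show the first few rows.
    recent = get_player_news_window_hours(hours=6)
    print(recent.head(10))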