Spaces:

tensora
/

webcrawler

Running

App Files Files Community

Add1E committed on Mar 18

Commit

bff03e4

verified ·

1 Parent(s): 4102ffa

Delete pytrends/dailydata.py

Browse files

Files changed (1) hide show

pytrends/dailydata.py +0 -127

pytrends/dailydata.py DELETED Viewed

@@ -1,127 +0,0 @@
-from datetime import date, timedelta
-from functools import partial
-from time import sleep
-from calendar import monthrange
-import pandas as pd
-from pytrends.exceptions import ResponseError
-from pytrends.request import TrendReq
-def get_last_date_of_month(year: int, month: int) -> date:
-    """Return the final calendar day of ``month`` in ``year`` as a ``date``.
-    ``calendar.monthrange`` yields ``(weekday_of_first_day, days_in_month)``;
-    only the day count is used here, and it already accounts for leap years.
-    Source: https://stackoverflow.com/questions/42950/get-last-day-of-the-month-in-python
-    """
-    _, days_in_month = monthrange(year, month)
-    return date(year=year, month=month, day=days_in_month)
-def convert_dates_to_timeframe(start: date, stop: date) -> str:
-    """Build the timeframe string used to request data from Google Trends.
-    Both dates are rendered as ``YYYY-MM-DD`` and separated by one space,
-    e.g. ``'2021-01-01 2021-01-31'``, with ``start`` first and ``stop`` last.
-    """
-    day_format = '%Y-%m-%d'
-    return ' '.join(day.strftime(day_format) for day in (start, stop))
-def _fetch_data(pytrends, build_payload, timeframe: str, *,
-                max_retries: int = 3,
-                base_wait: float = 60,
-                wait_step: float = 5) -> pd.DataFrame:
-    """Attempts to fetch data and retries in case of a ResponseError.
-    Args:
-        pytrends: Client object whose ``interest_over_time()`` method is
-            called once the payload has been built successfully.
-        build_payload: Callable invoked as ``build_payload(timeframe=...)``.
-        timeframe (str): Time frame string passed through to build_payload.
-        max_retries (int): Number of additional attempts made after the
-            first failure. Negative values are treated as 0.
-        base_wait (float): Seconds to wait before the first retry.
-        wait_step (float): Extra seconds added to the wait for each
-            further retry.
-    Returns:
-        The DataFrame returned by ``pytrends.interest_over_time()`` on
-        success, or an empty DataFrame when every attempt raised
-        ResponseError.
-    """
-    retries = max(max_retries, 0)
-    for attempt in range(retries + 1):
-        try:
-            build_payload(timeframe=timeframe)
-        except ResponseError as err:
-            print(err)
-            # Only wait when another attempt will follow; sleeping right
-            # before giving up just delays the caller.
-            if attempt < retries:
-                delay = base_wait + wait_step * attempt
-                print(f'Trying again in {delay} seconds.')
-                sleep(delay)
-        else:
-            return pytrends.interest_over_time()
-    # Every attempt failed. Do NOT call interest_over_time() here: no payload
-    # was built for this timeframe, so the client could only answer from
-    # whatever state (if any) an earlier, unrelated payload left behind.
-    print(f'Failed after {retries} retries, abort fetching.')
-    return pd.DataFrame()
-def get_daily_data(word: str,
-                 start_year: int,
-                 start_mon: int,
-                 stop_year: int,
-                 stop_mon: int,
-                 geo: str = 'US',
-                 verbose: bool = True,
-                 wait_time: float = 5.0) -> pd.DataFrame:
-    """Given a word, fetches daily search volume data from Google Trends and
-    returns results in a pandas DataFrame.
-    Details: Due to the way Google Trends scales and returns data, special
-    care needs to be taken to make the daily data comparable over different
-    months. To do that, we download daily data on a month by month basis,
-    and also monthly data. The monthly data is downloaded in one go, so that
-    the monthly values are comparable amongst themselves and can be used to
-    scale the daily data. The daily data is scaled by multiplying the daily
-    value by the monthly search volume divided by 100.
-    For a more detailed explanation see http://bit.ly/trendsscaling
-    Args:
-        word (str): Word to fetch daily data for.
-        start_year (int): the start year
-        start_mon (int): start 1st day of the month
-        stop_year (int): the end year
-        stop_mon (int): end at the last day of the month
-        geo (str): geolocation
-        verbose (bool): If True, then prints the word and current time frame
-            we are fetching the data for.
-        wait_time (float): Seconds to sleep after each month-sized request.
-    Returns:
-        complete (pd.DataFrame): Contains the following columns.
-            The column named after the word argument contains the daily search
-            volume already scaled and comparable through time.
-            The column f'{word}_unscaled' is the original daily data fetched
-            month by month, and it is not comparable across different months
-            (but is comparable within a month).
-            The column f'{word}_monthly' contains the original monthly data
-            fetched at once. The values in this column have been
-            forward-filled, so each day carries the most recent monthly
-            value; rows before the first monthly value remain NaN.
-            The column 'scale' contains the scale used to obtain the scaled
-            daily data.
-            NOTE(review): 'isPartial' is dropped from the daily data only, so
-            the long-range frame's 'isPartial' column, if present, is
-            carried into the result by the join - confirm whether callers
-            rely on it.
-    Raises:
-        ValueError: If the start month lies after the stop month.
-    """
-    # Reject an inverted range before any request is made; otherwise no
-    # month would be fetched and pd.concat would fail on an empty list.
-    if (start_year, start_mon) > (stop_year, stop_mon):
-        raise ValueError(
-            f'start {start_year}-{start_mon:02d} is after '
-            f'stop {stop_year}-{stop_mon:02d}')
-    # Set up start and stop dates
-    start_date = date(start_year, start_mon, 1)
-    stop_date = get_last_date_of_month(stop_year, stop_mon)
-    # Start pytrends with an en-US interface; the region comes from `geo`
-    pytrends = TrendReq(hl='en-US', tz=360)
-    # Initialize build_payload with the word we need data for
-    build_payload = partial(pytrends.build_payload,
-                            kw_list=[word], cat=0, geo=geo, gprop='')
-    # Obtain data for the whole [start_date, stop_date] range in one request
-    monthly = _fetch_data(pytrends, build_payload,
-                         convert_dates_to_timeframe(start_date, stop_date))
-    # Get daily data, month by month
-    daily_chunks = []
-    current = start_date
-    while current < stop_date:
-        last_date_of_month = get_last_date_of_month(current.year, current.month)
-        timeframe = convert_dates_to_timeframe(current, last_date_of_month)
-        if verbose:
-            print(f'{word}:{timeframe}')
-        daily_chunks.append(_fetch_data(pytrends, build_payload, timeframe))
-        current = last_date_of_month + timedelta(days=1)
-        sleep(wait_time)  # don't go too fast or Google will send 429s
-    daily = pd.concat(daily_chunks).drop(columns=['isPartial'])
-    complete = daily.join(monthly, lsuffix='_unscaled', rsuffix='_monthly')
-    # Scale daily data by monthly weights so the data is comparable.
-    # Assign the filled column back explicitly: an in-place ffill on a column
-    # selection is not guaranteed to modify `complete` itself.
-    monthly_col = f'{word}_monthly'
-    complete[monthly_col] = complete[monthly_col].ffill()
-    complete['scale'] = complete[monthly_col] / 100
-    complete[word] = complete[f'{word}_unscaled'] * complete['scale']
-    return complete