Spaces:
Running
Running
Delete pytrends/dailydata.py
Browse files- pytrends/dailydata.py +0 -127
pytrends/dailydata.py
DELETED
@@ -1,127 +0,0 @@
|
|
1 |
-
from datetime import date, timedelta
|
2 |
-
from functools import partial
|
3 |
-
from time import sleep
|
4 |
-
from calendar import monthrange
|
5 |
-
|
6 |
-
import pandas as pd
|
7 |
-
|
8 |
-
from pytrends.exceptions import ResponseError
|
9 |
-
from pytrends.request import TrendReq
|
10 |
-
|
11 |
-
|
12 |
-
def get_last_date_of_month(year: int, month: int) -> date:
    """Return the final calendar day of ``month`` in ``year`` as a ``date``.

    The day count comes from ``calendar.monthrange``, so leap years are
    handled by the standard library.

    Source: https://stackoverflow.com/questions/42950/get-last-day-of-the-month-in-python
    """
    # monthrange() yields (weekday of the 1st, number of days in the month);
    # only the day count is needed here.
    _first_weekday, days_in_month = monthrange(year, month)
    return date(year, month, days_in_month)
|
19 |
-
|
20 |
-
|
21 |
-
def convert_dates_to_timeframe(start: date, stop: date) -> str:
    """Format a date interval as a ``'YYYY-MM-DD YYYY-MM-DD'`` string.

    The result is the start date and the stop date, each rendered with
    ``%Y-%m-%d`` and separated by a single space. Callers in this module
    pass it as the ``timeframe`` of a Google Trends request.
    """
    day_format = '%Y-%m-%d'
    return ' '.join(day.strftime(day_format) for day in (start, stop))
|
27 |
-
|
28 |
-
|
29 |
-
def _fetch_data(pytrends, build_payload, timeframe: str,
                max_retries: int = 3) -> pd.DataFrame:
    """Fetch interest-over-time data for ``timeframe``, retrying on ResponseError.

    Args:
        pytrends: Object exposing ``interest_over_time()``; it is queried
            only after ``build_payload`` has succeeded for ``timeframe``.
        build_payload: Callable accepting a ``timeframe`` keyword argument
            that prepares the request on ``pytrends``.
        timeframe (str): Interval string handed to ``build_payload``.
        max_retries (int): How many times a failed ``build_payload`` call is
            retried before giving up. The default of 3 keeps the previous
            behaviour of four attempts in total.

    Returns:
        pd.DataFrame: Whatever ``pytrends.interest_over_time()`` returns, or
        an empty DataFrame when every attempt raised ``ResponseError``.
    """
    failures = 0
    while True:
        try:
            build_payload(timeframe=timeframe)
        except ResponseError as err:
            print(err)
            failures += 1
            if failures > max_retries:
                # Give up without sleeping again, and do NOT query pytrends:
                # no payload was built for this timeframe, so it could only
                # fail or hand back data belonging to an earlier request.
                print(f'Failed after {failures} attempts, abort fetching.')
                return pd.DataFrame()
            # Linear back-off: 60s, 65s, 70s, ... between attempts.
            delay = 60 + 5 * (failures - 1)
            print(f'Trying again in {delay} seconds.')
            sleep(delay)
        else:
            return pytrends.interest_over_time()
|
46 |
-
|
47 |
-
|
48 |
-
def get_daily_data(word: str,
                   start_year: int,
                   start_mon: int,
                   stop_year: int,
                   stop_mon: int,
                   geo: str = 'US',
                   verbose: bool = True,
                   wait_time: float = 5.0) -> pd.DataFrame:
    """Given a word, fetches daily search volume data from Google Trends and
    returns results in a pandas DataFrame.

    Details: Due to the way Google Trends scales and returns data, special
    care needs to be taken to make the daily data comparable over different
    months. To do that, we download daily data on a month by month basis,
    and also monthly data. The monthly data is downloaded in one go, so that
    the monthly values are comparable amongst themselves and can be used to
    scale the daily data. The daily data is scaled by multiplying the daily
    value by the monthly search volume divided by 100.
    For a more detailed explanation see http://bit.ly/trendsscaling

    Args:
        word (str): Word to fetch daily data for.
        start_year (int): the start year
        start_mon (int): start 1st day of the month
        stop_year (int): the end year
        stop_mon (int): end at the last day of the month
        geo (str): geolocation
        verbose (bool): If True, then prints the word and current time frame
            we are fetching the data for.
        wait_time (float): Seconds to sleep between two consecutive monthly
            requests.

    Returns:
        complete (pd.DataFrame): The columns below, plus any other column of
        the one-shot frame that is not dropped here (presumably its own
        'isPartial' -- verify against the response).
            The column named after the word argument contains the daily search
            volume already scaled and comparable through time.
            The column f'{word}_unscaled' is the original daily data fetched
            month by month, and it is not comparable across different months
            (but is comparable within a month).
            The column f'{word}_monthly' contains the original monthly data
            fetched at once. The values in this column have been
            forward-filled so that each day carries the latest preceding value.
            The column 'scale' contains the scale used to obtain the scaled
            daily data.

    Raises:
        ValueError: If the start month lies after the stop month.
    """
    # Set up start and stop dates
    start_date = date(start_year, start_mon, 1)
    stop_date = get_last_date_of_month(stop_year, stop_mon)
    if start_date > stop_date:
        # Fail before any request is sent; otherwise the loop below would
        # collect nothing and pd.concat would raise an opaque ValueError.
        raise ValueError(
            f'Start month {start_year}-{start_mon:02d} is after '
            f'stop month {stop_year}-{stop_mon:02d}.')

    # Start pytrends; the region is applied per request through ``geo``
    pytrends = TrendReq(hl='en-US', tz=360)
    # Initialize build_payload with the word we need data for
    build_payload = partial(pytrends.build_payload,
                            kw_list=[word], cat=0, geo=geo, gprop='')

    # Obtain the data for the whole [start_date, stop_date] range in one go
    monthly = _fetch_data(pytrends, build_payload,
                          convert_dates_to_timeframe(start_date, stop_date))

    # Get daily data, month by month
    results = {}
    current = start_date
    while current < stop_date:
        last_date_of_month = get_last_date_of_month(current.year, current.month)
        timeframe = convert_dates_to_timeframe(current, last_date_of_month)
        if verbose:
            print(f'{word}:{timeframe}')
        results[current] = _fetch_data(pytrends, build_payload, timeframe)
        current = last_date_of_month + timedelta(days=1)
        if current < stop_date:
            # don't go too fast or Google will send 429s; there is nothing
            # to wait for once the last month has been fetched
            sleep(wait_time)

    daily = pd.concat(results.values()).drop(columns=['isPartial'])
    complete = daily.join(monthly, lsuffix='_unscaled', rsuffix='_monthly')

    # Scale daily data by monthly weights so the data is comparable
    monthly_col = f'{word}_monthly'
    # Assign the filled column back: an in-place ffill on ``complete[col]``
    # works on a temporary and is not propagated under pandas Copy-on-Write.
    complete[monthly_col] = complete[monthly_col].ffill()  # fill NaN values
    complete['scale'] = complete[monthly_col] / 100
    complete[word] = complete[f'{word}_unscaled'] * complete['scale']

    return complete
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|