Add1E committed on
Commit
bff03e4
·
verified ·
1 Parent(s): 4102ffa

Delete pytrends/dailydata.py

Browse files
Files changed (1) hide show
  1. pytrends/dailydata.py +0 -127
pytrends/dailydata.py DELETED
@@ -1,127 +0,0 @@
1
- from datetime import date, timedelta
2
- from functools import partial
3
- from time import sleep
4
- from calendar import monthrange
5
-
6
- import pandas as pd
7
-
8
- from pytrends.exceptions import ResponseError
9
- from pytrends.request import TrendReq
10
-
11
-
12
def get_last_date_of_month(year: int, month: int) -> date:
    """Return the final calendar day of ``month`` in ``year`` as a ``date``.

    ``calendar.monthrange`` yields a pair whose second element is the number
    of days in the month, which is also the day number of its last day
    (leap years included).

    Source: https://stackoverflow.com/questions/42950/get-last-day-of-the-month-in-python
    """
    _first_weekday, days_in_month = monthrange(year, month)
    return date(year=year, month=month, day=days_in_month)
19
-
20
-
21
def convert_dates_to_timeframe(start: date, stop: date) -> str:
    """Render a date interval as a timeframe string for Google Trends.

    Both endpoints are formatted as ``YYYY-MM-DD`` and separated by a single
    space, e.g. ``'2020-01-01 2020-01-31'``.
    """
    day_format = '%Y-%m-%d'
    endpoints = (start.strftime(day_format), stop.strftime(day_format))
    return ' '.join(endpoints)
27
-
28
-
29
- def _fetch_data(pytrends, build_payload, timeframe: str) -> pd.DataFrame:
30
- """Attempts to fecth data and retries in case of a ResponseError."""
31
- attempts, fetched = 0, False
32
- while not fetched:
33
- try:
34
- build_payload(timeframe=timeframe)
35
- except ResponseError as err:
36
- print(err)
37
- print(f'Trying again in {60 + 5 * attempts} seconds.')
38
- sleep(60 + 5 * attempts)
39
- attempts += 1
40
- if attempts > 3:
41
- print('Failed after 3 attemps, abort fetching.')
42
- break
43
- else:
44
- fetched = True
45
- return pytrends.interest_over_time()
46
-
47
-
48
def get_daily_data(word: str,
                   start_year: int,
                   start_mon: int,
                   stop_year: int,
                   stop_mon: int,
                   geo: str = 'US',
                   verbose: bool = True,
                   wait_time: float = 5.0) -> pd.DataFrame:
    """Given a word, fetches daily search volume data from Google Trends and
    returns results in a pandas DataFrame.

    Details: Due to the way Google Trends scales and returns data, special
    care needs to be taken to make the daily data comparable over different
    months. To do that, we download daily data on a month by month basis,
    and also monthly data. The monthly data is downloaded in one go, so that
    the monthly values are comparable amongst themselves and can be used to
    scale the daily data. The daily data is scaled by multiplying the daily
    value by the monthly search volume divided by 100.
    For a more detailed explanation see http://bit.ly/trendsscaling

    Args:
        word (str): Word to fetch daily data for.
        start_year (int): the start year
        start_mon (int): start 1st day of the month
        stop_year (int): the end year
        stop_mon (int): end at the last day of the month
        geo (str): geolocation
        verbose (bool): If True, then prints the word and current time frame
            we are fetching the data for.
        wait_time (float): Seconds to sleep between two consecutive
            month-by-month requests.

    Returns:
        complete (pd.DataFrame): Contains 4 columns.
            The column named after the word argument contains the daily search
            volume already scaled and comparable through time.
            The column f'{word}_unscaled' is the original daily data fetched
            month by month, and it is not comparable across different months
            (but is comparable within a month).
            The column f'{word}_monthly' contains the original monthly data
            fetched at once. The values in this column have been
            forward-filled so that each day carries the most recent monthly
            value.
            The column 'scale' contains the scale used to obtain the scaled
            daily data.
            NOTE(review): 'isPartial' is dropped only from the daily frames;
            if the monthly frame carries an 'isPartial' column the join keeps
            it as an additional column -- confirm whether that is intended.

    Raises:
        ValueError: If the start month lies after the stop month.
    """
    # Reject an inverted range before any request is made; otherwise the
    # failure only surfaces later, when there is nothing to concatenate.
    if (start_year, start_mon) > (stop_year, stop_mon):
        raise ValueError(
            f'start ({start_year}-{start_mon}) must not be after '
            f'stop ({stop_year}-{stop_mon})')

    # Set up start and stop dates
    start_date = date(start_year, start_mon, 1)
    stop_date = get_last_date_of_month(stop_year, stop_mon)

    # Start pytrends; the region is selected per request through `geo`
    pytrends = TrendReq(hl='en-US', tz=360)
    # Initialize build_payload with the word we need data for
    build_payload = partial(pytrends.build_payload,
                            kw_list=[word], cat=0, geo=geo, gprop='')

    # Obtain data for the whole [start_date, stop_date] range in one request
    monthly = _fetch_data(pytrends, build_payload,
                          convert_dates_to_timeframe(start_date, stop_date))

    # Get daily data, month by month
    results = {}
    current = start_date
    while current < stop_date:
        last_date_of_month = get_last_date_of_month(current.year, current.month)
        timeframe = convert_dates_to_timeframe(current, last_date_of_month)
        if verbose:
            print(f'{word}:{timeframe}')
        results[current] = _fetch_data(pytrends, build_payload, timeframe)
        current = last_date_of_month + timedelta(days=1)
        # Pause only when another request follows: don't go too fast or
        # Google will send 429s.
        if current < stop_date:
            sleep(wait_time)

    daily = pd.concat(results.values()).drop(columns=['isPartial'])
    complete = daily.join(monthly, lsuffix='_unscaled', rsuffix='_monthly')

    # Scale daily data by monthly weights so the data is comparable.
    # Assign the filled column back explicitly: an in-place fill on the
    # selected column does not reliably update `complete` itself.
    monthly_col = f'{word}_monthly'
    complete[monthly_col] = complete[monthly_col].ffill()  # fill NaN values
    complete['scale'] = complete[monthly_col] / 100
    complete[word] = complete[f'{word}_unscaled'] * complete['scale']

    return complete