kristada673 committed
Commit 9014990 · 1 Parent(s): 2f8bae4

Upload 23 files

finnlp/data_sources/news/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (174 Bytes).
 
finnlp/data_sources/news/__pycache__/_base.cpython-310.pyc ADDED
Binary file (1.36 kB).
 
finnlp/data_sources/news/__pycache__/finnhub_date_range.cpython-310.pyc ADDED
Binary file (5.66 kB).
 
finnlp/data_sources/news/_base.py ADDED
@@ -0,0 +1,22 @@
+from finnlp.data_sources._base import FinNLP_Downloader
+
+class News_Downloader(FinNLP_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        pass
+
+    def download_date_range(self, start_date, end_date, stock=None):
+        pass
+
+    def download_streaming(self, stock=None):
+        pass
+
+    def clean_data(self):
+        pass
+
+    def _gather_one_part(self, date, stock=None, delay=0.1):
+        pass
+
+    def _gather_content(self):
+        pass
finnlp/data_sources/news/akshare_cctv.py ADDED
@@ -0,0 +1,29 @@
+import pandas as pd
+import akshare as ak
+from tqdm.notebook import tqdm
+from finnlp.data_sources.news._base import News_Downloader
+
+
+class Akshare_cctv(News_Downloader):
+
+    def __init__(self, args={}):
+        pass
+
+    def download_news(self, start_date, end_date, stock="all"):
+        self.date_list = pd.date_range(start_date, end_date)
+        res = pd.DataFrame()
+        for date in tqdm(self.date_list):
+            tmp = self.gather_one_day_news(date)
+            res = pd.concat([res, tmp])
+        self.dataframe = res
+
+    def clean_data(self):
+        pass
+
+    def gather_one_day_news(self, date, stock="all", delay=0.1):
+        date = self.transfer_standard_date_to_nonstandard(date)
+        res = ak.news_cctv(date=date)
+        return res
+
+    def transfer_standard_date_to_nonstandard(self, date):
+        return date.strftime("%Y%m%d")
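A minimal usage sketch for Akshare_cctv (not part of this commit; it assumes akshare is installed and the CCTV feed is reachable):

    from finnlp.data_sources.news.akshare_cctv import Akshare_cctv

    downloader = Akshare_cctv()
    downloader.download_news(start_date="2023-01-01", end_date="2023-01-03")
    print(downloader.dataframe.head())  # one row per CCTV news item returned by ak.news_cctv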
finnlp/data_sources/news/alliancenews_streaming.py ADDED
File without changes
finnlp/data_sources/news/cnbc_streaming.py ADDED
File without changes
finnlp/data_sources/news/eastmoney_streaming.py ADDED
@@ -0,0 +1,69 @@
+import requests
+from lxml import etree
+from tqdm import tqdm
+import pandas as pd
+from finnlp.data_sources.news._base import News_Downloader
+
+
+class Eastmoney_Streaming(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_streaming_stock(self, stock="600519", rounds=3):
+        print("Getting pages: ", end="")
+        if rounds > 0:
+            for r in range(rounds):
+                br = self._gather_pages(stock, r)
+                if br == "break":
+                    break
+        else:
+            r = 1
+            error_count = 0
+            while 1:
+                br = self._gather_pages(stock, r)
+                if br == "break":
+                    break
+                elif br == "Error":
+                    error_count += 1
+                    if error_count > 10:
+                        print("Connection Error")
+                        break
+                r += 1
+        print(f"Got {r+1} pages in total.")
+        self.dataframe = self.dataframe.reset_index(drop=True)
+
+    def _gather_pages(self, stock, page):
+        print(page, end=" ")
+        url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+        }
+
+        requests.DEFAULT_RETRIES = 5  # increase the number of connection retries
+        s = requests.session()
+        s.keep_alive = False  # close redundant connections
+
+        response = self._request_get(url, headers=headers)
+        if response is None or response.status_code != 200:
+            return "Error"
+
+        # gather the content of the page
+        page = etree.HTML(response.text)
+        trs = page.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr')
+        have_one = False
+        for item in trs:
+            have_one = True
+            read_amount = item.xpath("./td[1]//text()")[0]
+            comments = item.xpath("./td[2]//text()")[0]
+            title = item.xpath("./td[3]/div/a//text()")[0]
+            content_link = item.xpath("./td[3]/div/a/@href")[0]
+            author = item.xpath("./td[4]//text()")[0]
+            time = item.xpath("./td[5]//text()")[0]
+            tmp = pd.DataFrame([read_amount, comments, title, content_link, author, time]).T
+            columns = ["read amount", "comments", "title", "content link", "author", "create time"]
+            tmp.columns = columns
+            self.dataframe = pd.concat([self.dataframe, tmp])
+            # print(title)
+        if have_one == False:
+            return "break"
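A minimal usage sketch for Eastmoney_Streaming (not part of this commit; the Guba page layout can change, so the XPath above is best-effort):

    from finnlp.data_sources.news.eastmoney_streaming import Eastmoney_Streaming

    downloader = Eastmoney_Streaming()
    downloader.download_streaming_stock(stock="600519", rounds=3)  # first three forum pages for stock 600519
    print(downloader.dataframe[["title", "create time"]].head())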
finnlp/data_sources/news/finnhub_date_range.py ADDED
@@ -0,0 +1,222 @@
+import warnings
+warnings.filterwarnings("ignore")
+
+from finnlp.data_sources.news._base import News_Downloader
+
+from tqdm import tqdm
+from lxml import etree
+import pandas as pd
+import requests
+import finnhub
+import time
+import json
+
+class Finnhub_Date_Range(News_Downloader):
+    def __init__(self, args={}):
+        super().__init__(args)
+        assert "token" in args.keys(), "Please input your finnhub token. Available at https://finnhub.io/dashboard"
+        self.finnhub_client = finnhub.Client(api_key=args["token"])
+
+    def download_date_range_stock(self, start_date, end_date, stock="AAPL"):
+        self.date_list = pd.date_range(start_date, end_date)
+        self.dataframe = pd.DataFrame()
+
+        days_each_time = 4
+        date_list = self.date_list
+        # calculate the total number of batches
+        if len(date_list) % days_each_time == 0:
+            total = len(date_list) // days_each_time
+        else:
+            total = len(date_list) // days_each_time + 1
+
+        with tqdm(total=total, desc="Downloading Titles") as bar:
+            while len(date_list):
+                tmp_date_list = date_list[:days_each_time]
+                date_list = date_list[days_each_time:]
+                tmp_start_date = tmp_date_list[0].strftime("%Y-%m-%d")
+                tmp_end_date = tmp_date_list[-1].strftime("%Y-%m-%d")
+                res = self._gather_one_part(tmp_start_date, tmp_end_date, stock=stock)
+                self.dataframe = pd.concat([self.dataframe, res])
+                bar.update(1)
+
+        # res = self.finnhub_client.company_news(stock, _from=start_date, to=end_date)
+        self.dataframe.datetime = pd.to_datetime(self.dataframe.datetime, unit="s")
+        self.dataframe = self.dataframe.reset_index(drop=True)
+
+    def _gather_one_part(self, start_date, end_date, stock="AAPL", delay=1):
+        res = self.finnhub_client.company_news(stock, _from=start_date, to=end_date)
+        time.sleep(delay)
+        return pd.DataFrame(res)
+
+    def gather_content(self, delay=0.01):
+        pbar = tqdm(total=self.dataframe.shape[0], desc="Gathering news contents")
+        self.dataframe["content"] = self.dataframe.apply(lambda x: self._gather_content_apply(x, pbar, delay), axis=1)
+
+    def _gather_content_apply(self, x, pbar, delay=0.01):
+        time.sleep(delay)
+        url = x.url
+        source = x.source
+        response = self._request_get(url=url)
+        # response = self._request_get(url=url, headers=headers)
+        pbar.update(1)
+        if response is None:
+            return "Connection Error"
+        else:
+            page = etree.HTML(response.text)
+
+        try:
+            # Yahoo Finance
+            if source == "Yahoo":
+                page = page.xpath("/html/body/div[3]/div[1]/div/main/div[1]/div/div/div/div/article/div/div/div/div/div/div[2]/div[4]")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # Reuters
+            elif source == "Reuters":
+                page = page.xpath("/html/body/div[1]/div[3]/div/main/article/div[1]/div[2]/div/div/div[2]")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # SeekingAlpha
+            elif source == "SeekingAlpha":
+                page = page.xpath("/html/body/div[2]/div/div[1]/main/div/div[2]/div/article/div/div/div[2]/div/section[1]/div/div/div")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # PennyStocks
+            elif source == "PennyStocks":
+                page = page.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/article/div[2]/div[2]/div")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # MarketWatch
+            elif source == "MarketWatch":
+                page = page.xpath('//*[@id="js-article__body"]')
+                content = page[0].xpath(".//text()")
+                content = "".join(content)
+                # collapse repeated whitespace
+                while "  " in content:
+                    content = content.replace("  ", " ")
+                while "\n \n" in content:
+                    content = content.replace("\n \n", " ")
+                while "\n " in content:
+                    content = content.replace("\n ", " ")
+                return content
+
+            # Seeking Alpha
+            elif source == "Seeking Alpha":
+                # first get the Seeking Alpha URL
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url=url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # Alliance News
+            elif source == "Alliance News":
+                page = page.xpath('//*[@id="comtext"]')
+                content = page[0].xpath(".//text()")
+                content = [c for c in content if not str(c).startswith("\r\n")]
+                content = "\n".join(content)
+                return content
+
+            # Thefly.com
+            elif source == "Thefly.com":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url=url_new, verify=False)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                page = page.xpath('/html/body/div[2]/div/div/div/div/div[2]/div[2]//text()')
+                # content = page[0].xpath(".//text()")
+                # content = [c for c in content if not str(c).startswith("\r\n")]
+                content = "\n".join(page)
+                content = content.replace("\r\n", "")
+
+                return content
+
+            # TalkMarkets
+            elif source == "TalkMarkets":
+                return "Not supported yet"
+
+            # CNBC
+            elif source == "CNBC":
+                page = page.xpath('/html/body/div[3]/div/div[1]/div[3]/div/div/div/div[3]/div[1]/div[2]/div[3]//text()')
+                content = "\n".join(page)
+
+                return content
+
+            # GuruFocus
+            elif source == "GuruFocus":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url=url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                page = page.xpath('/html/body/div[1]/div/section/section/main/section/main/div[1]/div/div/div[1]/div[2]/div//text()')
+                page_new = []
+                for c in page:
+                    # collapse newlines and repeated spaces
+                    while "\n" in c:
+                        c = c.replace("\n", "")
+                    while "  " in c:
+                        c = c.replace("  ", " ")
+                    page_new.append(c)
+
+                content = "\n".join(page_new)
+
+                return content
+
+            # InvestorPlace
+            elif source == "InvestorPlace":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url=url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                page = page.xpath('//script[@type="application/ld+json"]')[1]
+                content = page.xpath(".//text()")
+                content = json.loads(content[0])
+                content = content["articleBody"]
+
+                return content
+
+            # TipRanks
+            elif source == "TipRanks":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url=url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                # /html/body/div[1]/div[2]/div[5]/div[2]/div[2]/div/div[6]/div/article/p[1]/p
+                page = page.xpath('/html/body/div[1]/div[1]/div[4]/div[2]/div[2]/div[1]/div[6]//text()')
+                # content = page[0].xpath('.//text()')
+                page = [p.replace("\n", "") for p in page]
+                content = "".join(page)
+                return content
+
+            else:
+                return "Not supported yet"
+
+        except:
+            return "Error"
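A minimal usage sketch for Finnhub_Date_Range (not part of this commit; YOUR_FINNHUB_TOKEN is a placeholder for a real API key):

    from finnlp.data_sources.news.finnhub_date_range import Finnhub_Date_Range

    downloader = Finnhub_Date_Range(args={"token": "YOUR_FINNHUB_TOKEN"})
    downloader.download_date_range_stock(start_date="2023-01-01", end_date="2023-01-07", stock="AAPL")
    downloader.gather_content()  # optional: scrape full article text for the supported sources
    print(downloader.dataframe[["datetime", "source", "url"]].head())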
finnlp/data_sources/news/fmp_streaming.py ADDED
@@ -0,0 +1,24 @@
+import json
+import requests
+import pandas as pd
+from tqdm.notebook import tqdm
+
+df = pd.read_csv("NAS.csv", index_col=0)
+stock_list = df.index.to_list()
+
+api_key = YOUR_API_KEY  # You may find your api key here: https://site.financialmodelingprep.com/developer/docs/api-keys
+
+all = pd.DataFrame()
+for stock in tqdm(stock_list):
+    for page in tqdm(range(500)):
+        url = f"https://financialmodelingprep.com/api/v3/stock_news?tickers={stock}&page={page+1}&apikey={api_key}"
+        res = requests.get(url)
+        res = json.loads(res.text)
+        if len(res) == 0:
+            break
+        else:
+            res = pd.DataFrame(res)
+            all = pd.concat([all, res])
+
+all = all.reset_index(drop=True)
+all.to_csv("dataset_more.csv")
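Note that fmp_streaming.py is a standalone script rather than a News_Downloader subclass: it expects a NAS.csv whose first column holds ticker symbols, and a real key in place of the YOUR_API_KEY placeholder. A minimal sketch (an assumption, not part of the commit) for producing such a file:

    import pandas as pd

    # hypothetical ticker list; the real NAS.csv would list the NASDAQ symbols of interest
    pd.DataFrame({"ticker": ["AAPL", "MSFT", "NVDA"]}).set_index("ticker").to_csv("NAS.csv")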
finnlp/data_sources/news/gurufocus_streaming.py ADDED
File without changes
finnlp/data_sources/news/investorplace_streaming.py ADDED
File without changes
finnlp/data_sources/news/marketwatch_streaming.py ADDED
File without changes
finnlp/data_sources/news/pennystocks_streaming.py ADDED
@@ -0,0 +1,81 @@
+import requests
+from lxml import etree
+from tqdm import tqdm
+import pandas as pd
+import json
+import time
+from finnlp.data_sources.news._base import News_Downloader
+
+# TODO:
+# 1. More Pages
+# 2. Contents
+
+class PennyStocks_Streaming(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_streaming_search(self, keyword="apple", rounds=3, delay=2):
+        # establish session
+        self._connect_session()
+
+        # download first page
+        self._download_first_page(keyword, delay=delay)
+
+        # download the following pages
+        # self._download_other_pages(keyword)
+        print("Only support the first page now!")
+
+    def _connect_session(self):
+        # since the server checks cookies, we first request the main site
+        # without cookies, then run the search for the stock information we want
+        self.session = requests.session()
+        first_url = "https://pennystocks.com/"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+        }
+        print("Requesting https://pennystocks.com ...", end=" ")
+        res = self.session.get(headers=headers, url=first_url)
+        if res.status_code != 200:
+            raise ConnectionError("Can't request https://pennystocks.com. Please check your connection or report this issue on Github")
+
+        print("succeeded!")
+
+    def _download_first_page(self, keyword="apple", max_retry=5, delay=2):
+        url = f"https://pennystocks.com/?s={keyword}"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+        }
+        res = self.session.get(url=url, headers=headers)
+        res = etree.HTML(res.text)
+        articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article")
+        # not sure why but this really works
+
+        while max_retry and len(articles) == 0:
+            time.sleep(delay)
+            print("Gathering again ..", end=' ')
+            res = requests.get(url=url, headers=headers, cookies=self.session.cookies)
+            res = etree.HTML(res.text)
+            articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article")
+            max_retry -= 1
+            print(f"Remaining Retry: {max_retry}")
+
+        for a in articles:
+            title = a.xpath("./header/h2/a//text()")[0]
+            # renamed from `time` so the loop variable does not shadow the time module
+            publish_time = a.xpath("./div[3]/div/div/ul/li[1]/text()")[0]
+            brief = a.xpath("./div[3]/div/div/text()")[0]
+            reading_time = a.xpath("./div[3]/div/div/ul/li[2]/text()")[0]
+            columns = ["title", "time", "brief", "reading_time"]
+            tmp = pd.DataFrame([[title, publish_time, brief, reading_time]], columns=columns)
+            self.dataframe = pd.concat([self.dataframe, tmp])
+
+    def _download_other_pages(self, keyword="apple"):
+        pass
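A minimal usage sketch for PennyStocks_Streaming (not part of this commit; as noted in the TODO, only the first result page is gathered):

    from finnlp.data_sources.news.pennystocks_streaming import PennyStocks_Streaming

    downloader = PennyStocks_Streaming()
    downloader.download_streaming_search(keyword="apple")
    print(downloader.dataframe[["title", "time"]].head())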
finnlp/data_sources/news/reuters_streaming.py ADDED
@@ -0,0 +1,55 @@
+import requests
+from lxml import etree
+from tqdm import tqdm
+import pandas as pd
+import json
+import time
+from finnlp.data_sources.news._base import News_Downloader
+
+# TODO:
+# 1. Contents
+
+
+class Reuters_Streaming(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_streaming_search(self, keyword="apple", rounds=3, delay=0.5):
+        news_per_page = 20
+        url = "https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+            "Referer": "https://www.reuters.com/site-search/?query=AAPL&sort=newest&offset=0"
+        }
+
+        print("Getting pages: ", end="")
+        for i in range(rounds):
+            offset = i * news_per_page
+            params = {
+                "query": f'{{"keyword":"{keyword}","offset":{offset},"orderby":"display_date:desc","size":20,"website":"reuters"}}',
+                "d": "144",
+                "_website": "reuters",
+            }
+            response = self._request_get(url, headers=headers, params=params)
+
+            # check connection error
+            if response is None or response.status_code != 200:
+                return "Error"
+
+            # parse the response
+            response = json.loads(response.text)
+
+            # check whether content was returned
+            if response["statusCode"] != 200:
+                print("Early Stopping")
+                break
+
+            # make pandas DataFrame
+            tmp = pd.DataFrame(response["result"]["articles"])
+            self.dataframe = pd.concat([self.dataframe, tmp])
+
+            # finish
+            print(i+1, end=" ")
+            time.sleep(delay)
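A minimal usage sketch for Reuters_Streaming (not part of this commit; each round requests one 20-article page from the Reuters search endpoint):

    from finnlp.data_sources.news.reuters_streaming import Reuters_Streaming

    downloader = Reuters_Streaming()
    downloader.download_streaming_search(keyword="apple", rounds=3)
    print(downloader.dataframe.shape)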
finnlp/data_sources/news/seekingalpha_date_range.py ADDED
@@ -0,0 +1,57 @@
+import warnings
+warnings.filterwarnings("ignore")
+
+import json
+import requests
+import pandas as pd
+from lxml import etree
+from tqdm import tqdm
+from datetime import datetime
+
+from finnlp.data_sources.news._base import News_Downloader
+
+class SeekingAlpha_Date_Range(News_Downloader):
+    def __init__(self, args={}):
+        super().__init__(args)
+
+    def download_date_range_stock(self, start_date, end_date, stock="AAPL", proxies=None):
+        self.dataframe = pd.DataFrame()
+        start_timestamp = int(datetime.strptime(start_date + '-13', "%Y-%m-%d-%H").timestamp())
+        end_timestamp = int(datetime.strptime(end_date + '-13', "%Y-%m-%d-%H").timestamp())
+        # Downloading First Page
+        data, totalpages = self._gather_by_page(start_timestamp, end_timestamp, stock, 1, proxies)
+        self.dataframe = pd.concat([self.dataframe, data])
+
+        # Downloading Other Pages
+        with tqdm(total=totalpages, desc="Downloading Titles") as bar:
+            bar.update(1)
+            for page in range(2, totalpages + 1):
+                data, _ = self._gather_by_page(start_timestamp, end_timestamp, stock, page, proxies)
+                self.dataframe = pd.concat([self.dataframe, data])
+                bar.update(1)
+        self.dataframe = self.dataframe.reset_index(drop=True)
+
+    def _gather_by_page(self, start_timestamp, end_timestamp, stock, page=1, proxies=None):
+        url = f"https://seekingalpha.com/api/v3/symbols/{stock}/news?filter[since]={start_timestamp}&filter[until]={end_timestamp}&id={stock}&include=author%2CprimaryTickers%2CsecondaryTickers%2Csentiments&isMounting=true&page[size]=40&page[number]={page}"
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0',
+            'Referer': 'https://seekingalpha.com/symbol/aapl/news?from=2009-12-31T16%3A00%3A00.000Z&to=2022-01-01T15%3A59%3A59.999Z'
+        }
+        response = requests.get(url, headers=headers, proxies=proxies)
+        if response.status_code != 200:
+            print(f"stock: {stock}, page: {page} went wrong!")
+            return pd.DataFrame(), 1
+        else:
+            res = json.loads(response.text)
+            data = pd.DataFrame(res["data"])
+            # make new features
+            new_columns = ["publishOn", "isLockedPro", "commentCount", "gettyImageUrl", "videoPreviewUrl", "themes", "title", "isPaywalled"]
+            data[new_columns] = data.apply(lambda x: list(x.attributes.values()), axis=1, result_type="expand")
+            new_columns = ["author", "sentiments", "primaryTickers", "secondaryTickers", "otherTags"]
+            data[new_columns] = data.apply(lambda x: list(x.relationships.values()), axis=1, result_type="expand")
+
+            # total pages
+            totalpages = res["meta"]["page"]["totalPages"]
+            return data, totalpages
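A minimal usage sketch for SeekingAlpha_Date_Range (not part of this commit; a proxies dict can be passed through to requests if needed):

    from finnlp.data_sources.news.seekingalpha_date_range import SeekingAlpha_Date_Range

    downloader = SeekingAlpha_Date_Range()
    downloader.download_date_range_stock(start_date="2023-01-01", end_date="2023-01-31", stock="AAPL")
    print(downloader.dataframe[["title", "publishOn"]].head())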
finnlp/data_sources/news/sina_finance_date_range.py ADDED
@@ -0,0 +1,86 @@
+import json
+import pytz
+import time
+import requests
+import pandas as pd
+import numpy as np
+from lxml import etree
+from tqdm import tqdm
+from finnlp.data_sources.news._base import News_Downloader
+
+class Sina_Finance_Date_Range(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_date_range_all(self, start_date, end_date):
+        self.date_list = pd.date_range(start_date, end_date)
+        for date in tqdm(self.date_list, desc="Downloading Titles..."):
+            tmp = self._gather_one_day(date)
+            self.dataframe = pd.concat([self.dataframe, tmp])
+        self.dataframe = self.dataframe.reset_index(drop=True)
+
+    def _gather_one_day(self, date, delay=0.1):
+        end_timestamp = pd.to_datetime(f"{date} 16:00:00").timestamp()
+        start_timestamp = end_timestamp - 60 * 60 * 24
+
+        res = pd.DataFrame()
+        for page in range(100):
+            url = f"https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2516&etime={start_timestamp}&stime={end_timestamp}&ctime={end_timestamp}&date={date}&k=&num=50&page={page}"
+            response = self._request_get(url=url)
+            if response is not None:
+                response.encoding = 'unicode'
+                text = response.text
+                text = json.loads(text, strict=True)
+                text = text["result"]
+                text = text["data"]
+                if len(text) == 0:
+                    break
+
+                for i in text:
+                    for ii in i.keys():
+                        i[ii] = [i[ii]]
+                    tmp = pd.DataFrame(i)
+                    res = pd.concat([res, tmp])
+            time.sleep(delay)
+
+        if res.shape[0] != 0:
+            res.ctime = pd.to_datetime(res.ctime, unit="s", utc=True)
+            res.mtime = pd.to_datetime(res.mtime, unit="s", utc=True)
+            res.intime = pd.to_datetime(res.intime, unit="s", utc=True)
+
+            tz = pytz.timezone("Asia/Shanghai")
+            res.ctime = [t.astimezone(tz) for t in res.ctime]
+            res.mtime = [t.astimezone(tz) for t in res.mtime]
+            res.intime = [t.astimezone(tz) for t in res.intime]
+
+        return res
+
+    def gather_content(self, delay=0.01):
+        pbar = tqdm(total=self.dataframe.shape[0], desc="Gathering news contents")
+        self.dataframe["content"] = self.dataframe.apply(lambda x: self._gather_content_apply(x, pbar, delay), axis=1)
+
+    def _gather_content_apply(self, x, pbar, delay=0.01):
+        url = x.url
+        response = self._request_get(url=url)
+
+        if response is not None:
+            # process
+            response.encoding = 'unicode'
+            text = response.text
+            page = etree.HTML(text)
+            page = page.xpath("//*[@id='artibody']/p")
+            page = [p.xpath(".//text()") for p in page]
+            page = [''.join(p) for p in page]
+            content = "\n".join(page)
+            content = content.replace("\u3000", "")
+        else:
+            content = np.nan
+
+        # update
+        pbar.update(1)
+        time.sleep(delay)
+
+        return content
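A minimal usage sketch for Sina_Finance_Date_Range (not part of this commit; timestamps are converted to Asia/Shanghai as in the code above):

    from finnlp.data_sources.news.sina_finance_date_range import Sina_Finance_Date_Range

    downloader = Sina_Finance_Date_Range()
    downloader.download_date_range_all(start_date="2023-01-01", end_date="2023-01-02")
    downloader.gather_content()  # optional: fetch article bodies
    print(downloader.dataframe[["ctime", "url", "content"]].head())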
finnlp/data_sources/news/talkmarkets_streaming.py ADDED
File without changes
finnlp/data_sources/news/thefly_streaming.py ADDED
File without changes
finnlp/data_sources/news/tipranks_streaming.py ADDED
File without changes
finnlp/data_sources/news/tushare_major_news.py ADDED
@@ -0,0 +1,32 @@
+import tushare as ts
+import pandas as pd
+from tqdm.notebook import tqdm
+from finnlp.data_sources.news._base import News_Downloader
+import time
+
+class Tushare_Major_News(News_Downloader):
+
+    def __init__(self, args={}):
+        token = args["token"] if "token" in args.keys() else "27080ec403c0218f96f388bca1b1d85329d563c91a43672239619ef5"
+        ts.set_token(token)
+        self.pro = ts.pro_api()
+
+    def download_news(self, start_date, end_date, stock="all"):
+        self.date_list = pd.date_range(start_date, end_date)
+        res = pd.DataFrame()
+        for date in tqdm(self.date_list):
+            tmp = self.gather_one_day_news(date)
+            res = pd.concat([res, tmp])
+        self.dataframe = res
+
+    def gather_one_day_news(self, date, stock="all", delay=0.1):
+        date = self.transfer_standard_date_to_nonstandard(date)
+        res = self.pro.major_news(start_date=date, end_date=date)
+        time.sleep(delay)
+        return res
+
+    def clean_data(self):
+        pass
+
+    def transfer_standard_date_to_nonstandard(self, date):
+        return date.strftime("%Y-%m-%d 00:00:00")
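A minimal usage sketch for Tushare_Major_News (not part of this commit; YOUR_TUSHARE_TOKEN is a placeholder, and the major_news endpoint may require sufficient tushare account credits):

    from finnlp.data_sources.news.tushare_major_news import Tushare_Major_News

    downloader = Tushare_Major_News(args={"token": "YOUR_TUSHARE_TOKEN"})
    downloader.download_news(start_date="2023-01-01", end_date="2023-01-02")
    print(downloader.dataframe.head())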
finnlp/data_sources/news/yahoo_streaming.py ADDED
File without changes
finnlp/data_sources/news/yicai_streaming.py ADDED
File without changes