Commit · 9014990
Parent(s): 2f8bae4
Upload 23 files
- finnlp/data_sources/news/__pycache__/__init__.cpython-310.pyc +0 -0
- finnlp/data_sources/news/__pycache__/_base.cpython-310.pyc +0 -0
- finnlp/data_sources/news/__pycache__/finnhub_date_range.cpython-310.pyc +0 -0
- finnlp/data_sources/news/_base.py +22 -0
- finnlp/data_sources/news/akshare_cctv.py +29 -0
- finnlp/data_sources/news/alliancenews_streaming.py +0 -0
- finnlp/data_sources/news/cnbc_streaming.py +0 -0
- finnlp/data_sources/news/eastmoney_streaming.py +69 -0
- finnlp/data_sources/news/finnhub_date_range.py +222 -0
- finnlp/data_sources/news/fmp_streaming.py +24 -0
- finnlp/data_sources/news/gurufocus_streaming.py +0 -0
- finnlp/data_sources/news/investorplace_streaming.py +0 -0
- finnlp/data_sources/news/marketwatch_streaming.py +0 -0
- finnlp/data_sources/news/pennystocks_streaming.py +81 -0
- finnlp/data_sources/news/reuters_streaming.py +55 -0
- finnlp/data_sources/news/seekingalpha_date_range.py +57 -0
- finnlp/data_sources/news/sina_finance_date_range.py +86 -0
- finnlp/data_sources/news/talkmarkets_streaming.py +0 -0
- finnlp/data_sources/news/thefly_streaming.py +0 -0
- finnlp/data_sources/news/tipranks_streaming.py +0 -0
- finnlp/data_sources/news/tushare_major_news.py +32 -0
- finnlp/data_sources/news/yahoo_streaming.py +0 -0
- finnlp/data_sources/news/yicai_streaming.py +0 -0
finnlp/data_sources/news/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (174 Bytes).
finnlp/data_sources/news/__pycache__/_base.cpython-310.pyc
ADDED
Binary file (1.36 kB).
finnlp/data_sources/news/__pycache__/finnhub_date_range.cpython-310.pyc
ADDED
Binary file (5.66 kB).
finnlp/data_sources/news/_base.py
ADDED
@@ -0,0 +1,22 @@
+from finnlp.data_sources._base import FinNLP_Downloader
+
+class News_Downloader(FinNLP_Downloader):
+
+    def __init__(self, args = {}):
+        super().__init__(args)
+        pass
+
+    def download_date_range(self, start_date, end_date, stock = None):
+        pass
+
+    def download_streaming(self, stock = None):
+        pass
+
+    def clean_data(self):
+        pass
+
+    def _gather_one_part(self, date, stock = None, delay = 0.1):
+        pass
+
+    def _gather_content(self):
+        pass
finnlp/data_sources/news/akshare_cctv.py
ADDED
@@ -0,0 +1,29 @@
+import pandas as pd
+import akshare as ak
+from tqdm.notebook import tqdm
+from finnlp.data_sources.news._base import News_Downloader
+
+
+class Akshare_cctv(News_Downloader):
+
+    def __init__(self, args={}):
+        pass
+
+    def download_news(self, start_date, end_date, stock="all"):
+        self.date_list = pd.date_range(start_date, end_date)
+        res = pd.DataFrame()
+        for date in tqdm(self.date_list):
+            tmp = self.gather_one_day_news(date)
+            res = pd.concat([res, tmp])
+        self.dataframe = res
+
+    def clean_data(self):
+        pass
+
+    def gather_one_day_news(self, date, stock="all", delay=0.1):
+        date = self.transfer_standard_date_to_nonstandard(date)
+        res = ak.news_cctv(date=date)
+        return res
+
+    def transfer_standard_date_to_nonstandard(self, date):
+        return date.strftime("%Y%m%d")
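A minimal usage sketch (an editorial assumption, not part of the commit): Akshare_cctv only needs akshare installed, since its __init__ never calls the parent initializer. The date strings below are hypothetical.

    # Hypothetical example: download CCTV news for a short date range
    from finnlp.data_sources.news.akshare_cctv import Akshare_cctv

    downloader = Akshare_cctv()
    downloader.download_news("2023-01-01", "2023-01-03")
    print(downloader.dataframe.head())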
finnlp/data_sources/news/alliancenews_streaming.py
ADDED
File without changes
finnlp/data_sources/news/cnbc_streaming.py
ADDED
File without changes
finnlp/data_sources/news/eastmoney_streaming.py
ADDED
@@ -0,0 +1,69 @@
+import requests
+from lxml import etree
+from tqdm import tqdm
+import pandas as pd
+from finnlp.data_sources.news._base import News_Downloader
+
+
+class Eastmoney_Streaming(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_streaming_stock(self, stock = "600519", rounds = 3):
+        print("Getting pages: ", end = "")
+        if rounds > 0:
+            for r in range(rounds):
+                br = self._gather_pages(stock, r)
+                if br == "break":
+                    break
+        else:
+            r = 1
+            error_count = 0
+            while 1:
+                br = self._gather_pages(stock, r)
+                if br == "break":
+                    break
+                elif br == "Error":
+                    error_count += 1
+                    if error_count > 10:
+                        print("Connection Error")
+                r += 1
+        print(f"Get total {r+1} pages.")
+        self.dataframe = self.dataframe.reset_index(drop = True)
+
+    def _gather_pages(self, stock, page):
+        print(page, end = " ")
+        url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+        }
+
+        requests.DEFAULT_RETRIES = 5  # increase the number of connection retries
+        s = requests.session()
+        s.keep_alive = False  # close redundant connections
+
+        response = self._request_get(url, headers=headers)
+        if response.status_code != 200:
+            return "Error"
+
+        # gather the content of the first page
+        page = etree.HTML(response.text)
+        trs = page.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr')
+        have_one = False
+        for item in trs:
+            have_one = True
+            read_amount = item.xpath("./td[1]//text()")[0]
+            comments = item.xpath("./td[2]//text()")[0]
+            title = item.xpath("./td[3]/div/a//text()")[0]
+            content_link = item.xpath("./td[3]/div/a/@href")[0]
+            author = item.xpath("./td[4]//text()")[0]
+            time = item.xpath("./td[5]//text()")[0]
+            tmp = pd.DataFrame([read_amount, comments, title, content_link, author, time]).T
+            columns = [ "read amount", "comments", "title", "content link", "author", "create time" ]
+            tmp.columns = columns
+            self.dataframe = pd.concat([self.dataframe, tmp])
+            # print(title)
+        if have_one == False:
+            return "break"
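A minimal usage sketch (an editorial assumption, not part of the commit): it relies on _request_get from the parent FinNLP_Downloader and on the current page layout of guba.eastmoney.com.

    # Hypothetical example: stream the latest Guba posts for stock 600519
    from finnlp.data_sources.news.eastmoney_streaming import Eastmoney_Streaming

    downloader = Eastmoney_Streaming()
    downloader.download_streaming_stock(stock="600519", rounds=3)
    print(downloader.dataframe.head())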
finnlp/data_sources/news/finnhub_date_range.py
ADDED
@@ -0,0 +1,222 @@
+import warnings
+warnings.filterwarnings("ignore")
+
+from finnlp.data_sources.news._base import News_Downloader
+
+from tqdm import tqdm
+from lxml import etree
+import pandas as pd
+import requests
+import finnhub
+import time
+import json
+
+class Finnhub_Date_Range(News_Downloader):
+    def __init__(self, args = {}):
+        super().__init__(args)
+        assert "token" in args.keys(), "Please input your finnhub token. Available at https://finnhub.io/dashboard"
+        self.finnhub_client = finnhub.Client(api_key=args["token"])
+
+    def download_date_range_stock(self, start_date, end_date, stock = "AAPL"):
+        self.date_list = pd.date_range(start_date, end_date)
+        self.dataframe = pd.DataFrame()
+
+        days_each_time = 4
+        date_list = self.date_list
+        # calculate total length
+        if len(date_list) % days_each_time == 0:
+            total = len(date_list) // days_each_time
+        else:
+            total = len(date_list) // days_each_time + 1
+
+        with tqdm(total=total, desc= "Downloading Titles") as bar:
+            while len(date_list):
+                tmp_date_list = date_list[:days_each_time]
+                date_list = date_list[days_each_time:]
+                tmp_start_date = tmp_date_list[0].strftime("%Y-%m-%d")
+                tmp_end_date = tmp_date_list[-1].strftime("%Y-%m-%d")
+                res = self._gather_one_part(tmp_start_date, tmp_end_date, stock = stock)
+                self.dataframe = pd.concat([self.dataframe, res])
+                bar.update(1)
+
+        # res = self.finnhub_client.company_news(stock, _from=start_date, to=end_date)
+        self.dataframe.datetime = pd.to_datetime(self.dataframe.datetime, unit = "s")
+        self.dataframe = self.dataframe.reset_index(drop = True)
+
+    def _gather_one_part(self, start_date, end_date, stock = "AAPL", delay = 1):
+        res = self.finnhub_client.company_news(stock, _from=start_date, to=end_date)
+        time.sleep(delay)
+        return pd.DataFrame(res)
+
+    def gather_content(self, delay = 0.01):
+        pbar = tqdm(total = self.dataframe.shape[0], desc= "Gathering news contents")
+        self.dataframe["content"] = self.dataframe.apply(lambda x: self._gather_content_apply(x, pbar, delay), axis = 1)
+
+    def _gather_content_apply(self, x, pbar, delay = 0.01):
+        time.sleep(delay)
+        url = x.url
+        source = x.source
+        response = self._request_get(url = url)
+        # response = self._request_get(url= url, headers= headers)
+        pbar.update(1)
+        if response is None:
+            return "Connection Error"
+        else:
+            page = etree.HTML(response.text)
+
+        try:
+            # Yahoo Finance
+            if source == "Yahoo":
+                page = page.xpath("/html/body/div[3]/div[1]/div/main/div[1]/div/div/div/div/article/div/div/div/div/div/div[2]/div[4]")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # Reuters
+            elif source == "Reuters":
+                page = page.xpath("/html/body/div[1]/div[3]/div/main/article/div[1]/div[2]/div/div/div[2]")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # SeekingAlpha
+            elif source == "SeekingAlpha":
+                page = page.xpath("/html/body/div[2]/div/div[1]/main/div/div[2]/div/article/div/div/div[2]/div/section[1]/div/div/div")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # PennyStocks
+            elif source == "PennyStocks":
+                page = page.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/article/div[2]/div[2]/div")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # MarketWatch
+            elif source == "MarketWatch":
+                page = page.xpath('//*[@id="js-article__body"]')
+                content = page[0].xpath(".//text()")
+                content = "".join(content)
+                while "  " in content:
+                    content = content.replace("  ", " ")
+                while "\n \n" in content:
+                    content = content.replace("\n \n", " ")
+                while "\n " in content:
+                    content = content.replace("\n ", " ")
+                return content
+
+            # Seeking Alpha
+            elif source == "Seeking Alpha":
+                # first get the Seeking Alpha URL
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url= url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # Alliance News
+            elif source == "Alliance News":
+                page = page.xpath('//*[@id="comtext"]')
+                content = page[0].xpath(".//text()")
+                content = [c for c in content if not str(c).startswith("\r\n")]
+                content = "\n".join(content)
+                return content
+
+            # Thefly.com
+            elif source == "Thefly.com":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url= url_new, verify= False)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                page = page.xpath('/html/body/div[2]/div/div/div/div/div[2]/div[2]//text()')
+                # content = page[0].xpath(".//text()")
+                # content = [c for c in content if not str(c).startswith("\r\n")]
+                content = "\n".join(page)
+                content = content.replace("\r\n","")
+
+                return content
+
+            # TalkMarkets
+            elif source == "TalkMarkets":
+                return "Not supported yet"
+
+            # CNBC
+            elif source == "CNBC":
+                page = page.xpath('/html/body/div[3]/div/div[1]/div[3]/div/div/div/div[3]/div[1]/div[2]/div[3]//text()')
+                content = "\n".join(page)
+
+                return content
+
+            # GuruFocus
+            elif source == "GuruFocus":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url= url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                page = page.xpath('/html/body/div[1]/div/section/section/main/section/main/div[1]/div/div/div[1]/div[2]/div//text()')
+                page_new = []
+                for c in page:
+                    while "\n" in c:
+                        c = c.replace("\n","")
+                    while "  " in c:
+                        c = c.replace("  ","")
+
+                    page_new.append(c)
+
+                content = "\n".join(page_new)
+
+                return content
+
+            # InvestorPlace
+            elif source == "InvestorPlace":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url= url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+                    page = page.xpath('//script[@type="application/ld+json"]')[1]
+                    content = page.xpath(".//text()")
+                    content = json.loads(content[0])
+                    content = content["articleBody"]
+
+                return content
+
+            # TipRanks
+            elif source == "TipRanks":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url= url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+                    # /html/body/div[1]/div[2]/div[5]/div[2]/div[2]/div/div[6]/div/article/p[1]/p
+                    page = page.xpath('/html/body/div[1]/div[1]/div[4]/div[2]/div[2]/div[1]/div[6]//text()')
+                    # content = page[0].xpath('.//text()')
+                    page = [p.replace("\n","") for p in page]
+                    content = "".join(page)
+                    return content
+
+            else:
+                return "Not supported yet"
+
+        except:
+            return "Error"
+
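A minimal usage sketch (an editorial assumption, not part of the commit): it requires a valid Finnhub API token and network access to the news sites handled above; the token string below is a placeholder.

    # Hypothetical example: AAPL company news for one week, then fetch article bodies
    from finnlp.data_sources.news.finnhub_date_range import Finnhub_Date_Range

    downloader = Finnhub_Date_Range(args={"token": "YOUR_FINNHUB_TOKEN"})
    downloader.download_date_range_stock("2023-01-01", "2023-01-07", stock="AAPL")
    downloader.gather_content()
    print(downloader.dataframe.head())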
finnlp/data_sources/news/fmp_streaming.py
ADDED
@@ -0,0 +1,24 @@
+import json
+import requests
+import pandas as pd
+from tqdm.notebook import tqdm
+
+df = pd.read_csv("NAS.csv", index_col=0)
+stock_list = df.index.to_list()
+
+api_key = YOUR_API_KEY  # You may find your api key here: https://site.financialmodelingprep.com/developer/docs/api-keys
+
+all = pd.DataFrame()
+for stock in tqdm(stock_list):
+    for page in tqdm(range(500)):
+        url = f"https://financialmodelingprep.com/api/v3/stock_news?tickers={stock}&page={page+1}&apikey={api_key}"
+        res = requests.get(url)
+        res = json.loads(res.text)
+        if len(res) == 0:
+            break
+        else:
+            res = pd.DataFrame(res)
+            all = pd.concat([all, res])
+
+all = all.reset_index(drop=True)
+all.to_csv("dataset_more.csv")
finnlp/data_sources/news/gurufocus_streaming.py
ADDED
File without changes
finnlp/data_sources/news/investorplace_streaming.py
ADDED
File without changes
finnlp/data_sources/news/marketwatch_streaming.py
ADDED
File without changes
finnlp/data_sources/news/pennystocks_streaming.py
ADDED
@@ -0,0 +1,81 @@
+import requests
+from lxml import etree
+from tqdm import tqdm
+import pandas as pd
+import json
+import time as time
+from finnlp.data_sources.news._base import News_Downloader
+
+# TODO:
+# 1. More Pages
+# 2. Contents
+
+class PennyStocks_Streaming(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 2):
+        # establish session
+        self._connect_session()
+
+        # download first page
+        self._download_first_page(keyword, delay = delay)
+
+        # download the following pages
+        # self._download_other_pages(keyword)
+        print("Only support the first page now!")
+
+
+    def _connect_session(self):
+        # since the server checks cookies, we first need to
+        # request the main site without cookies, then finish
+        # searching for the stock information we want.
+        self.session = requests.session()
+        first_url = "https://pennystocks.com/"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+        }
+        print("Requesting https://pennystocks.com ...", end = " ")
+        res = self.session.get(headers = headers, url = first_url)
+        if res.status_code != 200:
+            raise ConnectionError("Can't request https://pennystocks.com. Please check your connection or report this issue on Github")
+
+        print("succeed!")
+
+    def _download_first_page(self, keyword = "apple", max_retry = 5, delay = 2):
+        url = f"https://pennystocks.com/?s={keyword}"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+        }
+        res = self.session.get(url = url, headers = headers)
+        res = etree.HTML(res.text)
+        articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article")
+        # not sure why but this really works
+
+        while max_retry and len(articles) == 0:
+            import time
+            time.sleep(delay)
+            print("Gathering again ..", end = ' ')
+            res = requests.get(url = url, headers = headers, cookies=self.session.cookies)
+            res = etree.HTML(res.text)
+            articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article")
+            max_retry -= 1
+            print(f"Remaining Retry: {max_retry}")
+
+
+        for a in articles:
+            title = a.xpath("./header/h2/a//text()")[0]
+            time = a.xpath("./div[3]/div/div/ul/li[1]/text()")[0]
+            brief = a.xpath("./div[3]/div/div/text()")[0]
+            reading_time = a.xpath("./div[3]/div/div/ul/li[2]/text()")[0]
+            columns = ["title", "time", "brief", "reading_time"]
+            tmp = pd.DataFrame([[title, time, brief, reading_time]], columns=columns)
+            self.dataframe = pd.concat([self.dataframe, tmp])
+
+
+    def _download_other_pages(self, keyword = "apple"):
+        pass
+
+
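A minimal usage sketch (an editorial assumption, not part of the commit): only the first result page is supported, as the class itself prints, and results depend on the pennystocks.com search page layout.

    # Hypothetical example: search pennystocks.com headlines for "apple"
    from finnlp.data_sources.news.pennystocks_streaming import PennyStocks_Streaming

    downloader = PennyStocks_Streaming()
    downloader.download_streaming_search(keyword="apple", delay=2)
    print(downloader.dataframe.head())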
finnlp/data_sources/news/reuters_streaming.py
ADDED
@@ -0,0 +1,55 @@
+import requests
+from lxml import etree
+from tqdm import tqdm
+import pandas as pd
+import json
+import time
+from finnlp.data_sources.news._base import News_Downloader
+
+# TODO:
+# 1. Contents
+
+
+class Reuters_Streaming(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5):
+        news_per_page = 20
+        url = "https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+            "Referer": "https://www.reuters.com/site-search/?query=AAPL&sort=newest&offset=0"
+        }
+
+        print("Getting pages: ", end = "")
+        for i in range(rounds):
+            offset = i * news_per_page
+            params = {
+                "query": f'{{"keyword":"{keyword}","offset":{offset},"orderby":"display_date:desc","size":20,"website":"reuters"}}',
+                "d": "144",
+                "_website": "reuters",
+            }
+            response = self._request_get(url, headers=headers, params = params)
+
+            # check connection error
+            if response.status_code != 200:
+                return "Error"
+
+            # parse response
+            response = json.loads(response.text)
+
+            # check whether any content was returned
+            if response["statusCode"] != 200:
+                print("Early Stopping")
+                break
+
+            # make pandas DataFrame
+            tmp = pd.DataFrame(response["result"]["articles"])
+            self.dataframe = pd.concat([self.dataframe, tmp])
+
+            # finish
+            print(i+1, end = " ")
+            time.sleep(delay)
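A minimal usage sketch (an editorial assumption, not part of the commit): it depends on the Reuters search endpoint used above remaining reachable without authentication.

    # Hypothetical example: pull three pages of Reuters search results for "apple"
    from finnlp.data_sources.news.reuters_streaming import Reuters_Streaming

    downloader = Reuters_Streaming()
    downloader.download_streaming_search(keyword="apple", rounds=3, delay=0.5)
    print(downloader.dataframe.head())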
finnlp/data_sources/news/seekingalpha_date_range.py
ADDED
@@ -0,0 +1,57 @@
+import warnings
+warnings.filterwarnings("ignore")
+
+import json
+import requests
+import pandas as pd
+from lxml import etree
+from tqdm import tqdm
+from datetime import datetime
+
+from finnlp.data_sources.news._base import News_Downloader
+
+class SeekingAlpha_Date_Range(News_Downloader):
+    def __init__(self, args = {}):
+        super().__init__(args)
+
+    def download_date_range_stock(self, start_date, end_date, stock = "AAPL", proxies = None):
+        self.dataframe = pd.DataFrame()
+        start_timestamp = int(datetime.strptime(start_date+'-13', "%Y-%m-%d-%H").timestamp())
+        end_timestamp = int(datetime.strptime(end_date+'-13', "%Y-%m-%d-%H").timestamp())
+        # Downloading First Page
+        data, totalpages = self._gather_by_page(start_timestamp, end_timestamp, stock, 1, proxies)
+        self.dataframe = pd.concat([self.dataframe, data])
+
+        # Downloading Other Pages
+        with tqdm(total=totalpages, desc= "Downloading Titles") as bar:
+            bar.update(1)
+            for page in range(2, totalpages+1):
+                data, _ = self._gather_by_page(start_timestamp, end_timestamp, stock, page, proxies)
+                self.dataframe = pd.concat([self.dataframe, data])
+                bar.update(1)
+        self.dataframe = self.dataframe.reset_index(drop = True)
+
+    def _gather_by_page(self, start_timestamp, end_timestamp, stock, page = 1, proxies = None):
+        url = f"https://seekingalpha.com/api/v3/symbols/{stock}/news?filter[since]={start_timestamp}&filter[until]={end_timestamp}&id={stock}&include=author%2CprimaryTickers%2CsecondaryTickers%2Csentiments&isMounting=true&page[size]=40&page[number]={page}"
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0',
+            'Referer': f'https://seekingalpha.com/symbol/aapl/news?from=2009-12-31T16%3A00%3A00.000Z&to=2022-01-01T15%3A59%3A59.999Z'
+        }
+        response = requests.get(url, headers=headers, proxies=proxies)
+        if response.status_code != 200:
+            print(f"stock: {stock}, page: {page} went wrong!")
+            return pd.DataFrame(), 1
+        else:
+            res = json.loads(response.text)
+            data = pd.DataFrame(res["data"])
+            # make new features
+            new_columns = ["publishOn", "isLockedPro", "commentCount", "gettyImageUrl", "videoPreviewUrl", "themes", "title", "isPaywalled"]
+            data[new_columns] = data.apply(lambda x: list(x.attributes.values()), axis = 1, result_type = "expand")
+            new_columns = ["author", "sentiments", "primaryTickers", "secondaryTickers", "otherTags"]
+            data[new_columns] = data.apply(lambda x: list(x.relationships.values()), axis = 1, result_type = "expand")
+
+            # total pages
+            totalpages = res["meta"]["page"]["totalPages"]
+            return data, totalpages
+
+
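A minimal usage sketch (an editorial assumption, not part of the commit): Seeking Alpha may rate-limit or block unauthenticated requests, in which case the class prints a warning and returns an empty frame for that page.

    # Hypothetical example: AAPL news titles for the first half of 2023
    from finnlp.data_sources.news.seekingalpha_date_range import SeekingAlpha_Date_Range

    downloader = SeekingAlpha_Date_Range()
    downloader.download_date_range_stock("2023-01-01", "2023-06-01", stock="AAPL")
    print(downloader.dataframe.head())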
finnlp/data_sources/news/sina_finance_date_range.py
ADDED
@@ -0,0 +1,86 @@
+import json
+import pytz
+import time
+import requests
+import pandas as pd
+import numpy as np
+from lxml import etree
+from tqdm import tqdm
+from finnlp.data_sources.news._base import News_Downloader
+
+class Sina_Finance_Date_Range(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_date_range_all(self, start_date, end_date):
+        self.date_list = pd.date_range(start_date, end_date)
+        for date in tqdm(self.date_list, desc= "Downloading Titles..."):
+            tmp = self._gather_one_day(date)
+            self.dataframe = pd.concat([self.dataframe, tmp])
+        self.dataframe = self.dataframe.reset_index(drop = True)
+
+    def _gather_one_day(self, date, delay = 0.1):
+        end_timestamp = pd.to_datetime(f"{date} 16:00:00").timestamp()
+        start_timestamp = end_timestamp - 60 * 60 * 24
+
+        res = pd.DataFrame()
+        for page in range(100):
+            url = f"https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2516&etime={start_timestamp}&stime={end_timestamp}&ctime={end_timestamp}&date={date}&k=&num=50&page={page}"
+            response = self._request_get(url = url)
+            if response is not None:
+                response.encoding = 'unicode'
+                text = response.text
+                text = json.loads(text, strict=True)
+                text = text["result"]
+                text = text["data"]
+                if len(text) == 0:
+                    break
+
+                for i in text:
+                    for ii in i.keys():
+                        i[ii] = [i[ii]]
+                    tmp = pd.DataFrame(i)
+                    res = pd.concat([res, tmp])
+            time.sleep(delay)
+
+        if res.shape[0] != 0:
+            res.ctime = pd.to_datetime(res.ctime, unit="s", utc=True)
+            res.mtime = pd.to_datetime(res.mtime, unit="s", utc=True)
+            res.intime = pd.to_datetime(res.intime, unit="s", utc=True)
+
+            tz = pytz.timezone("Asia/Shanghai")
+            res.ctime = [t.astimezone(tz) for t in res.ctime]
+            res.mtime = [t.astimezone(tz) for t in res.mtime]
+            res.intime = [t.astimezone(tz) for t in res.intime]
+
+        return res
+
+    def gather_content(self, delay = 0.01):
+        pbar = tqdm(total = self.dataframe.shape[0], desc= "Gathering news contents")
+        self.dataframe["content"] = self.dataframe.apply(lambda x: self._gather_content_apply(x, pbar, delay), axis = 1)
+
+    def _gather_content_apply(self, x, pbar, delay = 0.01):
+        url = x.url
+        response = self._request_get(url=url)
+
+        if response is not None:
+            # process
+            response.encoding = 'unicode'
+            text = response.text
+            page = etree.HTML(text)
+            page = page.xpath("//*[@id='artibody']/p")
+            page = [p.xpath(".//text()") for p in page]
+            page = [''.join(p) for p in page]
+            content = "\n".join(page)
+            content = content.replace("\u3000","")
+        else:
+            content = np.nan
+
+        # update
+        pbar.update(1)
+        time.sleep(delay)
+
+        return content
+
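A minimal usage sketch (an editorial assumption, not part of the commit): titles come from the Sina roll feed and gather_content fills a "content" column by scraping each article page; the date range below is hypothetical.

    # Hypothetical example: two days of Sina Finance headlines plus article bodies
    from finnlp.data_sources.news.sina_finance_date_range import Sina_Finance_Date_Range

    downloader = Sina_Finance_Date_Range()
    downloader.download_date_range_all("2023-01-01", "2023-01-02")
    downloader.gather_content()
    print(downloader.dataframe.head())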
finnlp/data_sources/news/talkmarkets_streaming.py
ADDED
File without changes
finnlp/data_sources/news/thefly_streaming.py
ADDED
File without changes
finnlp/data_sources/news/tipranks_streaming.py
ADDED
File without changes
finnlp/data_sources/news/tushare_major_news.py
ADDED
@@ -0,0 +1,32 @@
+import tushare as ts
+import pandas as pd
+from tqdm.notebook import tqdm
+from finnlp.data_sources.news._base import News_Downloader
+import time
+
+class Tushare_Major_News(News_Downloader):
+
+    def __init__(self, args = {}):
+        token = args["token"] if "token" in args.keys() else "27080ec403c0218f96f388bca1b1d85329d563c91a43672239619ef5"
+        ts.set_token(token)
+        self.pro = ts.pro_api()
+
+    def download_news(self, start_date, end_date, stock = "all"):
+        self.date_list = pd.date_range(start_date, end_date)
+        res = pd.DataFrame()
+        for date in tqdm(self.date_list):
+            tmp = self.gather_one_day_news(date)
+            res = pd.concat([res, tmp])
+        self.dataframe = res
+
+    def gather_one_day_news(self, date, stock = "all", delay = 0.1):
+        date = self.transfer_standard_date_to_nonstandard(date)
+        res = self.pro.major_news(start_date = date, end_date = date)
+        time.sleep(delay)
+        return res
+
+    def clean_data(self):
+        pass
+
+    def transfer_standard_date_to_nonstandard(self, date):
+        return date.strftime("%Y-%m-%d 00:00:00")
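A minimal usage sketch (an editorial assumption, not part of the commit): Tushare's major_news interface requires a Pro token with sufficient credits; the token string below is a placeholder.

    # Hypothetical example: major news headlines for a short date range
    from finnlp.data_sources.news.tushare_major_news import Tushare_Major_News

    downloader = Tushare_Major_News(args={"token": "YOUR_TUSHARE_TOKEN"})
    downloader.download_news("2023-01-01", "2023-01-03")
    print(downloader.dataframe.head())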
finnlp/data_sources/news/yahoo_streaming.py
ADDED
File without changes
finnlp/data_sources/news/yicai_streaming.py
ADDED
File without changes