Commit 9572c06 · Upload 5 files
Parent(s): 74231b9
finnlp/data_sources/company_announcement/_base.py
ADDED
@@ -0,0 +1,22 @@
from finnlp.data_sources._base import FinNLP_Downloader


class Company_Announcement_Downloader(FinNLP_Downloader):
    # Base class for company-announcement downloaders.
    # Concrete sources (Juchao, SEC, Sina) override the methods below.

    def __init__(self, args = {}):
        super().__init__(args)

    def download_date_range_all(self, start_date, end_date):
        pass

    def download_date_range_stock(self, start_date, end_date, stock = "AAPL"):
        pass

    def download_streaming_all(self, rounds = 3):
        pass

    def download_streaming_stock(self, stock = None, rounds = 3):
        pass

    def clean_data(self):
        pass
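Each concrete source below follows the same pattern: results accumulate in self.dataframe and the class implements download_date_range_stock. A minimal, purely illustrative sketch of how a new source could subclass this base (the class name and body here are hypothetical, not part of the upload):

# Hypothetical sketch of a new announcement source; illustrative only.
import pandas as pd

from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader

class Example_Announcement_Downloader(Company_Announcement_Downloader):

    def __init__(self, args = {}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()   # downloaded announcements are collected here

    def download_date_range_stock(self, start_date, end_date, stock = "AAPL"):
        # fetch announcements for `stock` between the two dates,
        # then append them to self.dataframe
        pass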
finnlp/data_sources/company_announcement/juchao.py
ADDED
@@ -0,0 +1,138 @@
from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader

import requests
import time
import json
import os
import pandas as pd
from tqdm import tqdm
from PyPDF2 import PdfReader

class Juchao_Annoumcement(Company_Announcement_Downloader):

    def __init__(self, args = {}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_date_range_stock(self, start_date, end_date, stock = "000001", max_page = 100, searchkey = "", get_content = False, save_dir = "./tmp/", delate_pdf = False):
        self.org_dict = self._get_orgid()

        # download the first page
        res = self._get_open_page(start_date, end_date, stock, 1, searchkey)
        total_pages = res["totalpages"] + 1

        if res["announcements"] is None:
            print(f"Nothing related to your searchkey ({searchkey}) was found; try another one or leave it blank.")
        else:
            tmp_df = self._process_data(res)
            self.dataframe = pd.concat([self.dataframe, tmp_df])

        # download the remaining pages
        page = 2
        pbar = tqdm(total = total_pages, desc = "Downloading by page...")

        for _ in range(max_page):
            res = self._get_open_page(start_date, end_date, stock, page, searchkey)
            if res["announcements"] is None:
                break
            tmp_df = self._process_data(res)
            self.dataframe = pd.concat([self.dataframe, tmp_df])
            pbar.update(1)
            page += 1
        pbar.update(1)

        # convert millisecond timestamps to datetimes
        self.dataframe.announcementTime = self.dataframe.announcementTime.apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x / 1000)))
        self.dataframe.announcementTime = pd.to_datetime(self.dataframe.announcementTime)

        if get_content:
            pbar = tqdm(total = self.dataframe.shape[0], desc = "Getting the text data...")
            self.dataframe[["PDF_path", "Content"]] = self.dataframe.apply(lambda x: self._get_pdfs(x, save_dir, delate_pdf, pbar), axis = 1, result_type = "expand")
            if delate_pdf:
                os.removedirs(save_dir)

        self.dataframe = self.dataframe.reset_index(drop = True)

    def _get_open_page(self, start_date, end_date, stock, page, searchkey):
        url = "http://www.cninfo.com.cn/new/hisAnnouncement/query?"
        headers = {
            "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
        }
        data = {
            "pageNum": page,
            "pageSize": "30",
            "column": "szse",
            "tabName": "fulltext",
            "plate": "",
            "stock": stock + "," + self.org_dict[stock],
            "searchkey": searchkey,
            "secid": "",
            "category": "",
            "trade": "",
            "seDate": f"{start_date}~{end_date}",
            "sortName": "",
            "sortType": "",
            "isHLtitle": "true",
        }
        res = requests.post(url = url, headers = headers, data = data)
        if res.status_code != 200:
            raise ConnectionError

        res = json.loads(res.text)
        return res

    def _process_data(self, res):
        if res is None:
            return res
        else:
            return pd.DataFrame(res["announcements"])

    def _get_pdfs(self, x, save_dir, delate_pdf, pbar):
        os.makedirs(save_dir, exist_ok = True)
        adjunctUrl = x.adjunctUrl
        pdf_base_url = "http://static.cninfo.com.cn/"
        pdf_url = pdf_base_url + adjunctUrl
        responsepdf = self._request_get(pdf_url)

        if responsepdf is None:
            pbar.update(1)
            return ("Failed Download", "Failed Download")

        else:
            # build the file name, stripping the <em> highlight tags
            file_name = x.announcementTitle
            file_name = "".join(file_name.split("<em>"))
            file_name = "".join(file_name.split("</em>"))
            file_name = f"{x.secCode}_{x.secName}_{file_name}.pdf"
            file_path = os.path.join(save_dir, file_name)

            # save the pdf
            with open(file_path, "wb") as f:
                f.write(responsepdf.content)

            # extract the text
            with open(file_path, "rb") as filehandle:
                pdf = PdfReader(filehandle)
                text_all = ""
                for page in pdf.pages:
                    text = page.extract_text()
                    text = "".join(text.split("\n"))
                    text_all += text
            pbar.update(1)

            if delate_pdf:
                os.remove(file_path)
                return ("removed", text_all)
            else:
                return (file_path, text_all)

    def _get_orgid(self):
        # map stock codes to the cninfo orgIds required by the query API
        org_dict = {}
        org_json = self._request_get("http://www.cninfo.com.cn/new/data/szse_stock.json").json()["stockList"]

        for i in range(len(org_json)):
            org_dict[org_json[i]["code"]] = org_json[i]["orgId"]

        return org_dict
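A brief usage sketch for the Juchao downloader, assuming network access to www.cninfo.com.cn; the date range and stock code below are example values, not part of the upload:

# Usage sketch (example values; requires network access to cninfo)
from finnlp.data_sources.company_announcement.juchao import Juchao_Annoumcement

downloader = Juchao_Annoumcement()
downloader.download_date_range_stock(
    start_date = "2023-01-01",
    end_date = "2023-03-31",
    stock = "000001",      # an SZSE stock code, e.g. Ping An Bank
    get_content = True,    # also download each PDF and extract its text
    save_dir = "./tmp/",
    delate_pdf = True,     # remove the PDFs after the text is extracted
)
print(downloader.dataframe[["announcementTime", "announcementTitle", "Content"]].head())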
finnlp/data_sources/company_announcement/sec.py
ADDED
@@ -0,0 +1,145 @@
from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader

from tqdm import tqdm
from lxml import etree
import pandas as pd
import requests
import json
import time

class SEC_Annoumcement(Company_Announcement_Downloader):

    def __init__(self, args = {}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_date_range_stock(self, start_date, end_date, stock = "AAPL", delay = 0.1):
        entityName = self._get_entity_name(stock)
        # first page
        total_pages = self._gather_one_page(start_date, end_date, 1, entityName, delay)
        # remaining pages
        if total_pages > 1:
            for page in tqdm(range(1, total_pages), desc = "Downloading other pages..."):
                self._gather_one_page(start_date, end_date, page + 1, entityName, delay)

        self.dataframe = self.dataframe.reset_index(drop = True)

    def _get_entity_name(self, stock = "AAPL"):
        # resolve a ticker to the "Company Name (CIK 0000000000)" string
        # expected by the EDGAR full-text search API
        url = "https://efts.sec.gov/LATEST/search-index"
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
        }
        params = {
            "keysTyped": stock
        }
        resp = self._request_get(url = url, headers = headers, params = params)
        if resp is None:
            raise ConnectionError("Can't get entity name")

        res = json.loads(resp.text)
        item_list = res["hits"]["hits"]
        entityName_list = []
        for item in item_list:
            c_name_one = item["_source"]["entity_words"]
            c_name_two = item["_id"].zfill(10)
            entityName = f"{c_name_one} (CIK {c_name_two})"
            entityName_list.append(entityName)

        entityName = entityName_list[0]

        return entityName

    def _gather_one_page(self, start_date, end_date, page, entityName = "Apple Inc. (AAPL) (CIK 0000320193)", delay = 0.01):
        from_ = (page - 1) * 100
        url = "https://efts.sec.gov/LATEST/search-index"
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
        }
        params = {
            "dateRange": "all",
            "entityName": entityName,
            "startdt": start_date,
            "enddt": end_date,
            "from": from_,
            "page": page,
        }

        resp = self._request_get(url = url, headers = headers, params = params)

        if resp is None:
            return 0  # nothing fetched; the caller then skips the remaining pages
        res = json.loads(resp.text)

        # total number of result pages (100 items per page)
        total_items = res["hits"]["total"]["value"]
        if total_items % 100 == 0:
            total_pages = total_items // 100
        else:
            total_pages = total_items // 100 + 1

        items = res["hits"]["hits"]

        url_base = "https://www.sec.gov/Archives/edgar/data"

        for item in tqdm(items, desc = "Downloading by item..."):
            # rebuild the filing URL from the accession number in "_id"
            url_third = item["_source"]["xsl"]
            url_second, url_fourth = item["_id"].split(":")
            url_second = url_second.split("-")
            url_first = url_second[0]
            url_first = url_first.lstrip("0")  # CIK without leading zeros
            url_second = ''.join(url_second)

            if url_third is not None:
                url_new = f"{url_base}/{url_first}/{url_second}/{url_third}/{url_fourth}"
            else:
                url_new = f"{url_base}/{url_first}/{url_second}/{url_fourth}"
            respn = self._request_get(url = url_new, headers = headers)
            if respn is None:
                continue
            try:
                # extract the visible text of the filing document
                html = etree.HTML(respn.text)
                content = html.xpath("/html/body//text()")
                content = [c for c in content if c != "\n"]
                content = "".join(content)

                _id = item["_id"]
                ciks = item["_source"]["ciks"]
                period_ending = item["_source"]["period_ending"]
                root_form = item["_source"]["root_form"]
                file_num = item["_source"]["file_num"]
                display_names = item["_source"]["display_names"]
                xsl = item["_source"]["xsl"]
                sequence = item["_source"]["sequence"]
                file_date = item["_source"]["file_date"]
                biz_states = item["_source"]["biz_states"]
                sics = item["_source"]["sics"]
                form = item["_source"]["form"]
                adsh = item["_source"]["adsh"]
                film_num = item["_source"]["film_num"]
                biz_locations = item["_source"]["biz_locations"]
                file_type = item["_source"]["file_type"]
                file_description = item["_source"]["file_description"]
                inc_states = item["_source"]["inc_states"]
                ite = item["_source"]["items"]

                data = [
                    _id, ciks, period_ending, root_form, file_num, display_names, xsl, sequence,
                    file_date, biz_states, sics, form, adsh, film_num, biz_locations, file_type,
                    file_description, inc_states, ite, content
                ]
                columns = [
                    "_id", "ciks", "period_ending", "root_form", "file_num", "display_names", "xsl", "sequence",
                    "file_date", "biz_states", "sics", "form", "adsh", "film_num", "biz_locations", "file_type",
                    "file_description", "inc_states", "ite", "content"
                ]
                tmp = pd.DataFrame(data = data).T
                tmp.columns = columns

                self.dataframe = pd.concat([self.dataframe, tmp])
                time.sleep(delay)
            except:
                continue

        return total_pages
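A brief usage sketch for the SEC downloader, assuming network access to efts.sec.gov and www.sec.gov; the dates and ticker are example values, not part of the upload:

# Usage sketch (example values; requires network access to EDGAR)
from finnlp.data_sources.company_announcement.sec import SEC_Annoumcement

downloader = SEC_Annoumcement()
downloader.download_date_range_stock(
    start_date = "2023-01-01",
    end_date = "2023-06-30",
    stock = "AAPL",
    delay = 0.2,   # pause between requests to stay polite to EDGAR
)
print(downloader.dataframe[["file_date", "form", "display_names", "content"]].head())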
finnlp/data_sources/company_announcement/sina.py
ADDED
@@ -0,0 +1,87 @@
import time
import requests
from lxml import etree
from tqdm.notebook import tqdm
import pandas as pd

class Sina_Annoumcement_Downloader:

    def __init__(self, args = {}):
        pass

    def download(self, stock = "all", max_page = 100):
        page = 0
        df = pd.DataFrame()
        print("Getting page: ", end = "")
        while page < max_page:
            print(page, end = " ")
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
                "Accept-Encoding": "gzip, deflate, br",
            }
            url = f"https://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletin.php?stockid={stock}&Page={page}"
            response = requests.get(url = url, headers = headers)
            # the bulletin page is GBK-encoded
            text = response.content.decode('GBK')
            html = etree.HTML(text)

            # get announcement dates
            date_list = html.xpath("/html/body/div[6]/div[2]/div[2]/table[2]/tr/td[2]/div[1]/ul/text()")
            if len(date_list) <= 0:
                break
            date_list = [date.strip('.\r').strip('.\n').strip('.\xa0').strip(' ') for date in date_list]
            date_list = [date for date in date_list if len(date) == 10]

            # get headlines and urls
            url_root = "https://vip.stock.finance.sina.com.cn"
            a_list = html.xpath("/html/body/div[6]/div[2]/div[2]/table[2]/tr/td[2]/div[1]/ul/a")
            headline_list = [a.xpath("./text()")[0] for a in a_list]
            url_list = [url_root + a.xpath("./@href")[0] for a in a_list]

            tmp_df = {
                "date": date_list,
                "headline": headline_list,
                "url": url_list,
            }
            tmp_df = pd.DataFrame(tmp_df)
            df = pd.concat([df, tmp_df])
            page += 1

        with tqdm(total = df.shape[0], desc = "Getting announcement content") as pbar:
            df["content"] = df.apply(lambda x: self.get_content(x, pbar), axis = 1)

        df = df.reset_index(drop = True)

        return df

    def get_content(self, x, pbar, delay = 0.1):
        time.sleep(delay)
        url = x.url
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
            "Accept-Encoding": "gzip, deflate, br",
        }
        response = requests.get(url = url, headers = headers)
        if response.status_code == 200:
            try:
                text = response.content.decode('GBK')
                html = etree.HTML(text)

                # clean content
                content_list = html.xpath("//*[@id='content']//text()")
                content_list = [content.strip('.\t').strip('.\n').strip('.\r') for content in content_list]
                content_list = [content for content in content_list if len(content) != 0]
                content = "".join(content_list)
            except:
                return "can't get content"
        else:
            return "can't get content"

        pbar.update(1)

        return content

    def clean_data(self):
        pass

    def transfer_standard_date_to_nonstandard(self, date):
        pass
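A brief usage sketch for the Sina downloader, assuming network access to vip.stock.finance.sina.com.cn; the stockid value is an example and its exact format follows Sina's bulletin page, which is an assumption here:

# Usage sketch (example values; stockid format is an assumption)
from finnlp.data_sources.company_announcement.sina import Sina_Annoumcement_Downloader

downloader = Sina_Annoumcement_Downloader()
df = downloader.download(stock = "000001", max_page = 3)   # first three listing pages only
print(df[["date", "headline", "content"]].head())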