kristada673 committed on
Commit 9572c06 · 1 Parent(s): 74231b9

Upload 5 files

finnlp/data_sources/company_announcement/_base.py ADDED
@@ -0,0 +1,22 @@
+ from finnlp.data_sources._base import FinNLP_Downloader
+
+ class Company_Announcement_Downloader(FinNLP_Downloader):
+     # Base interface for company-announcement sources; concrete downloaders
+     # override the methods they support.
+
+     def __init__(self, args = {}):
+         super().__init__(args)
+
+     def download_date_range_all(self, start_date, end_date):
+         pass
+
+     def download_date_range_stock(self, start_date, end_date, stock = "AAPL"):
+         pass
+
+     def download_streaming_all(self, rounds = 3):
+         pass
+
+     def download_streaming_stock(self, stock = None, rounds = 3):
+         pass
+
+     def clean_data(self):
+         pass
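Not part of the commit, but a rough sketch of how the concrete sources below use this interface: a subclass overrides whichever download_* methods it supports and accumulates results in a DataFrame. The names My_Source_Downloader and the body here are illustrative assumptions, not library code.

import pandas as pd
from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader

class My_Source_Downloader(Company_Announcement_Downloader):  # hypothetical example subclass
    def __init__(self, args = {}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()  # results accumulate here, as in the concrete classes below

    def download_date_range_stock(self, start_date, end_date, stock = "AAPL"):
        # fetch announcements for `stock` between start_date and end_date,
        # then append rows to self.dataframe
        pass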
finnlp/data_sources/company_announcement/juchao.py ADDED
@@ -0,0 +1,138 @@
+ from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader
+
+ import requests
+ import time
+ import json
+ import os
+ import pandas as pd
+ from tqdm import tqdm
+ from PyPDF2 import PdfReader
+
+ class Juchao_Annoumcement(Company_Announcement_Downloader):
+
+     def __init__(self, args = {}):
+         super().__init__(args)
+         self.dataframe = pd.DataFrame()
+
+     def download_date_range_stock(self, start_date, end_date, stock = "000001", max_page = 100, searchkey = "", get_content = False, save_dir = "./tmp/", delate_pdf = False):
+         self.org_dict = self._get_orgid()
+
+         # download the first page
+         res = self._get_open_page(start_date, end_date, stock, 1, searchkey)
+         total_pages = res["totalpages"] + 1
+
+         if res["announcements"] is None:
+             print(f"No announcements matching your searchkey ({searchkey}) were found; try another one or just leave it blank.")
+         else:
+             tmp_df = self._process_data(res)
+             self.dataframe = pd.concat([self.dataframe, tmp_df])
+
+         page = 2
+         # download the remaining pages
+         pbar = tqdm(total = total_pages, desc = "Downloading by page...")
+
+         for _ in range(max_page):
+             res = self._get_open_page(start_date, end_date, stock, page, searchkey)
+             if res["announcements"] is None:
+                 break
+             tmp_df = self._process_data(res)
+             self.dataframe = pd.concat([self.dataframe, tmp_df])
+             pbar.update(1)
+             page += 1
+         pbar.update(1)
+
+         # convert the millisecond timestamps to datetimes
+         self.dataframe.announcementTime = self.dataframe.announcementTime.apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x / 1000)))
+         self.dataframe.announcementTime = pd.to_datetime(self.dataframe.announcementTime)
+
+         if get_content:
+             pbar = tqdm(total = self.dataframe.shape[0], desc = "Getting the text data...")
+             self.dataframe[["PDF_path", "Content"]] = self.dataframe.apply(lambda x: self._get_pdfs(x, save_dir, delate_pdf, pbar), axis = 1, result_type = "expand")
+             if delate_pdf:
+                 os.removedirs(save_dir)
+
+         self.dataframe = self.dataframe.reset_index(drop = True)
+
+     def _get_open_page(self, start_date, end_date, stock, page, searchkey):
+         url = "http://www.cninfo.com.cn/new/hisAnnouncement/query?"
+         headers = {
+             "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index",
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
+         }
+         data = {
+             "pageNum": page,
+             "pageSize": "30",
+             "column": "szse",
+             "tabName": "fulltext",
+             "plate": "",
+             "stock": stock + "," + self.org_dict[stock],
+             "searchkey": searchkey,
+             "secid": "",
+             "category": "",
+             "trade": "",
+             "seDate": f"{start_date}~{end_date}",
+             "sortName": "",
+             "sortType": "",
+             "isHLtitle": "true",
+         }
+         res = requests.post(url = url, headers = headers, data = data)
+         if res.status_code != 200:
+             raise ConnectionError(f"Request failed with status code {res.status_code}")
+
+         res = json.loads(res.text)
+         return res
+
+     def _process_data(self, res):
+         if res is None:
+             return res
+         else:
+             return pd.DataFrame(res["announcements"])
+
+     def _get_pdfs(self, x, save_dir, delate_pdf, pbar):
+         os.makedirs(save_dir, exist_ok = True)
+         adjunctUrl = x.adjunctUrl
+         pdf_base_url = "http://static.cninfo.com.cn/"
+         pdf_url = pdf_base_url + adjunctUrl
+         responsepdf = self._request_get(pdf_url)
+
+         if responsepdf is None:
+             pbar.update(1)
+             return ("Failed Download", "Failed Download")
+
+         else:
+             # build the file name from the announcement title
+             file_name = x.announcementTitle
+             file_name = "".join(file_name.split("<em>"))
+             file_name = "".join(file_name.split("</em>"))
+             file_name = f"{x.secCode}_{x.secName}_{file_name}.pdf"
+             file_path = os.path.join(save_dir, file_name)
+
+             # save pdf
+             with open(file_path, "wb") as f:
+                 f.write(responsepdf.content)
+
+             # extract the text from the pdf
+             with open(file_path, "rb") as filehandle:
+                 pdf = PdfReader(filehandle)
+                 text_all = ""
+                 for page in pdf.pages:
+                     text = page.extract_text()
+                     text = "".join(text.split("\n"))
+                     text_all += text
+             pbar.update(1)
+
+             if delate_pdf:
+                 os.remove(file_path)
+                 return ("removed", text_all)
+             else:
+                 return (file_path, text_all)
+
+     def _get_orgid(self):
+         # map each stock code to its orgId, which the query endpoint requires
+         org_dict = {}
+         org_json = self._request_get("http://www.cninfo.com.cn/new/data/szse_stock.json").json()["stockList"]
+
+         for i in range(len(org_json)):
+             org_dict[org_json[i]["code"]] = org_json[i]["orgId"]
+
+         return org_dict
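A minimal usage sketch (not part of the commit): the dates, stock code, and save_dir are illustrative, and it assumes the cninfo.com.cn endpoints are reachable and PyPDF2 is installed.

from finnlp.data_sources.company_announcement.juchao import Juchao_Annoumcement

downloader = Juchao_Annoumcement()
downloader.download_date_range_stock(
    "2023-01-01", "2023-03-31",   # seDate range sent to the query endpoint
    stock = "000001",             # SZSE stock code
    get_content = True,           # also download each PDF and extract its text
    save_dir = "./tmp/",
    delate_pdf = True,            # remove the PDFs once the text is extracted
)
print(downloader.dataframe[["announcementTime", "announcementTitle", "Content"]].head())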
finnlp/data_sources/company_announcement/sec.py ADDED
@@ -0,0 +1,145 @@
+ from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader
+
+ from tqdm import tqdm
+ from lxml import etree
+ import pandas as pd
+ import requests
+ import json
+ import time
+
+ class SEC_Annoumcement(Company_Announcement_Downloader):
+
+     def __init__(self, args = {}):
+         super().__init__(args)
+         self.dataframe = pd.DataFrame()
+
+     def download_date_range_stock(self, start_date, end_date, stock = "AAPL", delay = 0.1):
+         entityName = self._get_entity_name(stock)
+         # first page
+         total_pages = self._gather_one_page(start_date, end_date, 1, entityName, delay)
+         # remaining pages
+         if total_pages > 1:
+             for page in tqdm(range(1, total_pages), desc = "Downloading other pages..."):
+                 self._gather_one_page(start_date, end_date, page + 1, entityName, delay)
+
+         self.dataframe = self.dataframe.reset_index(drop = True)
+
+     def _get_entity_name(self, stock = "AAPL"):
+         # resolve the ticker to the "Company Name (CIK ...)" string the search endpoint expects
+         url = "https://efts.sec.gov/LATEST/search-index"
+         headers = {
+             "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
+         }
+         params = {
+             "keysTyped": stock
+         }
+         resp = self._request_get(url = url, headers = headers, params = params)
+         if resp is None:
+             raise ConnectionError("Can't get entity name")
+
+         res = json.loads(resp.text)
+         item_list = res["hits"]["hits"]
+         entityName_list = []
+         for item in item_list:
+             c_name_one = item["_source"]["entity_words"]
+             c_name_two = item["_id"].zfill(10)
+             entityName = f"{c_name_one} (CIK {c_name_two})"
+             entityName_list.append(entityName)
+
+         entityName = entityName_list[0]
+
+         return entityName
+
+     def _gather_one_page(self, start_date, end_date, page, entityName = "Apple Inc. (AAPL) (CIK 0000320193)", delay = 0.01):
+         from_ = (page - 1) * 100
+         url = "https://efts.sec.gov/LATEST/search-index"
+         headers = {
+             "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
+         }
+         params = {
+             "dateRange": "all",
+             "entityName": entityName,
+             "startdt": start_date,
+             "enddt": end_date,
+             "from": from_,
+             "page": page,
+         }
+
+         resp = self._request_get(url = url, headers = headers, params = params)
+
+         if resp is None:
+             return 'Error'
+         res = json.loads(resp.text)
+
+         # total number of result pages (100 items per page)
+         total_items = res["hits"]["total"]["value"]
+         if total_items % 100 == 0:
+             total_pages = total_items // 100
+         else:
+             total_pages = total_items // 100 + 1
+
+         items = res["hits"]["hits"]
+
+         url_base = "https://www.sec.gov/Archives/edgar/data"
+
+         for item in tqdm(items, desc = "Downloading by item..."):
+             # rebuild the filing URL from the item id and accession number
+             url_third = item["_source"]["xsl"]
+             url_second, url_fourth = item["_id"].split(":")
+             url_second = url_second.split("-")
+             url_first = url_second[0]
+             url_first = url_first.lstrip("0")  # drop the leading zero padding from the CIK
+             url_second = ''.join(url_second)
+
+             if url_third is not None:
+                 url_new = f"{url_base}/{url_first}/{url_second}/{url_third}/{url_fourth}"
+             else:
+                 url_new = f"{url_base}/{url_first}/{url_second}/{url_fourth}"
+             respn = self._request_get(url = url_new, headers = headers)
+             if respn is None:
+                 continue
+             try:
+                 # strip the HTML and keep only the text content
+                 res = etree.HTML(respn.text)
+                 content = res.xpath("/html/body//text()")
+                 content = [c for c in content if c != "\n"]
+                 content = "".join(content)
+
+                 _id = item["_id"]
+                 ciks = item["_source"]["ciks"]
+                 period_ending = item["_source"]["period_ending"]
+                 root_form = item["_source"]["root_form"]
+                 file_num = item["_source"]["file_num"]
+                 display_names = item["_source"]["display_names"]
+                 xsl = item["_source"]["xsl"]
+                 sequence = item["_source"]["sequence"]
+                 file_date = item["_source"]["file_date"]
+                 biz_states = item["_source"]["biz_states"]
+                 sics = item["_source"]["sics"]
+                 form = item["_source"]["form"]
+                 adsh = item["_source"]["adsh"]
+                 film_num = item["_source"]["film_num"]
+                 biz_locations = item["_source"]["biz_locations"]
+                 file_type = item["_source"]["file_type"]
+                 file_description = item["_source"]["file_description"]
+                 inc_states = item["_source"]["inc_states"]
+                 ite = item["_source"]["items"]
+
+                 data = [
+                     _id, ciks, period_ending, root_form, file_num, display_names, xsl, sequence,
+                     file_date, biz_states, sics, form, adsh, film_num, biz_locations, file_type,
+                     file_description, inc_states, ite, content
+                 ]
+                 columns = [
+                     "_id", "ciks", "period_ending", "root_form", "file_num", "display_names", "xsl", "sequence",
+                     "file_date", "biz_states", "sics", "form", "adsh", "film_num", "biz_locations", "file_type",
+                     "file_description", "inc_states", "ite", "content"
+                 ]
+                 tmp = pd.DataFrame(data = data).T
+                 tmp.columns = columns
+
+                 self.dataframe = pd.concat([self.dataframe, tmp])
+                 time.sleep(delay)
+             except Exception:
+                 continue
+
+         return total_pages
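A minimal usage sketch (not part of the commit): the ticker and dates are illustrative, and it assumes the efts.sec.gov endpoints respond in the shape the code above expects.

from finnlp.data_sources.company_announcement.sec import SEC_Annoumcement

downloader = SEC_Annoumcement()
downloader.download_date_range_stock("2022-01-01", "2022-06-30", stock = "AAPL", delay = 0.1)
print(downloader.dataframe[["file_date", "form", "display_names", "content"]].head())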
finnlp/data_sources/company_announcement/sina.py ADDED
@@ -0,0 +1,87 @@
+ import time
+ import requests
+ from lxml import etree
+ from tqdm.notebook import tqdm
+ import pandas as pd
+
+ class Sina_Annoumcement_Downloader:
+
+     def __init__(self, args = {}):
+         pass
+
+     def download(self, stock = "all", max_page = 100):
+         page = 0
+         df = pd.DataFrame()
+         print("Getting page: ", end = "")
+         while page < max_page:
+             print(page, end = " ")
+             headers = {
+                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
+                 "Accept-Encoding": "gzip, deflate, br",
+             }
+             url = f"https://vip.stock.finance.sina.com.cn/corp/view/vCB_AllBulletin.php?stockid={stock}&Page={page}"
+             response = requests.get(url = url, headers = headers)
+             # the page is served in GBK encoding
+             text = response.content.decode('GBK')
+             html = etree.HTML(text)
+
+             # get announcement dates
+             date_list = html.xpath("/html/body/div[6]/div[2]/div[2]/table[2]/tr/td[2]/div[1]/ul/text()")
+             if len(date_list) <= 0:
+                 break
+             date_list = [date.strip('.\r').strip('.\n').strip('.\xa0').strip(' ') for date in date_list]
+             date_list = [date for date in date_list if len(date) == 10]
+
+             # get headlines and urls
+             url_root = "https://vip.stock.finance.sina.com.cn"
+             a_list = html.xpath("/html/body/div[6]/div[2]/div[2]/table[2]/tr/td[2]/div[1]/ul/a")
+             headline_list = [a.xpath("./text()")[0] for a in a_list]
+             url_list = [url_root + a.xpath("./@href")[0] for a in a_list]
+
+             tmp_df = {
+                 "date": date_list,
+                 "headline": headline_list,
+                 "url": url_list,
+             }
+             tmp_df = pd.DataFrame(tmp_df)
+             df = pd.concat([df, tmp_df])
+             page += 1
+
+         with tqdm(total = df.shape[0], desc = "Getting announcement content") as pbar:
+             df["content"] = df.apply(lambda x: self.get_content(x, pbar), axis = 1)
+
+         df = df.reset_index(drop = True)
+
+         return df
+
+     def get_content(self, x, pbar, delay = 0.1):
+         time.sleep(delay)
+         url = x.url
+         headers = {
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
+             "Accept-Encoding": "gzip, deflate, br",
+         }
+         response = requests.get(url = url, headers = headers)
+         if response.status_code == 200:
+             try:
+                 text = response.content.decode('GBK')
+                 html = etree.HTML(text)
+
+                 # clean content
+                 content_list = html.xpath("//*[@id='content']//text()")
+                 content_list = [content.strip('.\t').strip('.\n').strip('.\r') for content in content_list]
+                 content_list = [content for content in content_list if len(content) != 0]
+                 content = "".join(content_list)
+             except Exception:
+                 return "can't get content"
+         else:
+             return "can't get content"
+
+         pbar.update(1)
+
+         return content
+
+     def clean_data(self):
+         pass
+
+     def transfer_standard_date_to_nonstandard(self, date):
+         pass
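A minimal usage sketch (not part of the commit): unlike the other sources, this downloader returns its DataFrame rather than storing it on the instance, and tqdm.notebook assumes a Jupyter environment. The stock id format below is an assumption; the code's default is "all".

from finnlp.data_sources.company_announcement.sina import Sina_Annoumcement_Downloader

downloader = Sina_Annoumcement_Downloader()
df = downloader.download(stock = "000001", max_page = 5)  # stock id format is an assumption
print(df[["date", "headline", "content"]].head())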