kristada673 committed on
Commit ed0a845 · 1 Parent(s): 0c48058

Upload 7 files

finnlp/data_sources/social_media/_base.py ADDED
@@ -0,0 +1,19 @@
+ from finnlp.data_sources._base import FinNLP_Downloader
+
+ class Social_Media_Downloader(FinNLP_Downloader):
+     """Common interface that the social-media downloaders in this folder subclass."""
+
+     def __init__(self, args={}):
+         super().__init__(args)
+
+     def download(self, start_date, end_date, stock="all"):
+         pass
+
+     def clean_data(self):
+         pass
+
+     def gather_one_day_news(self, date, stock="all", delay=0.1):
+         pass
+
+     def transfer_standard_date_to_nonstandard(self, date):
+         pass
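For orientation, a hypothetical subclass sketch (illustration only, not part of this commit) showing how the concrete downloaders in the rest of this upload plug into the interface above; `My_Source_Downloader` is an invented name:

    import pandas as pd
    from finnlp.data_sources.social_media._base import Social_Media_Downloader

    # Hypothetical example class, for illustration only.
    class My_Source_Downloader(Social_Media_Downloader):
        def __init__(self, args={}):
            super().__init__(args)
            self.dataframe = pd.DataFrame()  # results accumulate here, as in the classes below

        def download(self, start_date, end_date, stock="all"):
            # fetch posts for `stock` between start_date and end_date and append them to self.dataframe
            pass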
finnlp/data_sources/social_media/finnhub_sentiment.py ADDED
@@ -0,0 +1,53 @@
+ from finnlp.data_sources.social_media._base import Social_Media_Downloader
+ from tqdm.notebook import tqdm
+ import pandas as pd
+ import finnhub
+ import time
+
+ class Finnhub_Sentiment(Social_Media_Downloader):
+     def __init__(self, args={}):
+         super().__init__(args)
+         assert "token" in args.keys(), "Please input your finnhub token. Available at https://finnhub.io/dashboard"
+         self.finnhub_client = finnhub.Client(api_key=args["token"])
+         self.delay = args["delay"] if "delay" in args.keys() else 0.7
+
+     def download_sentiment(self, start_date, end_date, stock="AAPL"):
+         self.reddit = pd.DataFrame()
+         self.twitter = pd.DataFrame()
+         self.date_list = pd.date_range(start_date, end_date)
+         days_each_time = 4
+         date_list = self.date_list
+         # calculate the total number of chunks for the progress bar
+         if len(date_list) % days_each_time == 0:
+             total = len(date_list) // days_each_time
+         else:
+             total = len(date_list) // days_each_time + 1
+         with tqdm(total=total) as bar:
+             while len(date_list):
+                 # query the API in chunks of up to four days
+                 tmp_date_list = date_list[:days_each_time]
+                 date_list = date_list[days_each_time:]
+                 tmp_start_date = tmp_date_list[0].strftime("%Y-%m-%d")
+                 tmp_end_date = tmp_date_list[-1].strftime("%Y-%m-%d")
+                 reddit, _stock_name, twitter = self.gather_one_day_sentiment(tmp_start_date, tmp_end_date, stock=stock)
+                 self.reddit = pd.concat([self.reddit, reddit])
+                 self.twitter = pd.concat([self.twitter, twitter])
+                 bar.update(1)
+         self.reddit = self.reddit.sort_values("atTime")
+         self.twitter = self.twitter.sort_values("atTime")
+
+     def gather_one_day_sentiment(self, start_date, end_date, stock="AAPL"):
+         res = self.finnhub_client.stock_social_sentiment(stock, _from=start_date, to=end_date)
+         reddit = pd.DataFrame(res["reddit"])
+         symbol = res["symbol"]
+         twitter = pd.DataFrame(res["twitter"])
+         try:
+             reddit["atTime"] = pd.to_datetime(reddit["atTime"], errors="ignore")
+             twitter["atTime"] = pd.to_datetime(twitter["atTime"], errors="ignore")
+         except Exception:
+             pass
+         time.sleep(self.delay)
+         return reddit, symbol, twitter
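A minimal usage sketch for the downloader above; the token string and date range are placeholders:

    from finnlp.data_sources.social_media.finnhub_sentiment import Finnhub_Sentiment

    # "YOUR_FINNHUB_TOKEN" is a placeholder; a real token is available at https://finnhub.io/dashboard
    downloader = Finnhub_Sentiment(args={"token": "YOUR_FINNHUB_TOKEN", "delay": 0.7})
    downloader.download_sentiment(start_date="2023-01-01", end_date="2023-01-10", stock="AAPL")
    print(downloader.reddit.head())   # Reddit sentiment rows, sorted by "atTime"
    print(downloader.twitter.head())  # Twitter sentiment rows, sorted by "atTime"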
finnlp/data_sources/social_media/reddit_streaming.py ADDED
@@ -0,0 +1,96 @@
+ from finnlp.data_sources.social_media._base import Social_Media_Downloader
+
+ from tqdm import tqdm
+ from lxml import etree
+ import requests
+ import pandas as pd
+ import json
+ import base64
+
+ class Reddit_Streaming(Social_Media_Downloader):
+
+     def __init__(self, args={}):
+         super().__init__(args)
+         self.dataframe = pd.DataFrame()
+
+     def download_streaming_all(self, rounds=3):
+         # Download the first page by URL
+         base_url = "https://www.reddit.com/r/wallstreetbets/new/"
+         pbar = tqdm(total=rounds, desc="Downloading by pages...")
+         res = self._request_get(base_url)
+         if res is None:
+             raise ConnectionError
+
+         # Parse the JSON blob embedded in the initial page
+         html = etree.HTML(res.text)
+         init = html.xpath("//*[@id='data']/text()")[0]
+         init = json.loads(init[14:][:-1])
+         init = init["posts"]["models"]
+         tmp_df = pd.DataFrame(init).T.reset_index(drop=True)
+         self.dataframe = tmp_df
+         # keep only the short keys (post IDs) and use the last one as the pagination cursor
+         init = [i for i in init if len(i) < 12]
+         last_id = init[-1]
+         last_id = self._encode_base64(last_id)
+
+         pbar.update(1)
+
+         # fetch the remaining pages through the GraphQL endpoint
+         if rounds > 1:
+             for _ in range(1, rounds):
+                 last_id = self._fetch_other_pages(last_id, pbar)
+
+     def _fetch_other_pages(self, last_page, pbar):
+         url = 'https://gql.reddit.com/'
+         headers = {
+             "referer": "https://www.reddit.com/",
+             "authorization": "Bearer -twjFZkBAlpR8gZnZqsGHvz-G5c49PA",
+             "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
+         }
+         data = {
+             "id": "02e3b6d0d0d7",
+             "variables": {
+                 "name": "wallstreetbets",
+                 "includeIdentity": False,
+                 "adContext": {
+                     "layout": "CARD",
+                     "clientSignalSessionData": {
+                         "adsSeenCount": 4,
+                         "totalPostsSeenCount": 79,
+                         "sessionStartTime": "2023-04-07T15:32:13.933Z",
+                     }
+                 },
+                 "isFake": False,
+                 "includeAppliedFlair": False,
+                 "includeDevPlatformMetadata": True,
+                 "includeRecents": False,
+                 "includeTrending": False,
+                 "includeSubredditRankings": True,
+                 "includeSubredditChannels": False,
+                 "isAdHocMulti": False,
+                 "isAll": False,
+                 "isLoggedOutGatedOptedin": False,
+                 "isLoggedOutQuarantineOptedin": False,
+                 "isPopular": False,
+                 "recentPostIds": [],
+                 "subredditNames": [],
+                 "sort": "NEW",
+                 "pageSize": 25,
+                 "after": last_page
+             }
+         }
+         response = self._request_post(url=url, headers=headers, json=data)
+         data = json.loads(response.text)
+         data = data["data"]["subredditInfoByName"]["elements"]["edges"]
+         for d in data:
+             if d["node"]["__typename"] == "SubredditPost":
+                 tmp = pd.DataFrame(d).T
+                 self.dataframe = pd.concat([self.dataframe, tmp])
+                 last_id = tmp.id.values[0]
+
+         last_id = self._encode_base64(last_id)
+         pbar.update(1)
+
+         return last_id
+
+     def _encode_base64(self, id):
+         return base64.b64encode(id.encode('utf-8')).decode()
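A short usage sketch, assuming the hard-coded guest authorization token above is still accepted by Reddit's endpoints:

    from finnlp.data_sources.social_media.reddit_streaming import Reddit_Streaming

    downloader = Reddit_Streaming()
    downloader.download_streaming_all(rounds=3)  # first page via HTML, later pages via gql.reddit.com
    print(downloader.dataframe.shape)            # one row per r/wallstreetbets post collected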
finnlp/data_sources/social_media/stocktwits_streaming.py ADDED
@@ -0,0 +1,50 @@
+ from finnlp.data_sources.social_media._base import Social_Media_Downloader
+
+ import requests
+ import pandas as pd
+ from tqdm import tqdm
+ import json
+
+ class Stocktwits_Streaming(Social_Media_Downloader):
+
+     def __init__(self, args={}):
+         super().__init__(args)
+         self.dataframe = pd.DataFrame()
+
+     def download_streaming_stock(self, stock="AAPL", rounds=3):
+         url = f"https://api.stocktwits.com/api/2/streams/symbol/{stock}.json"
+         headers = {
+             'accept': 'application/json',
+             'accept-encoding': 'gzip, deflate, br',
+             'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
+             'authorization': 'OAuth 8a881f43cbc7af061ec2aa35deec9b44f7e3cc09',
+             'dnt': '1',
+             'origin': 'https://stocktwits.com',
+             'referer': 'https://stocktwits.com/',
+             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
+         }
+         max_cursor = None
+         for i in tqdm(range(rounds)):
+             if i == 0:
+                 params = {
+                     "filter": "top",
+                     "limit": 1000,
+                 }
+             else:
+                 # page backwards from the cursor returned by the previous request
+                 params = {
+                     "filter": "top",
+                     "limit": 1000,
+                     "max": max_cursor,
+                 }
+             response = self._request_get(url=url, headers=headers, params=params)
+             if response is None:
+                 print(f"Failed to fetch data. Please check your stock name ({stock}) and your connection. You may raise an issue if you can't solve this problem.")
+                 continue
+             else:
+                 res = json.loads(response.text)
+                 max_cursor = res["cursor"]["since"]
+                 res = pd.DataFrame(res["messages"])
+                 self.dataframe = pd.concat([self.dataframe, res])
+
+         self.dataframe = self.dataframe.reset_index(drop=True)
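A short usage sketch, assuming the embedded OAuth header above is still accepted by the Stocktwits API:

    from finnlp.data_sources.social_media.stocktwits_streaming import Stocktwits_Streaming

    downloader = Stocktwits_Streaming()
    downloader.download_streaming_stock(stock="AAPL", rounds=3)  # each round fetches one page of messages
    print(downloader.dataframe.head())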
finnlp/data_sources/social_media/twitter_date_range.py ADDED
@@ -0,0 +1,76 @@
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ from finnlp.data_sources.social_media._base import Social_Media_Downloader
+
+ import requests
+ from urllib import parse
+ from tqdm import tqdm
+ from datetime import datetime, timedelta
+ import pandas as pd
+ import json
+ import time
+
+ class Twitter_Date_Range(Social_Media_Downloader):
+
+     def __init__(self, args={}):
+         super().__init__(args)
+         self.dataframe = pd.DataFrame()
+
+     def download_date_range_stock(self, start_date, end_date, stock="AAPL"):
+         self.date_list = pd.date_range(start_date, end_date)
+         res = pd.DataFrame()
+         for date in tqdm(self.date_list, desc="Downloading by day... "):
+             tmp = self._gather_one_day(date, stock)
+             res = pd.concat([res, tmp])
+
+         res.created_at = pd.to_datetime(res.created_at)
+         res = res.sort_values("created_at")
+         # keep only tweets that fall inside the requested date range
+         res = res[(res.created_at >= start_date) & (res.created_at <= end_date)]
+         res = res.reset_index(drop=True)
+         self.dataframe = res
+
+     def _gather_one_day(self, date, stock="AAPL", pbar=None, delay=0.01):
+         time.sleep(delay)
+         next_date = date + timedelta(days=1)
+         date = datetime.strftime(date, "%Y-%m-%d")
+         next_date = datetime.strftime(next_date, "%Y-%m-%d")
+
+         url = "https://twitter.com/i/api/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&q={}&count=20&query_source=typed_query&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2CvoiceInfo"
+         url_token = 'https://api.twitter.com/1.1/guest/activate.json'
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
+             'Accept': '*/*',
+             'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
+             'x-guest-token': '',
+             'x-twitter-client-language': 'zh-cn',
+             'x-twitter-active-user': 'yes',
+             'x-csrf-token': '25ea9d09196a6ba850201d47d7e75733',
+             'Sec-Fetch-Dest': 'empty',
+             'Sec-Fetch-Mode': 'cors',
+             'Sec-Fetch-Site': 'same-origin',
+             'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
+             'Referer': 'https://twitter.com/',
+             'Connection': 'keep-alive',
+         }
+
+         # request a guest token, then run an advanced search restricted to a single day
+         q = f'{stock} until:{next_date} since:{date}'
+         token = json.loads(requests.post(url_token, headers=headers).text)['guest_token']
+         headers['x-guest-token'] = token
+         url = url.format(parse.quote(q))
+         res = self._request_get(url, headers=headers)
+         if res is not None:
+             try:
+                 res = json.loads(res.text)
+                 res = pd.DataFrame(res["globalObjects"]["tweets"]).T.sort_values("created_at")
+             except Exception:
+                 res = pd.DataFrame()
+         else:
+             res = pd.DataFrame()
+
+         return res
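A short usage sketch, assuming the guest-token flow above still works against Twitter's search endpoint; the dates are placeholders:

    from finnlp.data_sources.social_media.twitter_date_range import Twitter_Date_Range

    downloader = Twitter_Date_Range()
    downloader.download_date_range_stock(start_date="2023-01-01", end_date="2023-01-03", stock="AAPL")
    print(downloader.dataframe.head())  # tweets sorted and filtered by "created_at"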
finnlp/data_sources/social_media/weibo_date_range.py ADDED
@@ -0,0 +1,154 @@
+ from finnlp.data_sources.social_media._base import Social_Media_Downloader
+
+ from tqdm import tqdm
+ from lxml import etree
+ import pandas as pd
+ import numpy as np
+ import requests
+ import datetime
+ import time
+ import json
+ import re
+
+ class Weibo_Date_Range(Social_Media_Downloader):
+     def __init__(self, args={}):
+         super().__init__(args)
+         if "cookies" not in args.keys():
+             raise ValueError("Please first log in at https://weibo.com/, copy your cookies, and pass them as the value of the 'cookies' key.")
+         self.cookies = args["cookies"]
+         self.dataframe = pd.DataFrame()
+
+     def download_date_range_stock(self, start_date, end_date, start_hour=0, end_hour=0, stock="茅台", delay=0.01):
+         self.date_list = pd.date_range(start_date, end_date)
+         for date in tqdm(self.date_list, desc="Downloading by dates..."):
+             date = date.strftime("%Y-%m-%d")
+             self._gather_one_day(date, start_hour, end_hour, stock, delay)
+         self.dataframe = self.dataframe.reset_index(drop=True)
+
+     def _gather_one_day(self, date, start_hour, end_hour, stock="茅台", delay=0.01):
+         if start_hour == 0 and end_hour == 0:
+             # no hour filter: search from this date up to the next day
+             start_date = datetime.datetime.strptime(date, "%Y-%m-%d")
+             end_date = start_date + datetime.timedelta(days=1)
+             start_date = start_date.strftime("%Y-%m-%d")
+             end_date = end_date.strftime("%Y-%m-%d")
+         else:
+             start_date = date
+             end_date = date
+
+         # first page
+         all_urls = self._gather_first_page(start_date, end_date, start_hour, end_hour, stock, delay)
+         # other pages
+         if len(all_urls) > 1:
+             base_url = "https://s.weibo.com/"
+             for url_new in all_urls:
+                 url_new = base_url + url_new
+                 self._gather_other_pages(date, url_new, delay)
+
+     def _gather_first_page(self, start_date, end_date, start_hour, end_hour, stock="茅台", delay=0.01):
+
+         headers = {
+             "cookie": self.cookies,
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
+         }
+
+         params = {
+             "q": stock,
+             "typeall": "1",
+             "suball": "1",
+             "timescope": f"custom:{start_date}-{start_hour}:{end_date}-{end_hour}",
+             "Refer": "g",
+             "page": "1"
+         }
+
+         url = "https://s.weibo.com/weibo"
+         resp = self._request_get(url, headers=headers, params=params)
+
+         if resp is None:
+             return "Error"
+
+         if "passport.weibo.com" in resp.url:
+             raise ValueError("Your cookies are invalid. Please first log in at https://weibo.com/, copy your cookies, and pass them as the value of the 'cookies' key.")
+
+         res = etree.HTML(resp.content)
+         # collect the links to all result pages
+         all_pages = res.xpath('//*[@id="pl_feedlist_index"]/div[3]/div[1]/span/ul/li//@href')
+         items = res.xpath('//div[@class="card-wrap"]')
+         for i in items:
+             ps = i.xpath('.//div[@class="content"]//p')
+             try:
+                 content = ps[0].xpath(".//text()")
+                 content = ''.join(content)
+                 content = content.replace('\n', "")
+                 content = content.replace(' ', "")
+                 content = content.replace('\u200b', "")
+             except Exception:
+                 continue
+
+             info = ps[1].xpath(".//text()")
+             try:
+                 date_content = info[1]
+                 date_content = date_content.replace('\n', "")
+                 date_content = date_content.replace(' ', "")
+             except Exception:
+                 date_content = np.nan
+
+             try:
+                 source = info[3]
+             except Exception:
+                 source = np.nan
+
+             tmp = pd.DataFrame([start_date, date_content, source, content]).T
+             tmp.columns = ["date", "date_content", "source", "content"]
+             self.dataframe = pd.concat([self.dataframe, tmp])
+
+         time.sleep(delay)
+
+         return all_pages
+
+     def _gather_other_pages(self, date, url, delay=0.01):
+
+         headers = {
+             "cookie": self.cookies,
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
+         }
+
+         resp = self._request_get(url, headers=headers)
+
+         if resp is None:
+             return "Error"
+
+         if "passport.weibo.com" in resp.url:
+             raise ValueError("Your cookies are invalid. Please first log in at https://weibo.com/, copy your cookies, and pass them as the value of the 'cookies' key.")
+
+         res = etree.HTML(resp.content)
+         items = res.xpath('//div[@class="card-wrap"]')
+         for i in items:
+             ps = i.xpath('.//div[@class="content"]//p')
+             try:
+                 content = ps[0].xpath(".//text()")
+                 content = ''.join(content)
+                 content = content.replace('\n', "")
+                 content = content.replace(' ', "")
+                 content = content.replace('\u200b', "")
+             except Exception:
+                 continue
+
+             info = ps[1].xpath(".//text()")
+             try:
+                 date_content = info[1]
+                 date_content = date_content.replace('\n', "")
+                 date_content = date_content.replace(' ', "")
+             except Exception:
+                 date_content = np.nan
+
+             try:
+                 source = info[3]
+             except Exception:
+                 source = np.nan
+
+             tmp = pd.DataFrame([date, date_content, source, content]).T
+             tmp.columns = ["date", "date_content", "source", "content"]
+             self.dataframe = pd.concat([self.dataframe, tmp])
+
+         time.sleep(delay)
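A minimal usage sketch for the class above; the cookie string is a placeholder for the cookies copied from a logged-in weibo.com session:

    from finnlp.data_sources.social_media.weibo_date_range import Weibo_Date_Range

    downloader = Weibo_Date_Range(args={"cookies": "YOUR_WEIBO_COOKIES"})  # placeholder cookie string
    downloader.download_date_range_stock(start_date="2023-01-01", end_date="2023-01-03", stock="茅台")
    print(downloader.dataframe[["date", "date_content", "source", "content"]].head())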
finnlp/data_sources/social_media/weibo_streaming.py ADDED
@@ -0,0 +1,78 @@
+ from finnlp.data_sources.social_media._base import Social_Media_Downloader
+
+ from tqdm import tqdm
+ from lxml import etree
+ import pandas as pd
+ import requests
+ import time
+ import json
+ import re
+
+ class Weibo_Streaming(Social_Media_Downloader):
+     def __init__(self, args={}):
+         super().__init__(args)
+         self.dataframe = pd.DataFrame()
+
+     def download_streaming_stock(self, stock="茅台", rounds=3):
+         for r in tqdm(range(rounds), desc="Downloading by page.."):
+             page = r + 1
+             self._gather_one_page(page, stock)
+
+     def _gather_one_page(self, page, stock="茅台", delay=0.01):
+         headers = {
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
+         }
+         params = {
+             "containerid": f"100103type=61&q={stock}&t=",
+             "page_type": "searchall",
+             "page": page
+         }
+         url = "https://m.weibo.cn/api/container/getIndex"
+         resp = self._request_get(url, headers=headers, params=params)
+
+         if resp is None:
+             return "Error"
+
+         res = json.loads(resp.text)
+         res = res["data"]["cards"]
+         res = pd.DataFrame(res)
+
+         pbar = tqdm(total=res.shape[0], desc="Processing the text content and downloading the full passage...")
+         res[["content_short", "content"]] = res.apply(lambda x: self._process_text(x, pbar, delay), axis=1, result_type="expand")
+
+         self.dataframe = pd.concat([self.dataframe, res])
+
+     def _process_text(self, x, pbar, delay=0.01):
+         # the short text comes straight from the card; follow the status link for the full passage
+         text = x["mblog"]["text"]
+         text = etree.HTML(text)
+         content_short = text.xpath(".//text()")
+         content_short = ''.join(content_short)
+
+         link = text.xpath('.//a/@href')
+         link = [l for l in link if "status" in l]
+         if len(link) > 0:
+             base_url = "https://m.weibo.cn/"
+             url_new = base_url + link[0]
+             headers = {
+                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
+             }
+             resp = self._request_get(url_new, headers=headers)
+             if resp is None:
+                 content = content_short
+             else:
+                 # the full text is embedded in a script tag on the status page
+                 res = etree.HTML(resp.content)
+                 scripts = res.xpath('//script')
+                 content = scripts[2].xpath("text()")
+                 pattern = re.compile('"text": "(.+),\n')
+                 result = pattern.findall(content[0])
+                 content = etree.HTML(result[0])
+                 content = content.xpath("//text()")
+                 content = ''.join(content)
+         else:
+             content = content_short
+
+         pbar.update(1)
+         time.sleep(delay)
+
+         return content_short, content
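A minimal usage sketch for the streaming downloader above; this code path needs no credentials:

    from finnlp.data_sources.social_media.weibo_streaming import Weibo_Streaming

    downloader = Weibo_Streaming()
    downloader.download_streaming_stock(stock="茅台", rounds=3)
    print(downloader.dataframe[["content_short", "content"]].head())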