Commit ed0a845
Parent(s): 0c48058

Upload 7 files

Files changed:
- finnlp/data_sources/social_media/_base.py +19 -0
- finnlp/data_sources/social_media/finnhub_sentiment.py +53 -0
- finnlp/data_sources/social_media/reddit_streaming.py +96 -0
- finnlp/data_sources/social_media/stocktwits_streaming.py +50 -0
- finnlp/data_sources/social_media/twitter_date_range.py +76 -0
- finnlp/data_sources/social_media/weibo_date_range.py +154 -0
- finnlp/data_sources/social_media/weibo_streaming.py +78 -0
finnlp/data_sources/social_media/_base.py
ADDED
@@ -0,0 +1,19 @@
from finnlp.data_sources._base import FinNLP_Downloader


class Social_Media_Downloader(FinNLP_Downloader):
    # Interface stubs shared by all social-media downloaders; concrete
    # subclasses override the methods they support.

    def __init__(self, args={}):
        super().__init__(args)
        pass

    def download(self, start_date, end_date, stock="all"):
        pass

    def clean_data(self):
        pass

    def gather_one_day_news(self, date, stock="all", delay=0.1):
        pass

    def transfer_standard_date_to_nonstandard(self, date):
        pass
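
For orientation, a minimal sketch of how a concrete source is expected to extend this base class (illustrative only; My_Source_Downloader is a hypothetical name, and the _request_get helper is presumed to be inherited from FinNLP_Downloader):

import pandas as pd
from finnlp.data_sources.social_media._base import Social_Media_Downloader

class My_Source_Downloader(Social_Media_Downloader):
    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download(self, start_date, end_date, stock="all"):
        # fetch pages with self._request_get(...) and append rows to self.dataframe
        pass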
finnlp/data_sources/social_media/finnhub_sentiment.py
ADDED
@@ -0,0 +1,53 @@
from finnlp.data_sources.social_media._base import Social_Media_Downloader
from tqdm.notebook import tqdm
import pandas as pd
import finnhub
import time

class Finnhub_Sentiment(Social_Media_Downloader):
    def __init__(self, args={}):
        super().__init__(args)
        assert "token" in args.keys(), "Please input your finnhub token. Available at https://finnhub.io/dashboard"
        self.finnhub_client = finnhub.Client(api_key=args["token"])
        self.delay = args["delay"] if "delay" in args.keys() else 0.7

    def download_sentiment(self, start_date, end_date, stock="AAPL"):
        self.reddit = pd.DataFrame()
        self.twitter = pd.DataFrame()
        self.date_list = pd.date_range(start_date, end_date)
        days_each_time = 4
        date_list = self.date_list
        # calculate the total number of batches
        if len(date_list) % days_each_time == 0:
            total = len(date_list) // days_each_time
        else:
            total = len(date_list) // days_each_time + 1
        with tqdm(total=total) as bar:
            while len(date_list):
                # request the sentiment data in batches of four days
                tmp_date_list = date_list[:days_each_time]
                date_list = date_list[days_each_time:]
                tmp_start_date = tmp_date_list[0].strftime("%Y-%m-%d")
                tmp_end_date = tmp_date_list[-1].strftime("%Y-%m-%d")
                reddit, _stock_name, twitter = self.gather_one_day_sentiment(tmp_start_date, tmp_end_date, stock=stock)
                self.reddit = pd.concat([self.reddit, reddit])
                self.twitter = pd.concat([self.twitter, twitter])
                bar.update(1)
        self.reddit = self.reddit.sort_values("atTime")
        self.twitter = self.twitter.sort_values("atTime")

    def gather_one_day_sentiment(self, start_date, end_date, stock="AAPL"):
        res = self.finnhub_client.stock_social_sentiment(stock, _from=start_date, to=end_date)
        reddit = res["reddit"]
        symbol = res["symbol"]
        twitter = res["twitter"]
        reddit = pd.DataFrame(reddit)
        # print(reddit)

        twitter = pd.DataFrame(twitter)
        try:
            reddit["atTime"] = pd.to_datetime(reddit["atTime"], errors="ignore")
            twitter["atTime"] = pd.to_datetime(twitter["atTime"], errors="ignore")
        except:
            pass
        time.sleep(self.delay)
        return reddit, symbol, twitter
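
A minimal usage sketch (the token is a placeholder you must replace; the ticker and date range are illustrative):

from finnlp.data_sources.social_media.finnhub_sentiment import Finnhub_Sentiment

downloader = Finnhub_Sentiment(args={"token": "YOUR_FINNHUB_TOKEN", "delay": 0.7})
downloader.download_sentiment(start_date="2023-01-01", end_date="2023-01-08", stock="AAPL")
print(downloader.reddit.head())   # Reddit sentiment rows, sorted by "atTime"
print(downloader.twitter.head())  # Twitter sentiment rows, sorted by "atTime"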
finnlp/data_sources/social_media/reddit_streaming.py
ADDED
@@ -0,0 +1,96 @@
from finnlp.data_sources.social_media._base import Social_Media_Downloader

from tqdm import tqdm
from lxml import etree
import requests
import pandas as pd
import json
import base64

class Reddit_Streaming(Social_Media_Downloader):

    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_streaming_all(self, rounds=3):
        # Download the first page by url
        base_url = "https://www.reddit.com/r/wallstreetbets/new/"
        pbar = tqdm(total=rounds, desc="Downloading by pages...")
        res = self._request_get(base_url)
        if res is None:
            raise ConnectionError

        # parse the JSON embedded in the initial page
        html = etree.HTML(res.text)
        init = html.xpath("//*[@id='data']/text()")[0]
        init = json.loads(init[14:][:-1])
        init = init["posts"]["models"]
        tmp_df = pd.DataFrame(init).T.reset_index(drop=True)
        self.dataframe = tmp_df
        init = [i for i in init if len(i) < 12]
        last_id = init[-1]
        last_id = self._encode_base64(last_id)

        pbar.update(1)

        # fetch the remaining pages
        if rounds > 1:
            for _ in range(1, rounds):
                last_id = self._fetch_other_pages(last_id, pbar)

    def _fetch_other_pages(self, last_page, pbar):
        url = 'https://gql.reddit.com/'
        headers = {
            "referer": "https://www.reddit.com/",
            "authorization": "Bearer -twjFZkBAlpR8gZnZqsGHvz-G5c49PA",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
        }
        data = {
            "id": "02e3b6d0d0d7",
            "variables": {
                "name": "wallstreetbets",
                "includeIdentity": False,
                "adContext": {
                    "layout": "CARD",
                    "clientSignalSessionData": {
                        "adsSeenCount": 4,
                        "totalPostsSeenCount": 79,
                        "sessionStartTime": "2023-04-07T15:32:13.933Z",
                    }
                },
                "isFake": False,
                "includeAppliedFlair": False,
                "includeDevPlatformMetadata": True,
                "includeRecents": False,
                "includeTrending": False,
                "includeSubredditRankings": True,
                "includeSubredditChannels": False,
                "isAdHocMulti": False,
                "isAll": False,
                "isLoggedOutGatedOptedin": False,
                "isLoggedOutQuarantineOptedin": False,
                "isPopular": False,
                "recentPostIds": [],
                "subredditNames": [],
                "sort": "NEW",
                "pageSize": 25,
                "after": last_page
            }
        }
        response = self._request_post(url=url, headers=headers, json=data)
        data = json.loads(response.text)
        data = data["data"]["subredditInfoByName"]["elements"]["edges"]
        for d in data:
            if d["node"]["__typename"] == "SubredditPost":
                tmp = pd.DataFrame(d).T
                self.dataframe = pd.concat([self.dataframe, tmp])
                last_id = tmp.id.values[0]

        last_id = self._encode_base64(last_id)
        pbar.update(1)

        return last_id

    def _encode_base64(self, id):
        return base64.b64encode(id.encode('utf-8')).decode()
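
A usage sketch (illustrative; the hard-coded bearer token and GraphQL query id above were captured from a browser session and may stop working at any time):

from finnlp.data_sources.social_media.reddit_streaming import Reddit_Streaming

downloader = Reddit_Streaming()
downloader.download_streaming_all(rounds=3)   # first page plus two paginated GraphQL requests
print(downloader.dataframe.shape)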
finnlp/data_sources/social_media/stocktwits_streaming.py
ADDED
@@ -0,0 +1,50 @@
from finnlp.data_sources.social_media._base import Social_Media_Downloader

import requests
import pandas as pd
from tqdm import tqdm
import json

class Stocktwits_Streaming(Social_Media_Downloader):

    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_streaming_stock(self, stock="AAPL", rounds=3):
        url = f"https://api.stocktwits.com/api/2/streams/symbol/{stock}.json"
        headers = {
            'accept': 'application/json',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'authorization': 'OAuth 8a881f43cbc7af061ec2aa35deec9b44f7e3cc09',
            'dnt': '1',
            'origin': 'https://stocktwits.com',
            'referer': 'https://stocktwits.com/',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
        }
        max_cursor = None
        for i in tqdm(range(rounds)):
            params = {
                "filter": "top",
                "limit": 1000,
            }
            if max_cursor is not None:
                # paginate from the cursor returned by the previous request
                params["max"] = max_cursor
            response = self._request_get(url=url, headers=headers, params=params)
            if response is None:
                print(f"Failed to fetch data. Please check the stock symbol ({stock}) and your connection. You may raise an issue if you can't solve this problem.")
                continue
            res = json.loads(response.text)
            max_cursor = res["cursor"]["since"]
            res = pd.DataFrame(res["messages"])
            self.dataframe = pd.concat([self.dataframe, res])

        self.dataframe = self.dataframe.reset_index(drop=True)
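
A usage sketch (illustrative; the OAuth token in the headers above was captured from a browser session and may expire):

from finnlp.data_sources.social_media.stocktwits_streaming import Stocktwits_Streaming

downloader = Stocktwits_Streaming()
downloader.download_streaming_stock(stock="AAPL", rounds=3)
print(downloader.dataframe.shape)    # one row per Stocktwits message
print(downloader.dataframe.columns)  # columns follow the Stocktwits message schema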
finnlp/data_sources/social_media/twitter_date_range.py
ADDED
@@ -0,0 +1,76 @@
import warnings
warnings.filterwarnings("ignore")

from finnlp.data_sources.social_media._base import Social_Media_Downloader

import requests
from urllib import parse
from tqdm import tqdm
from datetime import datetime, timedelta
import pandas as pd
import json
import time

class Twitter_Date_Range(Social_Media_Downloader):

    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_date_range_stock(self, start_date, end_date, stock="AAPL"):
        self.date_list = pd.date_range(start_date, end_date)
        res = pd.DataFrame()
        for date in tqdm(self.date_list, desc="Downloading by day... "):
            tmp = self._gather_one_day(date, stock)
            res = pd.concat([res, tmp])

        res.created_at = pd.to_datetime(res.created_at)
        res = res.sort_values("created_at")
        res = res.reset_index(drop=True)
        # keep only tweets that fall inside the requested date range
        res = res[(res.created_at >= start_date) & (res.created_at <= end_date)]
        res = res.reset_index(drop=True)
        self.dataframe = res

    def _gather_one_day(self, date, stock="AAPL", pbar=None, delay=0.01):
        time.sleep(delay)
        next_date = date + timedelta(days=1)
        date = datetime.strftime(date, "%Y-%m-%d")
        next_date = datetime.strftime(next_date, "%Y-%m-%d")

        # legacy adaptive-search endpoint, accessed with a guest token
        url = "https://twitter.com/i/api/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&q={}&count=20&query_source=typed_query&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2CvoiceInfo"
        url_token = 'https://api.twitter.com/1.1/guest/activate.json'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'x-guest-token': '',
            'x-twitter-client-language': 'zh-cn',
            'x-twitter-active-user': 'yes',
            'x-csrf-token': '25ea9d09196a6ba850201d47d7e75733',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
            'Referer': 'https://twitter.com/',
            'Connection': 'keep-alive',
        }

        q = f'{stock} until:{next_date} since:{date}'
        token = json.loads(requests.post(url_token, headers=headers).text)['guest_token']
        print(token)
        headers['x-guest-token'] = token
        url = url.format(parse.quote(q))
        print(url)
        res = self._request_get(url, headers=headers)
        print(res)
        if res is not None:
            try:
                res = json.loads(res.text)
                res = pd.DataFrame(res["globalObjects"]["tweets"]).T.sort_values("created_at")
            except:
                res = pd.DataFrame()
        else:
            res = pd.DataFrame()

        return res
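
A usage sketch (illustrative; this relies on Twitter's legacy guest-token and adaptive-search endpoints, which may no longer be publicly accessible):

from finnlp.data_sources.social_media.twitter_date_range import Twitter_Date_Range

downloader = Twitter_Date_Range()
downloader.download_date_range_stock(start_date="2023-01-01", end_date="2023-01-03", stock="AAPL")
print(downloader.dataframe.shape)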
finnlp/data_sources/social_media/weibo_date_range.py
ADDED
@@ -0,0 +1,154 @@
from finnlp.data_sources.social_media._base import Social_Media_Downloader

from tqdm import tqdm
from lxml import etree
import pandas as pd
import numpy as np
import requests
import datetime
import time
import json
import re

class Weibo_Date_Range(Social_Media_Downloader):
    def __init__(self, args={}):
        super().__init__(args)
        if "cookies" not in args.keys():
            raise ValueError("Please log in at https://weibo.com/ first, then copy your cookies and pass them as the value of the 'cookies' key.")
        self.cookies = args["cookies"]
        self.dataframe = pd.DataFrame()

    def download_date_range_stock(self, start_date, end_date, start_hour=0, end_hour=0, stock="茅台", delay=0.01):
        self.date_list = pd.date_range(start_date, end_date)
        for date in tqdm(self.date_list, desc="Downloading by dates..."):
            date = date.strftime("%Y-%m-%d")
            self._gather_one_day(date, start_hour, end_hour, stock, delay)
        self.dataframe = self.dataframe.reset_index(drop=True)

    def _gather_one_day(self, date, start_hour, end_hour, stock="茅台", delay=0.01):
        if start_hour == 0 and end_hour == 0:
            start_date = datetime.datetime.strptime(date, "%Y-%m-%d")
            end_date = start_date + datetime.timedelta(days=1)
            start_date = start_date.strftime("%Y-%m-%d")
            end_date = end_date.strftime("%Y-%m-%d")
        else:
            start_date = date
            end_date = date

        # first page
        all_urls = self._gather_first_page(start_date, end_date, start_hour, end_hour, stock, delay)
        # remaining pages
        if len(all_urls) > 1:
            base_url = "https://s.weibo.com/"
            for url_new in all_urls:
                url_new = base_url + url_new
                self._gather_other_pages(date, url_new, delay)

    def _gather_first_page(self, start_date, end_date, start_hour, end_hour, stock="茅台", delay=0.01):

        headers = {
            "cookie": self.cookies,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
        }

        params = {
            "q": stock,
            "typeall": "1",
            "suball": "1",
            "timescope": f"custom:{start_date}-{start_hour}:{end_date}-{end_hour}",
            "Refer": "g",
            "page": "1"
        }

        url = "https://s.weibo.com/weibo"
        resp = self._request_get(url, headers=headers, params=params)

        if resp is None:
            return "Error"

        if "passport.weibo.com" in resp.url:
            raise ValueError("Your cookies are invalid. Please log in at https://weibo.com/ first, then copy your cookies and pass them as the value of the 'cookies' key.")

        res = etree.HTML(resp.content)
        # collect the links of all result pages
        all_pages = res.xpath('//*[@id="pl_feedlist_index"]/div[3]/div[1]/span/ul/li//@href')
        items = res.xpath('//div[@class="card-wrap"]')
        for i in items:
            ps = i.xpath('.//div[@class="content"]//p')
            try:
                content = ps[0].xpath(".//text()")
                content = ''.join(content)
                content = content.replace('\n', "")
                content = content.replace(' ', "")
                content = content.replace('\u200b', "")
            except:
                continue

            info = ps[1].xpath(".//text()")
            try:
                date_content = info[1]
                date_content = date_content.replace('\n', "")
                date_content = date_content.replace(' ', "")
            except:
                date_content = np.nan

            try:
                source = info[3]
            except:
                source = np.nan

            tmp = pd.DataFrame([start_date, date_content, source, content]).T
            tmp.columns = ["date", "date_content", "source", "content"]
            self.dataframe = pd.concat([self.dataframe, tmp])

        time.sleep(delay)

        return all_pages

    def _gather_other_pages(self, date, url, delay=0.01):

        headers = {
            "cookie": self.cookies,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
        }

        resp = self._request_get(url, headers=headers)

        if resp is None:
            return "Error"

        if "passport.weibo.com" in resp.url:
            raise ValueError("Your cookies are invalid. Please log in at https://weibo.com/ first, then copy your cookies and pass them as the value of the 'cookies' key.")

        res = etree.HTML(resp.content)
        # collect the links of all result pages
        all_pages = res.xpath('//*[@id="pl_feedlist_index"]/div[3]/div[1]/span/ul/li//@href')
        items = res.xpath('//div[@class="card-wrap"]')
        for i in items:
            ps = i.xpath('.//div[@class="content"]//p')
            try:
                content = ps[0].xpath(".//text()")
                content = ''.join(content)
                content = content.replace('\n', "")
                content = content.replace(' ', "")
                content = content.replace('\u200b', "")
            except:
                continue

            info = ps[1].xpath(".//text()")
            try:
                date_content = info[1]
                date_content = date_content.replace('\n', "")
                date_content = date_content.replace(' ', "")
            except:
                date_content = np.nan

            try:
                source = info[3]
            except:
                source = np.nan

            tmp = pd.DataFrame([date, date_content, source, content]).T
            tmp.columns = ["date", "date_content", "source", "content"]
            self.dataframe = pd.concat([self.dataframe, tmp])

        time.sleep(delay)
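
A usage sketch (illustrative; the cookie string is a placeholder copied from a logged-in weibo.com session):

from finnlp.data_sources.social_media.weibo_date_range import Weibo_Date_Range

downloader = Weibo_Date_Range(args={"cookies": "PASTE_YOUR_WEIBO_COOKIES_HERE"})
downloader.download_date_range_stock(start_date="2023-01-01", end_date="2023-01-03", stock="茅台")
print(downloader.dataframe[["date", "date_content", "source", "content"]].head())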
finnlp/data_sources/social_media/weibo_streaming.py
ADDED
@@ -0,0 +1,78 @@
from finnlp.data_sources.social_media._base import Social_Media_Downloader

from tqdm import tqdm
from lxml import etree
import pandas as pd
import requests
import time
import json
import re

class Weibo_Streaming(Social_Media_Downloader):
    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_streaming_stock(self, stock="茅台", rounds=3):
        for r in tqdm(range(rounds), desc="Downloading by page.."):
            page = r + 1
            self._gather_one_page(page, stock)

    def _gather_one_page(self, page, stock="茅台", delay=0.01):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
        }
        params = {
            "containerid": f"100103type=61&q={stock}&t=",
            "page_type": "searchall",
            "page": page
        }
        url = "https://m.weibo.cn/api/container/getIndex"
        resp = self._request_get(url, headers=headers, params=params)

        if resp is None:
            return "Error"

        res = json.loads(resp.text)
        res = res["data"]["cards"]
        res = pd.DataFrame(res)

        pbar = tqdm(total=res.shape[0], desc="Processing the text content and downloading the full post...")
        res[["content_short", "content"]] = res.apply(lambda x: self._process_text(x, pbar, delay), axis=1, result_type="expand")

        self.dataframe = pd.concat([self.dataframe, res])

    def _process_text(self, x, pbar, delay=0.01):
        # the short text comes from the search card; follow the status link for the full post
        text = x["mblog"]["text"]
        text = etree.HTML(text)
        content_short = text.xpath(".//text()")
        content_short = ''.join(content_short)

        link = text.xpath('.//a/@href')
        link = [l for l in link if "status" in l]
        if len(link) > 0:
            base_url = "https://m.weibo.cn/"
            url_new = base_url + link[0]
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
            }
            resp = self._request_get(url_new, headers=headers)
            if resp is None:
                content = content_short
            else:
                res = etree.HTML(resp.content)
                scripts = res.xpath('//script')
                content = scripts[2].xpath("text()")
                pattern = re.compile('"text": "(.+),\n')
                result = pattern.findall(content[0])
                content = etree.HTML(result[0])
                content = content.xpath("//text()")
                content = ''.join(content)
        else:
            content = content_short

        pbar.update(1)
        time.sleep(delay)

        return content_short, content
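
A usage sketch (illustrative; unlike Weibo_Date_Range, this class does not take cookies and queries the m.weibo.cn container API directly):

from finnlp.data_sources.social_media.weibo_streaming import Weibo_Streaming

downloader = Weibo_Streaming()
downloader.download_streaming_stock(stock="茅台", rounds=3)
print(downloader.dataframe[["content_short", "content"]].head())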