Commit 3efdb8d
Parent(s): 6920130
Upload 5 files

Files changed:
- app.py +42 -0
- extract_tweets.py +50 -0
- inference.py +113 -0
- sampling.py +34 -0
- sen_model.py +13 -0
app.py
ADDED
@@ -0,0 +1,42 @@
import streamlit as st
from inference import Keyword_oracle
from datetime import date
from extract_tweets import extract_tweets
import torch
import gc
from pathlib import Path


header = st.container()
get_tweet = st.container()
features = st.container()
modelTraining = st.container()


with get_tweet:
    # extract_tweets writes its Excel output under sheets/.
    Path('sheets/').mkdir(exist_ok=True)
    st.header("Place the topic you want to research on Twitter :bird:")
    input_keyword = st.text_input('Write the keyword:')
    if input_keyword:
        current_date = date.today()
        data_since = st.date_input('from which date:', current_date)
        data_until = st.date_input('until which date:', current_date)
        max_kw = st.slider('maximum words per keyword', 1, 3, 1)
        st.text('This process may take a few seconds')
        st.text(f'plot of the keywords associated with the topic {input_keyword}:')
        # Fetch tweets, then mine keywords and sentiment from the saved sheet.
        extract_tweets(input_keyword, data_since, data_until)
        oracle = Keyword_oracle(input_keyword,
                                keyphrase_ngram_range=(1, max_kw),
                                diversity=0.3, top_n=3)
        st.pyplot(oracle.plot())
        st.text("Table of the most popular keywords")
        table = oracle.return_table()
        st.dataframe(table)
        st.download_button(
            label="Download data as CSV",
            data=table.to_csv().encode('utf-8'),
            file_name=f'{input_keyword}.csv',
            mime='text/csv',
        )
        # Release the oracle (and its models) before the next rerun.
        del oracle
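Because Streamlit reruns the whole script on every widget interaction, the KeyBERT and transformer models behind Keyword_oracle are rebuilt each time. A hypothetical refactor, not part of this commit, would cache the oracle with st.cache_resource; get_oracle is an invented helper, and the cache would go stale if extract_tweets rewrites the sheet for the same keyword:

# Hypothetical sketch only: cache the heavy models across Streamlit reruns.
# st.cache_resource keys on the arguments, so a refreshed sheet for the same
# keyword would need an explicit cache clear.
import streamlit as st
from inference import Keyword_oracle

@st.cache_resource
def get_oracle(keyword: str, max_kw: int) -> "Keyword_oracle":
    return Keyword_oracle(keyword,
                          keyphrase_ngram_range=(1, max_kw),
                          diversity=0.3, top_n=3)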
extract_tweets.py
ADDED
@@ -0,0 +1,50 @@
import tweepy
import math
import pandas as pd
from tweepy import Client
from openpyxl import load_workbook
import re
import streamlit as st

# Twitter API credentials are read from the Space's secrets.
api_key = st.secrets['api_key']
api_key_secret = st.secrets['api_key_secret']
access_token = st.secrets['access_token']
access_token_secret = st.secrets['access_token_secret']

auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

def preprocess(tweets):
    # Normalise mentions and links into placeholder tokens so the
    # sentiment model sees uniform text.
    processed_tweets = []
    for tweet in tweets.split():
        tweet = '@user' if tweet.startswith('@') and len(tweet) > 1 else tweet
        tweet = 'http' if tweet.startswith('http') else tweet
        processed_tweets.append(tweet)
    return " ".join(processed_tweets)


def extract_tweets(words, date_since, date_until, num_tweets=300):
    # Note: since_id expects a tweet ID in the v1.1 search API, so passing
    # a date here may not filter by date as intended.
    tweets = tweepy.Cursor(
        api.search_tweets,
        words, lang="en",
        since_id=date_since,
        until=date_until,
        tweet_mode='extended').items(num_tweets)
    tweet_cont, tweet_rt, tweet_heart = [], [], []
    for tweet in tweets:
        try:
            tweet_cont.append(preprocess(tweet.full_text))
            tweet_rt.append(tweet.retweet_count)
            # Retweets carry the like count on the original status.
            tweet_heart.append(tweet.retweeted_status.favorite_count)
        except AttributeError:
            # Plain tweets have no retweeted_status: score them as zero favourites.
            tweet_heart.append(0)
    data = {
        'Tweet': tweet_cont,
        'Retweet': tweet_rt,
        'Favs': tweet_heart
    }
    df = pd.DataFrame(data)
    with pd.ExcelWriter(f'sheets/{words}.xlsx') as writer:
        df.to_excel(writer)
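For reference, preprocess collapses mentions and links into placeholder tokens before the text reaches the sentiment model. A quick illustrative call (the sample tweet is made up):

sample = "@alice loved this https://t.co/abc123 so much"
print(preprocess(sample))
# -> "@user loved this http so much"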
inference.py
ADDED
@@ -0,0 +1,113 @@
from keybert import KeyBERT
from sen_model import Sentiment
from sampling import sampling_inference
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mplcyberpunk
from adjustText import adjust_text


class Keyword_oracle():
    def __init__(self, file_name,
                 weight_rt_fav=[1, 4],
                 noise_threshold=75,
                 words_exp=["user", "http", "rt", "fav", 'https'],
                 **kwargs
                 ):
        self.key_bert = KeyBERT()
        self.file_name = file_name
        self.keybert_args = kwargs
        self.weight_rt_fav = weight_rt_fav
        self.raw_tweets = sampling_inference(file_name).sampled_df()
        # Larger top_n yields more keys, so use a stricter noise percentile.
        self.noise_threshold = noise_threshold if kwargs['top_n'] == 1 else 90 if kwargs['top_n'] == 2 else 95
        self.tweets = self.raw_tweets['Tweet']
        self.retweet = self.raw_tweets['Retweet']
        self.favs = self.raw_tweets['Favs']
        self.sentiment_eval = self.__sentiment_eval__()
        self.words_exp = words_exp
        self.mined_tweets = self.__tweets_mined__()
        self.denoised_df = self.__denoised_df__()
        self.percentiles = self.__find_threshold__()
        self.categorical = self.__categorical__()

    def __sentiment_eval__(self):
        return Sentiment(self.tweets)

    def __tweets_mined__(self):
        raw_keywords = self.key_bert.extract_keywords(self.tweets,
                                                      keyphrase_ngram_range=self.keybert_args['keyphrase_ngram_range'],
                                                      diversity=self.keybert_args['diversity'],
                                                      top_n=self.keybert_args['top_n']
                                                      )
        key_words, engagement, acum_sents = [], [], []
        for keys, retweet, fav, sent in zip(raw_keywords, self.retweet, self.favs, self.sentiment_eval):
            for key in keys:
                # Skip keyphrases containing placeholder or filler tokens.
                if not set(key[0].split()).intersection(set(self.words_exp)):
                    key_words.append(key[0])
                    engagement.append(1 + retweet / self.weight_rt_fav[0] + fav / self.weight_rt_fav[1])
                    # Sentiment weighted by the same engagement terms.
                    acum_sents.append(sent + retweet / self.weight_rt_fav[0] * sent + fav / self.weight_rt_fav[1] * sent)
        key_word_data = {
            "Key": key_words,
            'engagement': engagement,
            'emotions overall': acum_sents
        }
        return pd.DataFrame(key_word_data).groupby(['Key'], as_index=False).sum()

    def __denoised_df__(self):
        df = self.mined_tweets
        tweets = df['engagement']
        percentile = np.percentile(tweets, self.noise_threshold)
        return df[tweets > percentile].reset_index(drop=True)

    def __find_threshold__(self):
        df = self.mined_tweets
        tweets = df['emotions overall']
        top_threshold = self.noise_threshold
        bottom_threshold = 100 - top_threshold
        # Widen the percentile window until the top percentile is positive,
        # capping at the 95th percentile.
        while np.percentile(tweets, top_threshold) <= 0 and top_threshold < 95:
            top_threshold += 5
            bottom_threshold -= 5
        if np.percentile(tweets, top_threshold) <= 0:
            # No positive mass even at the cap: disable both cut-offs.
            return 0, 0
        return np.percentile(tweets, bottom_threshold), np.percentile(tweets, top_threshold)

    def __categorical__(self):
        df = self.denoised_df
        tweets = df['emotions overall'].to_numpy()
        categorical = ['neutral', 'positive', 'negative']
        bottom_threshold, top_threshold = self.percentiles
        pos = (tweets >= top_threshold) if top_threshold > 0 else np.zeros(tweets.shape[0])
        neg = (tweets <= bottom_threshold) * -1 if bottom_threshold < 0 else np.zeros(tweets.shape[0])
        # 1 -> 'positive', 0 -> 'neutral', -1 wraps around to 'negative'.
        numerical = pos + neg
        return [categorical[index] for index in numerical.astype(int)]

    def return_table(self):
        self.denoised_df['Categorical'] = self.__categorical__()
        return self.denoised_df.sort_values(by=['emotions overall'], ascending=False).reset_index(drop=True)

    def plot(self):
        df = self.denoised_df
        plt.style.use("cyberpunk")
        keys = df['Key']
        x, y = df['engagement'], df['emotions overall']
        fig, ax = plt.subplots()
        ax.scatter(x, y)
        texts = [plt.text(x_value, y_value, key_value) for x_value, y_value, key_value in zip(x, y, keys)]
        adjust_text(texts)
        bottom_threshold, top_threshold = self.percentiles
        if bottom_threshold < 0:
            plt.axhline(bottom_threshold, c="red", marker='.', linestyle=':')
        if top_threshold > 0:
            plt.axhline(top_threshold, c="magenta", marker='.', linestyle=':')
        plt.title(f"Denoised sentiment analysis of {self.file_name}")
        plt.xlabel("Engagement")
        plt.ylabel("Emotions Overall")
        return fig


if __name__ == "__main__":
    file_name = 'Graham Potter'
    oracle = Keyword_oracle(file_name,
                            keyphrase_ngram_range=(1, 2),
                            diversity=0.3, top_n=3)
    oracle.plot()
    print(oracle.return_table())
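The labelling in __categorical__ relies on Python's negative indexing: the pos/neg masks sum to 1, 0, or -1 per key, and index -1 wraps around to the last list element. A minimal standalone sketch of the trick:

import numpy as np

categorical = ['neutral', 'positive', 'negative']
numerical = np.array([1.0, 0.0, -1.0])     # summed pos/neg masks
print([categorical[int(i)] for i in numerical])
# -> ['positive', 'neutral', 'negative']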
sampling.py
ADDED
@@ -0,0 +1,34 @@
import random
import pandas as pd
import numpy as np
import scipy.stats
np.set_printoptions(suppress=True)


class sampling_inference():
    def __init__(self, file_name, weight=[1, 4]):
        self.raw_tweets = pd.read_excel(f"sheets/{file_name}.xlsx")
        self.weight = weight
        self.engagement = self.__engagement__()
        self.perc = self.__eval_perc__()
        self.perc_thres = np.percentile(self.engagement, self.perc)

    def __engagement__(self):
        # Weighted engagement score: with the default weight of [1, 4],
        # retweets count four times as much as favourites.
        raw_retweets = self.raw_tweets['Retweet'].to_numpy()
        raw_favs = self.raw_tweets['Favs'].to_numpy()
        engagement = raw_retweets / self.weight[0] + raw_favs / self.weight[1]
        return engagement

    def __eval_perc__(self, perc=75):
        # Raise the percentile until it separates zero-engagement tweets,
        # capping at the 95th percentile.
        engagement = self.engagement
        while np.percentile(engagement, perc) == 0 and perc < 95:
            perc += 5
        return perc

    def sampled_df(self):
        # Keep every row at or above the engagement threshold, plus an
        # equal-sized random sample of the rows below it.
        above_perc = np.where(self.engagement >= self.perc_thres)[0]
        below_perc = np.where(self.engagement < self.perc_thres)[0].tolist()
        below_perc = np.array(random.sample(below_perc, above_perc.shape[0]))
        sampled_rows = np.concatenate((above_perc, below_perc))
        sampled_df = self.raw_tweets.loc[sampled_rows].reset_index(drop=True)
        # Drop the index column written by to_excel.
        del sampled_df['Unnamed: 0']
        return sampled_df
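sampled_df balances the dataset by keeping every high-engagement row plus an equally sized random sample from below the threshold. A toy run of that step with made-up engagement scores:

import random
import numpy as np

engagement = np.array([0, 0, 1, 5, 9, 0, 2, 7])
threshold = np.percentile(engagement, 75)            # 5.5 for this toy data
above = np.where(engagement >= threshold)[0]         # rows 4 and 7
below = np.where(engagement < threshold)[0].tolist()
below = np.array(random.sample(below, above.shape[0]))
print(np.concatenate((above, below)))                # 2 high + 2 random low rows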
sen_model.py
ADDED
@@ -0,0 +1,13 @@
from transformers import pipeline
import numpy as np

# BERTweet fine-tuned for tweet sentiment classification (NEG/NEU/POS).
specific_model = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")


def Sentiment(tweets):
    # Map the pipeline's labels onto signed scores:
    # NEG -> -1, NEU -> 0, POS -> 1.
    output_model = specific_model(tweets.tolist())
    labels = ["NEG", "NEU", "POS"]
    idx = []
    for output in output_model:
        idx.append(labels.index(output["label"]) - 1)
    return np.array(idx)
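The signed scores come from the position of the predicted label in the labels list, shifted down by one; inference.py then scales these by engagement. The mapping in isolation:

labels = ["NEG", "NEU", "POS"]
print({label: labels.index(label) - 1 for label in labels})
# -> {'NEG': -1, 'NEU': 0, 'POS': 1}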