File size: 4,050 Bytes
87010b2 14c27e2 f493631 0833807 87010b2 14c27e2 87010b2 14c27e2 87010b2 14c27e2 87010b2 14c27e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
import tweepy
from plotly.subplots import make_subplots
from transformers import pipeline
# Twitter API credentials.
# Each value is read from an environment variable when one is set; the literal
# is only a fallback so that existing deployments keep working unchanged.
# SECURITY: secrets committed to source control should be treated as leaked.
# Rotate these keys and remove the fallbacks once the environment variables
# are configured.
consumer_key = os.environ.get(
    "TWITTER_CONSUMER_KEY", "sHz78Xj5Dl41cqfzEHVoRcaKo"
)
consumer_secret = os.environ.get(
    "TWITTER_CONSUMER_SECRET",
    "3y5caZfu91nmB2MNH7mDSu5Cgf5qaVRpMfbDoCPW4dU7E46k03",
)
access_key = os.environ.get(
    "TWITTER_ACCESS_KEY",
    "1116912581434695680-x359MscPSdqEcJzoIlg4jMsCZRdyNX",
)
access_secret = os.environ.get(
    "TWITTER_ACCESS_SECRET",
    "wEsALFUava2TnYXWnuacrzSK4eiYfJUFLBRWPqGuMRnTz",
)

# Build the authenticated client used by get_tweets.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
def get_tweets(username, count):
    """Fetch up to ``count`` timeline tweets for ``username``.

    The timeline is requested in extended mode with ``exclude_replies=True``
    and ``include_rts=False``.

    Returns a dict of four parallel lists:
      - "tweets": lower-cased full text with newline characters removed
      - "timestamps": ``created_at`` converted to ``str``
      - "retweets": ``retweet_count`` of each tweet
      - "likes": ``favorite_count`` of each tweet
    """
    cursor = tweepy.Cursor(
        api.user_timeline,
        screen_name=username,
        tweet_mode="extended",
        exclude_replies=True,
        include_rts=False,
    )

    texts, timestamps, retweets, likes = [], [], [], []
    for status in cursor.items(count):
        # Newlines are dropped (not replaced by a space) before lower-casing.
        texts.append(status.full_text.replace("\n", "").lower())
        timestamps.append(str(status.created_at))
        retweets.append(status.retweet_count)
        likes.append(status.favorite_count)

    return {
        "tweets": texts,
        "timestamps": timestamps,
        "retweets": retweets,
        "likes": likes,
    }
def get_sentiment(texts):
    """Classify ``texts`` with the module-level ``pipe``.

    Returns a dict with two parallel lists, "labels" and "scores", taken from
    the "label" and "score" entries of each prediction.
    """
    predictions = pipe(texts)
    return {
        "labels": [prediction["label"] for prediction in predictions],
        "scores": [prediction["score"] for prediction in predictions],
    }
def neutralise_sentiment(preds):
    """Relabel low-confidence predictions as "neutral", in place.

    ``preds`` is a dict with parallel "labels" and "scores" lists. Every entry
    whose score is below 0.5 gets the label "neutral" and the complementary
    score ``1.0 - score``. The dict's lists are mutated; nothing is returned.
    """
    labels = preds["labels"]
    scores = preds["scores"]
    for position, (_, confidence) in enumerate(zip(labels, scores)):
        if not confidence < 0.5:
            continue
        labels[position] = "neutral"
        scores[position] = 1.0 - confidence
def get_aggregation_period(df):
    """Pick a resampling period from the time span of ``df["timestamps"]``.

    Returns "1D" when the span between the earliest and latest timestamp is
    under 30 days, "7D" when it is under 365 days, and "30D" otherwise.
    """
    stamps = df["timestamps"]
    span = stamps.max() - stamps.min()

    # (exclusive upper bound on the span, period to use below that bound)
    thresholds = (("30D", "1D"), ("365D", "7D"))
    for upper_bound, period in thresholds:
        if span < pd.to_timedelta(upper_bound):
            return period
    return "30D"
@st.cache(allow_output_mutation=True)
def load_model():
    """Create the classification pipeline used by the app.

    Builds a ``transformers`` pipeline for the "sentiment-analysis" task with
    the "bhadresh-savani/distilbert-base-uncased-emotion" model. The function
    is wrapped in ``st.cache`` so the result is reused across calls.
    """
    return pipeline(
        task="sentiment-analysis",
        model="bhadresh-savani/distilbert-base-uncased-emotion",
    )
# Page title.
# NOTE(review): this bare string literal is presumably rendered as Markdown by
# Streamlit's "magic" feature — confirm that feature is enabled for this app.
"""
# Twitter Emotion Analyser
"""
# Classification pipeline; get_sentiment reads it through this global name.
pipe = load_model()
# Sidebar inputs: the account to analyse (default "huggingface") and the
# number of tweets to request.
twitter_handle = st.sidebar.text_input("Twitter handle:", "huggingface")
twitter_count = st.sidebar.selectbox("Number of tweets:", (10, 100, 500, 1000, 3200))
# Main action: fetch the tweets, classify them and render the dashboard.
if st.sidebar.button("Get tweets!"):
    tweets = get_tweets(twitter_handle, twitter_count)

    if not tweets["tweets"]:
        # Replies and retweets are filtered out by get_tweets, so an account
        # can legitimately yield nothing; the steps below need at least one row.
        st.warning("No tweets found for this handle.")
    else:
        preds = get_sentiment(tweets["tweets"])
        # Optional post-processing, currently disabled:
        # neutralise_sentiment(preds)
        tweets.update(preds)

        # dataframe creation + preprocessing
        df = pd.DataFrame(tweets)
        df["timestamps"] = pd.to_datetime(df["timestamps"])

        # plots
        agg_period = get_aggregation_period(df)
        # Count rows per (timestamp, label), pivot labels into columns, then
        # count the non-null cells of each label per aggregation period.
        # NOTE: tweets sharing the exact same timestamp and label occupy one
        # cell and are therefore counted once by the second count().
        ts_sentiment = (
            df.groupby(["timestamps", "labels"])
            .count()["likes"]
            .unstack()
            .resample(agg_period)
            .count()
            .stack()
            .reset_index()
        )
        ts_sentiment.columns = ["timestamp", "label", "count"]

        fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.15)

        # Left panel: one stacked line per label over time.
        # TODO: check that stacking makes sense!
        for label in ts_sentiment["label"].unique():
            # Select this label's rows once and reuse them for both axes.
            label_rows = ts_sentiment[ts_sentiment["label"] == label]
            fig.add_trace(
                go.Scatter(
                    x=label_rows["timestamp"],
                    y=label_rows["count"],
                    mode="lines",
                    name=label,
                    stackgroup="one",
                    hoverinfo="x+y",
                ),
                row=1,
                col=1,
            )

        # Right panel: mean number of likes per label.
        likes_per_label = df.groupby("labels")["likes"].mean().reset_index()
        fig.add_trace(
            go.Bar(
                x=likes_per_label["labels"],
                y=likes_per_label["likes"],
                showlegend=False,
                marker_color=px.colors.qualitative.Plotly,
                opacity=0.6,
            ),
            row=1,
            col=2,
        )

        fig.update_yaxes(title_text="Number of Tweets", row=1, col=1)
        fig.update_yaxes(title_text="Number of Likes", row=1, col=2)
        fig.update_layout(height=350, width=750)
        st.plotly_chart(fig)

        # tweet sample
        # DataFrame.sample raises ValueError when n exceeds the number of rows
        # (without replacement), so cap the sample size at the rows available.
        sample_size = min(5, len(df))
        st.markdown(df.sample(n=sample_size).to_markdown())