File size: 4,025 Bytes
87010b2 adf9913 87010b2 14c27e2 8cdafe3 0833807 7b13c08 f52e3c9 87010b2 43f117b 87010b2 14c27e2 87010b2 73176c6 87010b2 14c27e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
import tweepy
import os
from plotly.subplots import make_subplots
from transformers import pipeline
consumer_key = "sHz78Xj5Dl41cqfzEHVoRcaKo"
consumer_secret = "3y5caZfu91nmB2MNH7mDSu5Cgf5qaVRpMfbDoCPW4dU7E46k03"
access_key = "1116912581434695680-x359MscPSdqEcJzoIlg4jMsCZRdyNX"
access_secret = "wEsALFUava2TnYXWnuacrzSK4eiYfJUFLBRWPqGuMRnTz"
auth = tweepy.OAuthHandler(consumer_key,consumer_secret)
auth.set_access_token(access_key,access_secret)
api = tweepy.API(auth)
def get_tweets(username, count):
tweets = tweepy.Cursor(
api.user_timeline,
screen_name=username,
tweet_mode="extended",
exclude_replies=True,
include_rts=False,
).items(count)
tweets = list(tweets)
response = {
"tweets": [tweet.full_text.replace("\n", "").lower() for tweet in tweets],
"timestamps": [str(tweet.created_at) for tweet in tweets],
"retweets": [tweet.retweet_count for tweet in tweets],
"likes": [tweet.favorite_count for tweet in tweets],
}
return response
def get_sentiment(texts):
preds = pipe(texts)
response = dict()
response["labels"] = [pred["label"] for pred in preds]
response["scores"] = [pred["score"] for pred in preds]
return response
def neutralise_sentiment(preds):
for i, (label, score) in enumerate(zip(preds["labels"], preds["scores"])):
if score < 0.5:
preds["labels"][i] = "neutral"
preds["scores"][i] = 1.0 - score
def get_aggregation_period(df):
t_min, t_max = df["timestamps"].min(), df["timestamps"].max()
t_delta = t_max - t_min
if t_delta < pd.to_timedelta("30D"):
return "1D"
elif t_delta < pd.to_timedelta("365D"):
return "7D"
else:
return "30D"
@st.cache_data
def load_model():
pipe = pipeline(task="sentiment-analysis", model="bhadresh-savani/distilbert-base-uncased-emotion")
return pipe
"""
# Twitter Emotion Analyser
"""
pipe = load_model()
twitter_handle = st.sidebar.text_input("Twitter handle:", "elonmusk")
twitter_count = st.sidebar.selectbox("Number of tweets:", (10, 30, 50, 100))
if st.sidebar.button("Get tweets!"):
tweets = get_tweets(twitter_handle, twitter_count)
preds = get_sentiment(tweets["tweets"])
# neutralise_sentiment(preds)
tweets.update(preds)
# dataframe creation + preprocessing
df = pd.DataFrame(tweets)
df["timestamps"] = pd.to_datetime(df["timestamps"])
# plots
agg_period = get_aggregation_period(df)
ts_sentiment = (
df.groupby(["timestamps", "labels"])
.count()["likes"]
.unstack()
.resample(agg_period)
.count()
.stack()
.reset_index()
)
ts_sentiment.columns = ["timestamp", "label", "count"]
fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.15)
# TODO: check that stacking makes sense!
for label in ts_sentiment["label"].unique():
fig.add_trace(
go.Scatter(
x=ts_sentiment.query("label == @label")["timestamp"],
y=ts_sentiment.query("label == @label")["count"],
mode="lines",
name=label,
stackgroup="one",
hoverinfo="x+y",
),
row=1,
col=1,
)
likes_per_label = df.groupby("labels")["likes"].mean().reset_index()
fig.add_trace(
go.Bar(
x=likes_per_label["labels"],
y=likes_per_label["likes"],
showlegend=False,
marker_color=px.colors.qualitative.Plotly,
opacity=0.6,
),
row=1,
col=2,
)
fig.update_yaxes(title_text="Number of Tweets", row=1, col=1)
fig.update_yaxes(title_text="Number of Likes", row=1, col=2)
fig.update_layout(height=350, width=750)
st.plotly_chart(fig)
# tweet sample
st.markdown(df.sample(n=5).to_markdown()) |