import html
import itertools

import pandas as pd
import plotly.express as px
# import base64
import pybase64
import snscrape.modules.twitter as sntwitter
import streamlit as st
# Page chrome: browser-tab title and the on-page heading.
st.set_page_config(page_title="Scraping Twitter")
st.title('Scraping Twitter')

# Search parameters supplied by the user: the query text and the upper
# bound on how many tweets to fetch.
query = st.text_input(label='Enter a search query:', value='data science')
num_tweets = st.number_input(
    label='Number of tweets to scrape:',
    min_value=1,
    max_value=1000000,
    step=1,
)
# Scrape tweets and store data in a dataframe
def scrape_tweets(query, num_tweets, lang='id'):
    """Collect up to ``num_tweets`` search results for ``query`` in a DataFrame.

    Args:
        query: Search expression handed to snscrape's Twitter search scraper.
        num_tweets: Maximum number of tweets to collect. The value is
            converted with ``int()``; negative values are treated as zero.
        lang: Language code appended to the query as a ``lang:`` operator.
            Defaults to ``'id'``, which reproduces the previously hard-coded
            filter. Pass ``None`` or ``''`` to search without a language
            filter.

    Returns:
        A ``pandas.DataFrame`` with one row per tweet and the columns
        'Tweet Id', 'Datetime', 'Text', 'Username', 'Followers', 'URL' and
        'User Id'. The frame has no rows when nothing was collected.
    """
    limit = max(int(num_tweets), 0)
    search = f'{query} lang:{lang}' if lang else query
    scraper = sntwitter.TwitterSearchScraper(search)

    # islice stops consuming the scraper as soon as `limit` items have been
    # taken; the former enumerate()/break loop always pulled one extra tweet
    # from the generator before it noticed the limit was reached.
    # NOTE(review): newer snscrape releases appear to rename `content` to
    # `rawContent` - confirm against the installed version.
    rows = [
        [tweet.id, tweet.date, tweet.content, tweet.user.username,
         tweet.user.followersCount, tweet.url, tweet.user.id]
        for tweet in itertools.islice(scraper.get_items(), limit)
    ]
    return pd.DataFrame(rows, columns=[
        'Tweet Id', 'Datetime', 'Text', 'Username', 'Followers', 'URL',
        'User Id'])
def _download_link(payload, mime, filename, label):
    """Return an HTML anchor that downloads ``payload`` through a data URI.

    Args:
        payload: Text content of the file to offer.
        mime: MIME type placed in the data URI.
        filename: Suggested file name; HTML-escaped because it is built from
            the user's query and ends up inside an attribute value.
        label: Visible link text.
    """
    encoded = pybase64.b64encode(payload.encode()).decode()
    safe_name = html.escape(filename, quote=True)
    return (f'<a href="data:{mime};base64,{encoded}" '
            f'download="{safe_name}">{label}</a>')


def _render_results(tweets_df, query):
    """Render the table, charts, sidebar drill-down and download links.

    Args:
        tweets_df: Non-empty DataFrame as returned by ``scrape_tweets``.
        query: The search text the tweets were scraped with; used only to
            build the download file names.
    """
    # Display data
    st.write(tweets_df)

    # assign() builds a new frame, so the DataFrame kept in session state is
    # not mutated and looks the same on every rerun.
    tweets_df = tweets_df.assign(Date=tweets_df['Datetime'].dt.date)

    # Line plot of tweet count over time
    tweets_by_date = tweets_df.groupby(
        ['Date'])['Tweet Id'].count().reset_index()
    st.plotly_chart(px.line(tweets_by_date, x='Date', y='Tweet Id'))

    # Scatter plot of author follower count against tweet id
    st.plotly_chart(px.scatter(tweets_df, x='Followers', y='Tweet Id'))

    # Username selection; the "interaction" count is the number of scraped
    # tweets authored by the selected user.
    st.sidebar.title('Username and Interaction Count')
    selected_username = st.sidebar.selectbox(
        'Select a username:', options=tweets_df['Username'].unique())
    user_tweets = tweets_df[tweets_df['Username'] == selected_username]
    st.sidebar.write(
        f'Interactions with @{selected_username}: {user_tweets.shape[0]}')

    # Tweet count per username, most active first
    interactions_by_user = tweets_df.groupby(['Username'])['Tweet Id'].count(
    ).reset_index().sort_values(by=['Tweet Id'], ascending=False)
    st.plotly_chart(px.bar(interactions_by_user, x='Username', y='Tweet Id'))

    # Tweet count of the selected username over time
    user_by_date = user_tweets.groupby(
        ['Date'])['Tweet Id'].count().reset_index()
    st.plotly_chart(px.line(user_by_date, x='Date', y='Tweet Id'))

    # Tweets of the selected username
    st.write(f'Tweets involving @{selected_username}:')
    st.write(user_tweets)

    # Download links. They are data-URI anchors, so clicking one does not
    # trigger a Streamlit rerun.
    csv_name = f'tweets_{query.replace(" ", "_")}.csv'
    st.markdown(
        _download_link(tweets_df.to_csv(index=False), 'text/csv',
                       csv_name, 'Download CSV'),
        unsafe_allow_html=True)

    txt = "\n\n".join(tweets_df['Text'].astype(str).tolist())
    txt_name = f"{query}_tweets.txt"
    st.markdown(
        _download_link(txt, 'text/plain', txt_name, 'Download TXT'),
        unsafe_allow_html=True)


# st.button() is True only on the run triggered by the click. Every later
# widget interaction (such as the sidebar selectbox) reruns the script with
# the button False again, so the scraped data is kept in session state
# instead of living only inside the button branch.
if st.button('Scrape Tweets'):
    st.session_state['tweets_df'] = scrape_tweets(query, num_tweets)
    # Remember the query that produced the data so file names stay correct
    # even if the text box is edited afterwards without re-scraping.
    st.session_state['tweets_query'] = query
    st.success('Scraping done!')

if 'tweets_df' in st.session_state:
    if st.session_state['tweets_df'].empty:
        # An empty frame has object-dtype columns, so the `.dt` accessor
        # used for the charts would raise; report the situation instead.
        st.warning('No tweets found for this query.')
    else:
        _render_results(st.session_state['tweets_df'],
                        st.session_state['tweets_query'])