# Spaces: Runtime error  (status text captured alongside this listing; not part of the program)
from itertools import islice

import pandas as pd
import plotly.express as px
# import base64
import pybase64
import snscrape.modules.twitter as sntwitter
import streamlit as st
# Page chrome: browser-tab title and on-page heading.
st.set_page_config(page_title='Scraping Twitter')
st.title("Scraping Twitter")

# Search controls: the query text and the maximum number of tweets to fetch.
query = st.text_input(label="Enter a search query:", value="data science")
num_tweets = st.number_input(
    label="Number of tweets to scrape:",
    min_value=1,
    max_value=1000000,
    step=1,
)
def scrape_tweets(query, num_tweets, lang='id'):
    """Collect up to ``num_tweets`` search results for ``query`` into a DataFrame.

    Args:
        query: Search expression passed to ``TwitterSearchScraper``.
        num_tweets: Maximum number of tweets to collect. Values below 1
            produce an empty frame.
        lang: Code appended to the search string as ``lang:<code>``
            (presumably Twitter's language filter -- confirm against the
            search-operator docs). Defaults to ``'id'``, the suffix this
            function has always appended; pass ``None`` or ``''`` to search
            without it.

    Returns:
        A DataFrame with the columns 'Tweet Id', 'Datetime', 'Text',
        'Username', 'Followers', 'URL' and 'User Id', one row per tweet in
        the order the scraper yields them.
    """
    search = f'{query} lang:{lang}' if lang else query
    limit = max(int(num_tweets), 0)

    # islice stops after `limit` items without asking the scraper for one
    # more; an enumerate()/break loop pulls an extra tweet before it notices
    # the limit has been reached.
    tweets = islice(sntwitter.TwitterSearchScraper(search).get_items(), limit)

    rows = [
        [tweet.id, tweet.date, tweet.content, tweet.user.username,
         tweet.user.followersCount, tweet.url, tweet.user.id]
        for tweet in tweets
    ]
    return pd.DataFrame(rows, columns=[
        'Tweet Id', 'Datetime', 'Text', 'Username', 'Followers', 'URL',
        'User Id'])
def _render_results(tweets_df, query):
    """Render the table, charts, sidebar selector and downloads for a scrape.

    Args:
        tweets_df: Frame produced by ``scrape_tweets``.
        query: Search text the frame was scraped for; used only to build the
            download file names.
    """
    st.write(tweets_df)

    # An empty frame has object-dtype columns, so `.dt` below would raise
    # AttributeError; there is also nothing to chart or download.
    if tweets_df.empty:
        st.warning('No tweets were found for this query.')
        return

    # Copy so the frame kept in session state is not mutated on every rerun.
    tweets_df = tweets_df.copy()
    tweets_df['Date'] = pd.to_datetime(tweets_df['Datetime']).dt.date

    # Line plot of tweet count over time
    tweets_by_date = tweets_df.groupby('Date')['Tweet Id'].count().reset_index(
        name='Tweet Count')
    st.plotly_chart(px.line(
        tweets_by_date, x='Date', y='Tweet Count', title='Tweets per day'))

    # Per-user aggregates shared by the scatter and bar charts.
    by_user = tweets_df.groupby('Username')
    per_user = pd.DataFrame({
        'Followers': by_user['Followers'].max(),
        'Tweet Count': by_user['Tweet Id'].count(),
    }).reset_index()

    # Scatter plot of followers vs tweet count (one point per user; plotting
    # the raw 'Tweet Id' column would put tweet identifiers on the y axis).
    st.plotly_chart(px.scatter(
        per_user, x='Followers', y='Tweet Count',
        title='Followers vs tweet count'))

    # Username selection and interaction count
    st.sidebar.title('Username and Interaction Count')
    selected_username = st.sidebar.selectbox(
        'Select a username:', options=tweets_df['Username'].unique())
    user_tweets = tweets_df[tweets_df['Username'] == selected_username]
    st.sidebar.write(
        f'Interactions with @{selected_username}: {len(user_tweets)}')

    # Interaction count by username
    interactions_by_user = per_user.sort_values(
        by='Tweet Count', ascending=False)
    st.plotly_chart(px.bar(
        interactions_by_user, x='Username', y='Tweet Count',
        title='Tweets per user'))

    # Interaction count with selected username over time
    user_by_date = user_tweets.groupby('Date')['Tweet Id'].count().reset_index(
        name='Tweet Count')
    st.plotly_chart(px.line(
        user_by_date, x='Date', y='Tweet Count',
        title=f'Tweets per day by @{selected_username}'))

    # Tweets involving selected username
    st.write(f'Tweets involving @{selected_username}:')
    st.write(user_tweets)

    # Downloads. st.download_button avoids interpolating the user's query
    # into raw HTML (the file name previously went unescaped into an <a> tag
    # rendered with unsafe_allow_html) and sends a proper MIME type.
    st.download_button(
        'Download CSV',
        data=tweets_df.to_csv(index=False).encode('utf-8'),
        file_name=f'tweets_{query.replace(" ", "_")}.csv',
        mime='text/csv',
    )
    st.download_button(
        'Download TXT',
        data='\n\n'.join(tweets_df['Text'].astype(str)).encode('utf-8'),
        file_name=f'{query}_tweets.txt',
        mime='text/plain',
    )


# Streamlit re-runs the whole script on every widget interaction and
# st.button() is True only for the run triggered by the click. The scraped
# frame is therefore kept in st.session_state; otherwise changing the sidebar
# selectbox (or pressing a download button) would make the results disappear.
if st.button('Scrape Tweets'):
    st.session_state['tweets_df'] = scrape_tweets(query, num_tweets)
    st.session_state['tweets_query'] = query
    st.success('Scraping done!')

if 'tweets_df' in st.session_state:
    _render_results(
        st.session_state['tweets_df'], st.session_state['tweets_query'])