Spaces:
Runtime error
Runtime error
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import nltk | |
from sklearn.feature_extraction.text import CountVectorizer | |
import re | |
from nltk.corpus import stopwords | |
from nltk.stem import SnowballStemmer | |
# Download the NLTK resources needed by preprocess_text.
# 'punkt' / 'punkt_tab' back nltk.word_tokenize: NLTK releases from 3.8.2 on
# load the 'punkt_tab' tables instead of the pickled 'punkt' models, so both
# are requested to keep tokenization working across NLTK versions.
# 'stopwords' provides the English stop-word list.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)
# Function to preprocess text
# Lazily-filled cache: preprocess_text is applied once per DataFrame row, so
# the stop-word set and the stemmer are built on first use and then reused.
_PREPROCESS_CACHE = {}


def _get_text_tools():
    """Return the cached ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # Build both objects before storing either, so a failure while
        # loading one cannot leave a half-filled cache behind.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = SnowballStemmer('english')
        _PREPROCESS_CACHE.update(stop_words=stop_words, stemmer=stemmer)
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


def preprocess_text(text):
    """Normalise one piece of text into a space-separated string of stems.

    Steps, in order: lowercase; drop URLs, @mentions and #hashtags; replace
    every non-alphabetic character with a space; tokenize with
    ``nltk.word_tokenize``; remove English stop words; stem each remaining
    token with the English Snowball stemmer.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or a NaN float from an empty CSV cell) is treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The processed tokens joined by single spaces; ``''`` when nothing
        survives the filtering or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove @ mentions
    text = re.sub(r'@\S+', '', text)
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    stop_words, stemmer = _get_text_tools()
    # Tokenize, drop stop words, then stem what is left
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [
        stemmer.stem(word) for word in tokens if word not in stop_words
    ]
    # Join tokens into a single string
    return ' '.join(stemmed_tokens)
# Function to load and preprocess DataFrame
def load_and_preprocess_dataframe(file_path):
    """Read a CSV and add a ``processed_text`` column derived from ``text``.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with an extra ``processed_text`` column holding
        ``preprocess_text`` applied to each value of the ``text`` column.
    """
    frame = pd.read_csv(file_path)
    cleaned = frame['text'].apply(preprocess_text)
    frame['processed_text'] = cleaned
    return frame
# Function to create Document-Term Matrix (DTM)
def create_dtm(dataframe):
    """Build a document-term matrix from the ``processed_text`` column.

    Args:
        dataframe: DataFrame containing a ``processed_text`` column.

    Returns:
        A ``(dtm, vectorizer)`` tuple: the matrix produced by
        ``CountVectorizer.fit_transform`` and the fitted vectorizer itself.
    """
    count_vectorizer = CountVectorizer()
    documents = dataframe['processed_text']
    term_counts = count_vectorizer.fit_transform(documents)
    return term_counts, count_vectorizer
# Function to plot word frequency
def plot_word_frequency(dtm, vectorizer):
    """Render a bar chart of the 20 most frequent terms in the Streamlit app.

    Args:
        dtm: Document-term matrix; it is summed over axis 0 to obtain one
            total count per term.
        vectorizer: The fitted vectorizer; its ``get_feature_names_out()``
            supplies the term for each column of ``dtm``.

    Returns:
        None. The chart is drawn with ``st.pyplot``.
    """
    # Sum word frequencies over all documents and flatten to 1-D
    totals = np.ravel(dtm.sum(axis=0))
    words = vectorizer.get_feature_names_out()

    # Create DataFrame, sort by frequency and keep the 20 largest
    word_freq_df = pd.DataFrame({'word': words, 'frequency': totals})
    top_words = word_freq_df.sort_values(by='frequency', ascending=False).head(20)

    # Draw on an explicit Figure/Axes and hand that figure to Streamlit:
    # calling st.pyplot() without a figure relies on the global pyplot
    # figure, which Streamlit has deprecated.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='word', y='frequency', data=top_words, ax=ax)
    ax.set_title('Top 20 Words Frequency')
    ax.set_xlabel('Words')
    ax.set_ylabel('Frequency')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    st.pyplot(fig)
    # Release the figure so repeated calls do not accumulate open figures
    plt.close(fig)
# Streamlit app
def main():
    """Run the Streamlit page: upload a CSV and optionally analyse its text.

    With no file uploaded only the title and the sidebar uploader are shown.
    After an upload, the sidebar offers a "Preprocess DataFrame" checkbox:

    * unchecked - the first rows of the raw DataFrame are displayed;
    * checked - the ``text`` column is run through ``preprocess_text``, the
      first rows are displayed, a document-term matrix is built with
      ``create_dtm`` and the word-frequency chart is drawn.

    A CSV without a ``text`` column, or a ``ValueError`` raised while
    building the matrix, is reported in the page instead of crashing it.
    """
    st.title("Tweet Data Analysis")
    st.sidebar.header("Upload CSV file")
    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    st.sidebar.success('File uploaded successfully!')
    st.sidebar.markdown("### Preprocessing Options")
    preprocess_checkbox = st.sidebar.checkbox("Preprocess DataFrame")

    # Load DataFrame
    df = pd.read_csv(uploaded_file)

    if not preprocess_checkbox:
        st.subheader("Original DataFrame")
        st.write(df.head())
        return

    # Preprocessing reads df['text']; fail with a readable message rather
    # than an unhandled KeyError when the column is absent.
    if 'text' not in df.columns:
        st.error("The uploaded CSV has no 'text' column, so it cannot be preprocessed.")
        return

    # Preprocess DataFrame
    df['processed_text'] = df['text'].apply(preprocess_text)
    st.subheader("Preprocessed DataFrame")
    st.write(df.head())

    # Create DTM
    st.subheader("Document-Term Matrix (DTM)")
    try:
        dtm, vectorizer = create_dtm(df)
    except ValueError as err:
        # NOTE(review): CountVectorizer is expected to raise ValueError when
        # no terms remain after preprocessing (empty vocabulary) - confirm.
        st.warning(f"Could not build the document-term matrix: {err}")
        return
    st.write(dtm)

    # Plot word frequency
    st.subheader("Word Frequency Plot")
    plot_word_frequency(dtm, vectorizer)
# Start the app only when this file is executed as a script
if __name__ == "__main__":
    main()