"""Streamlit app that preprocesses tweet text from a CSV and plots word frequencies."""

import functools
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Download NLTK punkt tokenizer and stopwords
nltk.download('punkt')
nltk.download('stopwords')


@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, built once per process."""
    return frozenset(stopwords.words('english'))


@functools.lru_cache(maxsize=None)
def _english_stemmer():
    """Return a shared English Snowball stemmer, built once per process."""
    return SnowballStemmer('english')


# Function to preprocess text
def preprocess_text(text):
    """Normalize one piece of text into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, strip URLs, @mentions and #hashtags, replace
    every non-alphabetic character with a space, tokenize, drop English
    stopwords, and stem what is left.

    Args:
        text: The raw text. Non-string values (for example the NaN that
            pandas uses for a missing CSV cell) are treated as empty text.

    Returns:
        The processed text; an empty string when nothing survives.
    """
    # Missing cells arrive as float NaN, which has no .lower(); treat as empty.
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove @ mentions
    text = re.sub(r'@\S+', '', text)
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    tokens = nltk.word_tokenize(text)

    # The stopword set and stemmer are cached so they are not rebuilt per row.
    stop_words = _english_stopwords()
    stemmer = _english_stemmer()
    return ' '.join(
        stemmer.stem(word) for word in tokens if word not in stop_words
    )


# Function to load and preprocess DataFrame
def load_and_preprocess_dataframe(file_path):
    """Read a CSV and add a ``processed_text`` column derived from ``text``.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        The loaded DataFrame with the extra ``processed_text`` column.

    Raises:
        KeyError: If the CSV has no ``text`` column.
    """
    dataframe = pd.read_csv(file_path)
    dataframe['processed_text'] = dataframe['text'].apply(preprocess_text)
    return dataframe


# Function to create Document-Term Matrix (DTM)
def create_dtm(dataframe):
    """Build a document-term matrix from the ``processed_text`` column.

    Args:
        dataframe: DataFrame containing a ``processed_text`` column.

    Returns:
        A ``(dtm, vectorizer)`` tuple: the matrix returned by
        ``CountVectorizer.fit_transform`` and the fitted vectorizer.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when no vocabulary
            can be built (for example every document is empty).
    """
    vectorizer = CountVectorizer()
    dtm = vectorizer.fit_transform(dataframe['processed_text'])
    return dtm, vectorizer


# Function to plot word frequency
def plot_word_frequency(dtm, vectorizer, top_n=20):
    """Render a bar chart of the most frequent words into the Streamlit page.

    Args:
        dtm: Document-term matrix as returned by ``create_dtm``.
        vectorizer: The fitted vectorizer that produced ``dtm``.
        top_n: How many of the most frequent words to show. Defaults to 20.
    """
    # Column sums give the total count per vocabulary term; flatten to 1-D.
    word_freq = np.asarray(dtm.sum(axis=0)).ravel()
    words = vectorizer.get_feature_names_out()

    word_freq_df = pd.DataFrame({'word': words, 'frequency': word_freq})
    word_freq_df = word_freq_df.sort_values(by='frequency', ascending=False)

    # Draw on an explicit figure: calling st.pyplot() without one relies on
    # matplotlib's global state, which Streamlit has deprecated.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='word', y='frequency', data=word_freq_df.head(top_n), ax=ax)
    ax.set_title(f'Top {top_n} Words Frequency')
    ax.set_xlabel('Words')
    ax.set_ylabel('Frequency')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    st.pyplot(fig)
    # Release the figure so repeated script reruns do not accumulate figures.
    plt.close(fig)


# Streamlit app
def main():
    """Run the app: upload a CSV, optionally preprocess it, and show results."""
    st.title("Tweet Data Analysis")
    st.sidebar.header("Upload CSV file")
    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")

    if uploaded_file is None:
        return

    st.sidebar.success('File uploaded successfully!')
    st.sidebar.markdown("### Preprocessing Options")
    preprocess_checkbox = st.sidebar.checkbox("Preprocess DataFrame")

    # Load DataFrame
    df = pd.read_csv(uploaded_file)

    if not preprocess_checkbox:
        st.subheader("Original DataFrame")
        st.write(df.head())
        return

    # Preprocessing reads the 'text' column; report its absence in the app
    # instead of surfacing a KeyError traceback.
    if 'text' not in df.columns:
        st.error("The uploaded CSV must contain a 'text' column.")
        return

    # Preprocess DataFrame
    df['processed_text'] = df['text'].apply(preprocess_text)
    st.subheader("Preprocessed DataFrame")
    st.write(df.head())

    # Create DTM
    st.subheader("Document-Term Matrix (DTM)")
    try:
        dtm, vectorizer = create_dtm(df)
    except ValueError:
        # CountVectorizer raises when no terms survive preprocessing.
        st.warning("No words left after preprocessing; nothing to plot.")
        return
    st.write(dtm)

    # Plot word frequency
    st.subheader("Word Frequency Plot")
    plot_word_frequency(dtm, vectorizer)


if __name__ == "__main__":
    main()