Spaces:

Ahtisham1583
/

Social_media_senti

Runtime error

App Files Files Community

Ahtisham1583 committed on May 10, 2024

Commit

013f8ea

verified ·

1 Parent(s): 2599dc0

Create app.py

Browse files

Files changed (1) hide show

app.py +105 -0

app.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import nltk
+from sklearn.feature_extraction.text import CountVectorizer
+import re
+from nltk.corpus import stopwords
+from nltk.stem import SnowballStemmer
+# Download NLTK punkt tokenizer and stopwords
+nltk.download('punkt')
+nltk.download('stopwords')
+# Function to preprocess text
+def preprocess_text(text):
+    # Convert text to lowercase
+    text = text.lower()
+    # Remove URLs
+    text = re.sub(r'http\S+', '', text)
+    # Remove @ mentions
+    text = re.sub(r'@\S+', '', text)
+    # Remove hashtags
+    text = re.sub(r'#\S+', '', text)
+    # Remove non-alphabetic characters
+    text = re.sub(r'[^a-zA-Z]', ' ', text)
+    # Tokenize text
+    tokens = nltk.word_tokenize(text)
+    # Remove stopwords
+    stop_words = set(stopwords.words('english'))
+    filtered_tokens = [word for word in tokens if word not in stop_words]
+    # Stemming
+    stemmer = SnowballStemmer('english')
+    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    # Join tokens into a single string
+    processed_text = ' '.join(stemmed_tokens)
+    return processed_text
+# Function to load and preprocess DataFrame
+def load_and_preprocess_dataframe(file_path):
+    dataframe = pd.read_csv(file_path)
+    dataframe['processed_text'] = dataframe['text'].apply(preprocess_text)
+    return dataframe
+# Function to create Document-Term Matrix (DTM)
+def create_dtm(dataframe):
+    vectorizer = CountVectorizer()
+    dtm = vectorizer.fit_transform(dataframe['processed_text'])
+    return dtm, vectorizer
+# Function to plot word frequency
+def plot_word_frequency(dtm, vectorizer):
+    # Sum word frequencies
+    word_freq = dtm.sum(axis=0)
+    words = vectorizer.get_feature_names_out()
+    # Create DataFrame
+    word_freq_df = pd.DataFrame({'word': words, 'frequency': np.ravel(word_freq)})
+    # Sort by frequency
+    word_freq_df = word_freq_df.sort_values(by='frequency', ascending=False)
+    # Plot
+    plt.figure(figsize=(10, 6))
+    sns.barplot(x='word', y='frequency', data=word_freq_df.head(20))
+    plt.title('Top 20 Words Frequency')
+    plt.xlabel('Words')
+    plt.ylabel('Frequency')
+    plt.xticks(rotation=45, ha='right')
+    plt.tight_layout()
+    st.pyplot()
+# Streamlit app
+def main():
+    st.title("Tweet Data Analysis")
+    st.sidebar.header("Upload CSV file")
+    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
+    if uploaded_file is not None:
+        st.sidebar.success('File uploaded successfully!')
+        st.sidebar.markdown("### Preprocessing Options")
+        preprocess_checkbox = st.sidebar.checkbox("Preprocess DataFrame")
+        # Load DataFrame
+        df = pd.read_csv(uploaded_file)
+        if preprocess_checkbox:
+            # Preprocess DataFrame
+            df['processed_text'] = df['text'].apply(preprocess_text)
+            st.subheader("Preprocessed DataFrame")
+            st.write(df.head())
+            # Create DTM
+            st.subheader("Document-Term Matrix (DTM)")
+            dtm, vectorizer = create_dtm(df)
+            st.write(dtm)
+            # Plot word frequency
+            st.subheader("Word Frequency Plot")
+            plot_word_frequency(dtm, vectorizer)
+        else:
+            st.subheader("Original DataFrame")
+            st.write(df.head())
+if __name__ == "__main__":
+    main()