Spaces:

Ahtisham1583
/

Social_media_senti

Runtime error

App Files Files Community

Ahtisham1583 committed on May 10, 2024

Commit

ec3726f

verified ·

1 Parent(s): a9b1505

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -85

app.py CHANGED Viewed

@@ -1,105 +1,54 @@
-import streamlit as st
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
-import seaborn as sns
-import nltk
 from sklearn.feature_extraction.text import CountVectorizer
 import re
 from nltk.corpus import stopwords
 from nltk.stem import SnowballStemmer
-# Download NLTK punkt tokenizer and stopwords
-nltk.download('punkt')
 nltk.download('stopwords')
 # Function to preprocess text
 def preprocess_text(text):
-    # Convert text to lowercase
-    text = text.lower()
-    # Remove URLs
-    text = re.sub(r'http\S+', '', text)
-    # Remove @ mentions
-    text = re.sub(r'@\S+', '', text)
-    # Remove hashtags
-    text = re.sub(r'#\S+', '', text)
-    # Remove non-alphabetic characters
-    text = re.sub(r'[^a-zA-Z]', ' ', text)
-    # Tokenize text
-    tokens = nltk.word_tokenize(text)
-    # Remove stopwords
     stop_words = set(stopwords.words('english'))
-    filtered_tokens = [word for word in tokens if word not in stop_words]
-    # Stemming
     stemmer = SnowballStemmer('english')
-    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
-    # Join tokens into a single string
-    processed_text = ' '.join(stemmed_tokens)
-    return processed_text
-# Function to load and preprocess DataFrame
-def load_and_preprocess_dataframe(file_path):
-    dataframe = pd.read_csv(file_path)
-    dataframe['processed_text'] = dataframe['text'].apply(preprocess_text)
-    return dataframe
-# Function to create Document-Term Matrix (DTM)
-def create_dtm(dataframe):
-    vectorizer = CountVectorizer()
-    dtm = vectorizer.fit_transform(dataframe['processed_text'])
-    return dtm, vectorizer
-# Function to plot word frequency
-def plot_word_frequency(dtm, vectorizer):
-    # Sum word frequencies
-    word_freq = dtm.sum(axis=0)
-    words = vectorizer.get_feature_names_out()
-    # Create DataFrame
-    word_freq_df = pd.DataFrame({'word': words, 'frequency': np.ravel(word_freq)})
-    # Sort by frequency
-    word_freq_df = word_freq_df.sort_values(by='frequency', ascending=False)
-    # Plot
-    plt.figure(figsize=(10, 6))
-    sns.barplot(x='word', y='frequency', data=word_freq_df.head(20))
-    plt.title('Top 20 Words Frequency')
-    plt.xlabel('Words')
-    plt.ylabel('Frequency')
-    plt.xticks(rotation=45, ha='right')
-    plt.tight_layout()
-    st.pyplot()
-# Streamlit app
-def main():
-    st.title("Tweet Data Analysis")
-    st.sidebar.header("Upload CSV file")
-    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
-    if uploaded_file is not None:
-        st.sidebar.success('File uploaded successfully!')
-        st.sidebar.markdown("### Preprocessing Options")
-        preprocess_checkbox = st.sidebar.checkbox("Preprocess DataFrame")
-        # Load DataFrame
-        df = pd.read_csv(uploaded_file)
-        if preprocess_checkbox:
-            # Preprocess DataFrame
-            df['processed_text'] = df['text'].apply(preprocess_text)
-            st.subheader("Preprocessed DataFrame")
-            st.write(df.head())
-            # Create DTM
-            st.subheader("Document-Term Matrix (DTM)")
-            dtm, vectorizer = create_dtm(df)
-            st.write(dtm)
-            # Plot word frequency
-            st.subheader("Word Frequency Plot")
-            plot_word_frequency(dtm, vectorizer)
-        else:
-            st.subheader("Original DataFrame")
-            st.write(df.head())
-if __name__ == "__main__":
-    main()

 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.feature_extraction.text import CountVectorizer
 import re
 from nltk.corpus import stopwords
 from nltk.stem import SnowballStemmer
+import nltk
 nltk.download('stopwords')
+# Load data
+# data = pd.read_csv('path_to_your_data.csv')
+# Uncomment the line above unless `data` is already defined; the code below reads data['text']
 # Function to preprocess text
 def preprocess_text(text):
+    text = text.lower()  # Convert to lowercase
+    text = re.sub(r'@\w+', ' ', text)  # Remove mentions
+    text = re.sub(r'http\S+', ' ', text)  # Remove URLs
+    text = re.sub(r'[^a-z]', ' ', text)  # Keep only alphabetic characters
     stop_words = set(stopwords.words('english'))
+    words = [word for word in text.split() if word not in stop_words]
     stemmer = SnowballStemmer('english')
+    return ' '.join(stemmer.stem(word) for word in words)
+# Apply preprocessing
+data['clean_text'] = data['text'].apply(preprocess_text)
+# Create the Document-Term Matrix
+vectorizer = CountVectorizer()
+dtm = vectorizer.fit_transform(data['clean_text'])
+# Sum frequencies of each term across documents
+sum_words = dtm.sum(axis=0)
+words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
+words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
+# Convert to DataFrame
+wf = pd.DataFrame(words_freq, columns=['word', 'freq'])
+# Filter words with frequency greater than 50
+wf_filtered = wf[wf['freq'] > 50]
+# Create figure and axes for the plot
+fig, ax = plt.subplots(figsize=(12, 8))
+ax.bar(wf_filtered['word'], wf_filtered['freq'], color='skyblue')
+ax.set_xlabel('Words')
+ax.set_ylabel('Frequency')
+ax.set_title('Frequency of Terms in Text Data')
+ax.set_xticklabels(wf_filtered['word'], rotation=45, ha='right')
+# Show the plot
+plt.show()