Ahtisham1583 committed on
Commit
013f8ea
·
verified ·
1 Parent(s): 2599dc0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import nltk
7
+ from sklearn.feature_extraction.text import CountVectorizer
8
+ import re
9
+ from nltk.corpus import stopwords
10
+ from nltk.stem import SnowballStemmer
11
+
12
# Fetch the NLTK resources used by preprocess_text: the Punkt tokenizer data
# behind nltk.word_tokenize and the English stop-word list.
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' package, so both are requested; a package the downloader
# cannot find is reported as a failed download (return value False) rather
# than an exception, so older installations are unaffected.
# quiet=True keeps the downloader from logging on every Streamlit rerun.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource, quiet=True)
15
+
16
+ # Function to preprocess text
17
# Lazily filled cache for the stop-word set and the stemmer (see below).
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the shared English stop-word set and Snowball stemmer.

    Both objects are built on first use and reused afterwards, so applying
    preprocess_text to a whole column does not reload the stop-word list
    and rebuild the stemmer for every row.
    """
    if not _TEXT_TOOLS:
        # Build both before storing so a failure leaves the cache empty.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = SnowballStemmer('english')
        _TEXT_TOOLS.update(stop_words=stop_words, stemmer=stemmer)
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def preprocess_text(text):
    """Normalise one piece of text into a string of stemmed tokens.

    Steps, in order: lowercase; drop URLs, @mentions and #hashtags; replace
    every non-letter with a space; tokenize; remove English stop words;
    stem each remaining token; join the stems with single spaces.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example the
            NaN that pandas uses for a missing cell, or ``None``) is treated
            as empty text.

    Returns:
        The processed text as a single space-separated string; ``''`` when
        the input is not a string or nothing survives the filtering.
    """
    # Missing CSV cells arrive as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Strip URLs, @mentions and #hashtags before the punctuation is removed,
    # otherwise their letters would remain as ordinary words.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'#\S+', '', text)
    # Replace everything that is not a letter with a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    stop_words, stemmer = _get_text_tools()
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [
        stemmer.stem(word) for word in tokens if word not in stop_words
    ]
    return ' '.join(stemmed_tokens)
39
+
40
+ # Function to load and preprocess DataFrame
41
def load_and_preprocess_dataframe(file_path):
    """Read a CSV file and attach a cleaned copy of its ``text`` column.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The loaded DataFrame with an additional ``processed_text`` column
        holding ``preprocess_text`` applied to every value of ``text``.
    """
    raw_frame = pd.read_csv(file_path)
    cleaned = raw_frame['text'].apply(preprocess_text)
    return raw_frame.assign(processed_text=cleaned)
45
+
46
+ # Function to create Document-Term Matrix (DTM)
47
def create_dtm(dataframe):
    """Build a document-term matrix from the ``processed_text`` column.

    Args:
        dataframe: DataFrame that contains a ``processed_text`` column.

    Returns:
        A ``(dtm, vectorizer)`` tuple: the matrix returned by
        ``CountVectorizer.fit_transform`` and the fitted vectorizer itself.
    """
    count_vectorizer = CountVectorizer()
    term_counts = count_vectorizer.fit_transform(dataframe['processed_text'])
    return term_counts, count_vectorizer
51
+
52
+ # Function to plot word frequency
53
def plot_word_frequency(dtm, vectorizer, top_n=20):
    """Render a bar chart of the most frequent terms in the Streamlit app.

    Args:
        dtm: Document-term matrix as returned by ``create_dtm``.
        vectorizer: The fitted vectorizer that produced ``dtm``; supplies the
            term for each matrix column via ``get_feature_names_out()``.
        top_n: How many of the most frequent terms to plot. Defaults to 20.

    Returns:
        None. The chart is sent to the page with ``st.pyplot``.
    """
    # Summing over axis 0 gives one total per vocabulary term.
    totals = np.ravel(dtm.sum(axis=0))
    word_freq_df = pd.DataFrame({
        'word': vectorizer.get_feature_names_out(),
        'frequency': totals,
    }).sort_values(by='frequency', ascending=False)

    # Draw on an explicit figure: calling st.pyplot() without one relies on
    # Matplotlib's global figure, which Streamlit deprecates as not
    # thread-safe.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(
            x='word', y='frequency', data=word_freq_df.head(top_n), ax=ax
        )
        ax.set_title(f'Top {top_n} Words Frequency')
        ax.set_xlabel('Words')
        ax.set_ylabel('Frequency')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # Release the figure so reruns of the script do not accumulate them.
        plt.close(fig)
70
+
71
+ # Streamlit app
72
def main():
    """Run the Streamlit tweet-analysis page.

    Lets the user upload a CSV file from the sidebar. Without the
    "Preprocess DataFrame" option the first rows are shown as-is. With it,
    the ``text`` column is cleaned into ``processed_text``, a document-term
    matrix is built from it, and the word-frequency chart is drawn.

    Problems with the uploaded data (no ``text`` column, or no words left
    after preprocessing) are reported on the page instead of raising.
    """
    st.title("Tweet Data Analysis")
    st.sidebar.header("Upload CSV file")

    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    st.sidebar.success('File uploaded successfully!')
    st.sidebar.markdown("### Preprocessing Options")
    preprocess_checkbox = st.sidebar.checkbox("Preprocess DataFrame")

    df = pd.read_csv(uploaded_file)

    if not preprocess_checkbox:
        st.subheader("Original DataFrame")
        st.write(df.head())
        return

    # Everything below needs the 'text' column; say so rather than KeyError.
    if 'text' not in df.columns:
        st.error("The uploaded CSV must contain a 'text' column.")
        return

    # Missing cells are read as NaN (a float); turn them into empty strings
    # so every value handed to preprocess_text is a str.
    text_column = df['text'].fillna('').astype(str)
    df['processed_text'] = text_column.apply(preprocess_text)
    st.subheader("Preprocessed DataFrame")
    st.write(df.head())

    st.subheader("Document-Term Matrix (DTM)")
    try:
        dtm, vectorizer = create_dtm(df)
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary is empty,
        # e.g. when every document was reduced to nothing by preprocessing.
        st.warning("No words are left after preprocessing; nothing to plot.")
        return
    st.write(dtm)

    st.subheader("Word Frequency Plot")
    plot_word_frequency(dtm, vectorizer)


if __name__ == "__main__":
    main()