Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,54 +1,105 @@
|
|
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
3 |
import matplotlib.pyplot as plt
|
|
|
|
|
4 |
from sklearn.feature_extraction.text import CountVectorizer
|
5 |
import re
|
6 |
from nltk.corpus import stopwords
|
7 |
from nltk.stem import SnowballStemmer
|
8 |
-
import nltk
|
9 |
-
nltk.download('stopwords')
|
10 |
|
11 |
-
#
|
12 |
-
|
13 |
-
|
14 |
-
# Assuming data has already been loaded if this line is not needed
|
15 |
|
16 |
# Function to preprocess text
|
17 |
def preprocess_text(text):
|
18 |
-
|
19 |
-
text =
|
20 |
-
|
21 |
-
text = re.sub(r'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
stop_words = set(stopwords.words('english'))
|
23 |
-
|
|
|
24 |
stemmer = SnowballStemmer('english')
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
|
28 |
-
data['clean_text'] = data['text'].apply(preprocess_text)
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
33 |
|
34 |
-
#
|
35 |
-
|
36 |
-
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
|
37 |
-
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
|
38 |
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
41 |
|
42 |
-
#
|
43 |
-
|
|
|
|
|
44 |
|
45 |
-
#
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
ax.set_xticklabels(wf_filtered['word'], rotation=45, ha='right')
|
52 |
|
53 |
-
|
54 |
-
|
|
|
1 |
+
import streamlit as st
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import matplotlib.pyplot as plt
|
5 |
+
import seaborn as sns
|
6 |
+
import nltk
|
7 |
from sklearn.feature_extraction.text import CountVectorizer
|
8 |
import re
|
9 |
from nltk.corpus import stopwords
|
10 |
from nltk.stem import SnowballStemmer
|
|
|
|
|
11 |
|
12 |
+
# Download the NLTK resources this module needs: the Punkt tokenizer models
# used by nltk.word_tokenize and the English stop-word list.
# Recent NLTK releases load Punkt from the 'punkt_tab' package, while older
# releases read the 'punkt' package, so both are requested to keep
# tokenization working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
|
|
|
15 |
|
16 |
# Lazily-built NLTK helpers shared by every preprocess_text call.
_PREPROCESS_CACHE = {}


def _get_stopwords_and_stemmer():
    """Return the cached English stop-word set and Snowball stemmer.

    Both objects are created on first use and then reused, so the stop-word
    corpus is read once instead of once per document.
    """
    if not _PREPROCESS_CACHE:
        # Build both objects before storing either, so a failure cannot
        # leave the cache half-populated.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = SnowballStemmer('english')
        _PREPROCESS_CACHE.update(stop_words=stop_words, stemmer=stemmer)
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


# Function to preprocess text
def preprocess_text(text):
    """Normalize one piece of text into a space-joined string of stems.

    Steps: lowercase, strip URLs, @mentions and #hashtags, replace every
    non-alphabetic character with a space, tokenize, drop English stop
    words, and stem each remaining token.

    Args:
        text: The raw text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell, or None) are treated as
            empty text.

    Returns:
        The processed tokens joined by single spaces; '' when *text* is not
        a string.
    """
    # Missing cells arrive as float NaN / None; they have no .lower().
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove @ mentions
    text = re.sub(r'@\S+', '', text)
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Tokenize text
    tokens = nltk.word_tokenize(text)

    stop_words, stemmer = _get_stopwords_and_stemmer()
    # Remove stopwords, then stem what is left
    stemmed_tokens = [
        stemmer.stem(word) for word in tokens if word not in stop_words
    ]

    # Join tokens into a single string
    return ' '.join(stemmed_tokens)
|
39 |
+
|
40 |
+
# Read a CSV and attach the cleaned version of its text column
def load_and_preprocess_dataframe(file_path):
    """Load a CSV file and add a 'processed_text' column.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``. The file must contain a 'text' column.

    Returns:
        The loaded DataFrame with an extra 'processed_text' column holding
        ``preprocess_text`` applied to each value of 'text'.
    """
    frame = pd.read_csv(file_path)
    cleaned = frame['text'].apply(preprocess_text)
    return frame.assign(processed_text=cleaned)
|
45 |
+
|
46 |
+
# Build the bag-of-words count matrix for the cleaned text
def create_dtm(dataframe):
    """Create a Document-Term Matrix (DTM) from the 'processed_text' column.

    Args:
        dataframe: DataFrame containing a 'processed_text' column.

    Returns:
        A ``(dtm, vectorizer)`` tuple: the matrix returned by
        ``CountVectorizer.fit_transform`` and the fitted ``CountVectorizer``
        that produced it.
    """
    count_vectorizer = CountVectorizer()
    documents = dataframe['processed_text']
    term_matrix = count_vectorizer.fit_transform(documents)
    return term_matrix, count_vectorizer
|
51 |
+
|
52 |
+
# Function to plot word frequency
def plot_word_frequency(dtm, vectorizer, top_n=20):
    """Render a bar chart of the most frequent terms in the Streamlit app.

    Args:
        dtm: Document-term count matrix, as produced by
            ``vectorizer.fit_transform``.
        vectorizer: The fitted vectorizer; its ``get_feature_names_out()``
            supplies the term for each matrix column.
        top_n: How many of the most frequent terms to plot. Defaults to 20.

    Returns:
        None. The chart is written to the page with ``st.pyplot``.
    """
    # Sum word frequencies over all documents (one total per term)
    word_freq = dtm.sum(axis=0)
    words = vectorizer.get_feature_names_out()

    # Create DataFrame and keep the top_n most frequent terms
    word_freq_df = pd.DataFrame({'word': words, 'frequency': np.ravel(word_freq)})
    top_words = word_freq_df.sort_values(by='frequency', ascending=False).head(top_n)

    # Draw on an explicit figure instead of pyplot's global one: calling
    # st.pyplot() without a figure is deprecated, and the global state is
    # shared between Streamlit sessions.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(x='word', y='frequency', data=top_words, ax=ax)
        ax.set_title(f'Top {top_n} Words Frequency')
        ax.set_xlabel('Words')
        ax.set_ylabel('Frequency')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # Release the figure so reruns of the script do not accumulate
        # open matplotlib figures.
        plt.close(fig)
|
70 |
+
|
71 |
+
# Streamlit app
def main():
    """Run the Tweet Data Analysis page.

    Lets the user upload a CSV from the sidebar. With "Preprocess DataFrame"
    unchecked, the first rows of the raw data are shown. With it checked, the
    'text' column is cleaned with ``preprocess_text``, and the preprocessed
    rows, the document-term matrix and a word-frequency chart are displayed.
    Problems with the uploaded data are reported on the page with
    ``st.error`` instead of raising.
    """
    st.title("Tweet Data Analysis")
    st.sidebar.header("Upload CSV file")

    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        # Nothing to show until a file has been provided.
        return

    st.sidebar.success('File uploaded successfully!')
    st.sidebar.markdown("### Preprocessing Options")
    preprocess_checkbox = st.sidebar.checkbox("Preprocess DataFrame")

    # Load DataFrame
    df = pd.read_csv(uploaded_file)

    if not preprocess_checkbox:
        st.subheader("Original DataFrame")
        st.write(df.head())
        return

    # The preprocessing pipeline reads the 'text' column; fail with a clear
    # message rather than a KeyError traceback when it is absent.
    if 'text' not in df.columns:
        st.error("The uploaded CSV must contain a 'text' column to preprocess.")
        return

    # Preprocess DataFrame
    df['processed_text'] = df['text'].apply(preprocess_text)
    st.subheader("Preprocessed DataFrame")
    st.write(df.head())

    # Create DTM
    st.subheader("Document-Term Matrix (DTM)")
    try:
        dtm, vectorizer = create_dtm(df)
    except ValueError as exc:
        # CountVectorizer raises ValueError when no terms survive
        # preprocessing (empty vocabulary).
        st.error(f"Could not build the document-term matrix: {exc}")
        return
    st.write(dtm)

    # Plot word frequency
    st.subheader("Word Frequency Plot")
    plot_word_frequency(dtm, vectorizer)


if __name__ == "__main__":
    main()
|