Ahtisham1583 committed on
Commit
ec3726f
·
verified ·
1 Parent(s): a9b1505

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -85
app.py CHANGED
@@ -1,105 +1,54 @@
1
- import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
- import seaborn as sns
6
- import nltk
7
  from sklearn.feature_extraction.text import CountVectorizer
8
  import re
9
  from nltk.corpus import stopwords
10
  from nltk.stem import SnowballStemmer
11
-
12
- # Download NLTK punkt tokenizer and stopwords
13
- nltk.download('punkt')
14
  nltk.download('stopwords')
15
 
 
 
 
 
 
16
  # Function to preprocess text
17
  def preprocess_text(text):
18
- # Convert text to lowercase
19
- text = text.lower()
20
- # Remove URLs
21
- text = re.sub(r'http\S+', '', text)
22
- # Remove @ mentions
23
- text = re.sub(r'@\S+', '', text)
24
- # Remove hashtags
25
- text = re.sub(r'#\S+', '', text)
26
- # Remove non-alphabetic characters
27
- text = re.sub(r'[^a-zA-Z]', ' ', text)
28
- # Tokenize text
29
- tokens = nltk.word_tokenize(text)
30
- # Remove stopwords
31
  stop_words = set(stopwords.words('english'))
32
- filtered_tokens = [word for word in tokens if word not in stop_words]
33
- # Stemming
34
  stemmer = SnowballStemmer('english')
35
- stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
36
- # Join tokens into a single string
37
- processed_text = ' '.join(stemmed_tokens)
38
- return processed_text
39
-
40
- # Function to load and preprocess DataFrame
41
- def load_and_preprocess_dataframe(file_path):
42
- dataframe = pd.read_csv(file_path)
43
- dataframe['processed_text'] = dataframe['text'].apply(preprocess_text)
44
- return dataframe
45
-
46
- # Function to create Document-Term Matrix (DTM)
47
- def create_dtm(dataframe):
48
- vectorizer = CountVectorizer()
49
- dtm = vectorizer.fit_transform(dataframe['processed_text'])
50
- return dtm, vectorizer
51
-
52
- # Function to plot word frequency
53
- def plot_word_frequency(dtm, vectorizer):
54
- # Sum word frequencies
55
- word_freq = dtm.sum(axis=0)
56
- words = vectorizer.get_feature_names_out()
57
- # Create DataFrame
58
- word_freq_df = pd.DataFrame({'word': words, 'frequency': np.ravel(word_freq)})
59
- # Sort by frequency
60
- word_freq_df = word_freq_df.sort_values(by='frequency', ascending=False)
61
- # Plot
62
- plt.figure(figsize=(10, 6))
63
- sns.barplot(x='word', y='frequency', data=word_freq_df.head(20))
64
- plt.title('Top 20 Words Frequency')
65
- plt.xlabel('Words')
66
- plt.ylabel('Frequency')
67
- plt.xticks(rotation=45, ha='right')
68
- plt.tight_layout()
69
- st.pyplot()
70
-
71
- # Streamlit app
72
- def main():
73
- st.title("Tweet Data Analysis")
74
- st.sidebar.header("Upload CSV file")
75
 
76
- uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
 
77
 
78
- if uploaded_file is not None:
79
- st.sidebar.success('File uploaded successfully!')
80
- st.sidebar.markdown("### Preprocessing Options")
81
- preprocess_checkbox = st.sidebar.checkbox("Preprocess DataFrame")
82
 
83
- # Load DataFrame
84
- df = pd.read_csv(uploaded_file)
 
 
85
 
86
- if preprocess_checkbox:
87
- # Preprocess DataFrame
88
- df['processed_text'] = df['text'].apply(preprocess_text)
89
- st.subheader("Preprocessed DataFrame")
90
- st.write(df.head())
91
 
92
- # Create DTM
93
- st.subheader("Document-Term Matrix (DTM)")
94
- dtm, vectorizer = create_dtm(df)
95
- st.write(dtm)
96
 
97
- # Plot word frequency
98
- st.subheader("Word Frequency Plot")
99
- plot_word_frequency(dtm, vectorizer)
100
- else:
101
- st.subheader("Original DataFrame")
102
- st.write(df.head())
 
103
 
104
- if __name__ == "__main__":
105
- main()
 
 
1
  import pandas as pd
2
  import numpy as np
3
  import matplotlib.pyplot as plt
 
 
4
  from sklearn.feature_extraction.text import CountVectorizer
5
  import re
6
  from nltk.corpus import stopwords
7
  from nltk.stem import SnowballStemmer
8
+ import nltk
 
 
9
  nltk.download('stopwords')
10
 
11
+ # Load data
12
+ # data = pd.read_csv('path_to_your_data.csv')
13
+
14
+ # Assumes `data` (a DataFrame with a 'text' column) is already loaded; otherwise uncomment the read_csv line above.
15
+
# Function to preprocess text

# Built once at import time instead of on every call: preprocess_text is
# applied to each row of a DataFrame column, and neither the stop-word list
# nor the stemmer changes between calls. This relies on the
# nltk.download('stopwords') call that runs earlier in the file.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = SnowballStemmer('english')

# Patterns are applied in this order: mentions, then URLs, then everything
# that is not a lowercase ASCII letter.
_MENTION_RE = re.compile(r'@\w+')
_URL_RE = re.compile(r'http\S+')
_NON_ALPHA_RE = re.compile(r'[^a-z]')


def preprocess_text(text):
    """Normalize one piece of raw text into a space-separated string of stems.

    Steps: lowercase, replace @mentions and ``http...`` URLs with a space,
    replace every character outside ``a-z`` with a space, drop English stop
    words, and stem the remaining words with the Snowball stemmer.

    Args:
        text: The raw text. Non-string values (for example the NaN float that
            pandas produces for an empty CSV cell) are treated as empty text
            instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        The cleaned text as a single string; ``''`` when nothing survives
        the filtering or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _MENTION_RE.sub(' ', text)
    text = _URL_RE.sub(' ', text)
    text = _NON_ALPHA_RE.sub(' ', text)

    return ' '.join(
        _STEMMER.stem(word)
        for word in text.split()
        if word not in _STOP_WORDS
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
# Minimum total count a term needs in order to appear in the chart.
MIN_FREQ = 50

# Apply preprocessing.
# NOTE(review): `data` is not defined anywhere in the visible code (the
# read_csv line near the top is commented out). It has to be a DataFrame with
# a 'text' column by the time this statement runs -- confirm how it is loaded.
data['clean_text'] = data['text'].apply(preprocess_text)

# Create the Document-Term Matrix
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(data['clean_text'])

# Sum frequencies of each term across documents; sum_words is a 1 x n_terms
# matrix, indexed below by each term's column index from the vocabulary.
sum_words = dtm.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

# Convert to DataFrame
wf = pd.DataFrame(words_freq, columns=['word', 'freq'])

# Keep only words whose total frequency is greater than MIN_FREQ
wf_filtered = wf[wf['freq'] > MIN_FREQ]

# Create figure and axes for the plot
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(wf_filtered['word'], wf_filtered['freq'], color='skyblue')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.set_title('Frequency of Terms in Text Data')

# Restyle the tick labels the bar chart already created rather than calling
# set_xticklabels(), which Matplotlib discourages unless the tick positions
# have been fixed first.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Leave room for the rotated labels so they are not clipped at the bottom.
fig.tight_layout()

# Show the plot
plt.show()