import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Download required NLTK resources (quiet=True avoids re-printing on every Streamlit rerun)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # needed by word_tokenize in newer NLTK releases
nltk.download('stopwords', quiet=True)

# Function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove @ mentions
    text = re.sub(r'@\S+', '', text)
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    stemmer = SnowballStemmer('english')
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    # Join tokens into a single string
    processed_text = ' '.join(stemmed_tokens)
    return processed_text
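
# Illustrative example (exact output depends on the installed NLTK stopword list
# and Snowball stemmer version):
#   preprocess_text("Loving the new #NLP course at https://example.com @prof!")
#   -> "love new cours"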

# Function to load and preprocess DataFrame
def load_and_preprocess_dataframe(file_path):
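    # Assumes the CSV has a 'text' column holding the raw tweet text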
    dataframe = pd.read_csv(file_path)
    dataframe['processed_text'] = dataframe['text'].apply(preprocess_text)
    return dataframe

# Function to create Document-Term Matrix (DTM)
def create_dtm(dataframe):
    vectorizer = CountVectorizer()
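    # fit_transform returns a sparse (documents x vocabulary) matrix of token counts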
    dtm = vectorizer.fit_transform(dataframe['processed_text'])
    return dtm, vectorizer

# Function to plot word frequency
def plot_word_frequency(dtm, vectorizer):
    # Sum counts over all documents (yields a 1 x vocabulary matrix)
    word_freq = dtm.sum(axis=0)
    words = vectorizer.get_feature_names_out()
    # Flatten the matrix row and pair each term with its total count
    word_freq_df = pd.DataFrame({'word': words, 'frequency': np.ravel(word_freq)})
    # Sort by frequency
    word_freq_df = word_freq_df.sort_values(by='frequency', ascending=False)
    # Plot on an explicit figure; calling st.pyplot() with no argument is deprecated
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='word', y='frequency', data=word_freq_df.head(20), ax=ax)
    ax.set_title('Top 20 Words by Frequency')
    ax.set_xlabel('Words')
    ax.set_ylabel('Frequency')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    st.pyplot(fig)

# Streamlit app
def main():
    st.title("Tweet Data Analysis")
    st.sidebar.header("Upload CSV file")

    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")

    if uploaded_file is not None:
        st.sidebar.success('File uploaded successfully!')
        st.sidebar.markdown("### Preprocessing Options")
        preprocess_checkbox = st.sidebar.checkbox("Preprocess DataFrame")

        # Load DataFrame (st.file_uploader returns a file-like object that pd.read_csv accepts)
        df = pd.read_csv(uploaded_file)

        if preprocess_checkbox:
            # Guard against CSVs that lack the expected 'text' column
            if 'text' not in df.columns:
                st.error("The uploaded CSV must contain a 'text' column.")
                return
            # Preprocess DataFrame
            df['processed_text'] = df['text'].apply(preprocess_text)
            st.subheader("Preprocessed DataFrame")
            st.write(df.head())

            # Create DTM
            st.subheader("Document-Term Matrix (DTM)")
            dtm, vectorizer = create_dtm(df)
            # A sparse matrix has no readable default rendering, so report its shape
            st.write(f"DTM shape: {dtm.shape[0]} documents x {dtm.shape[1]} terms")

            # Plot word frequency
            st.subheader("Word Frequency Plot")
            plot_word_frequency(dtm, vectorizer)
        else:
            st.subheader("Original DataFrame")
            st.write(df.head())

if __name__ == "__main__":
    main()
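
# Usage sketch (the filename app.py is illustrative):
#   streamlit run app.py
# In the sidebar, upload a CSV with a 'text' column (one tweet per row), then tick
# "Preprocess DataFrame" to view the cleaned text, the DTM, and the frequency plot.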