import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Download the NLTK resources used by preprocess_text (tokenizer + stopwords).
# Newer NLTK releases load the Punkt tokenizer from 'punkt_tab' instead of
# 'punkt', so both are requested; with its default arguments nltk.download()
# reports an unavailable package and returns False rather than raising.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)

# Function to preprocess text
# The English stopword set and the stemmer are built once and reused, because
# preprocess_text is applied to every row of a DataFrame.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return ``(stop_words, stemmer)``, creating both on first use."""
    if not _TEXT_TOOLS:
        # Build both before storing so a failure never leaves a half-filled cache
        stop_words = frozenset(stopwords.words('english'))
        stemmer = SnowballStemmer('english')
        _TEXT_TOOLS.update(stop_words=stop_words, stemmer=stemmer)
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def preprocess_text(text):
    """Clean, tokenize, stopword-filter and stem a single piece of text.

    Steps: lowercase; strip URLs (``http...``), ``@mentions`` and
    ``#hashtags``; replace every non-letter with a space; tokenize with
    NLTK; drop English stopwords; stem with the English Snowball stemmer.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example the
            NaN that pandas uses for a missing cell, or ``None``) is treated
            as having no text.

    Returns:
        The stemmed tokens joined by single spaces, or ``''`` when the input
        is not a string or nothing is left after cleaning.
    """
    # Missing / non-text cells would otherwise fail on .lower()
    if not isinstance(text, str):
        return ''
    # Convert text to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove @ mentions
    text = re.sub(r'@\S+', '', text)
    # Remove hashtags
    text = re.sub(r'#\S+', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Nothing but whitespace left: there are no tokens to produce
    if not text.strip():
        return ''
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords, stem, and join tokens into a single string
    stop_words, stemmer = _get_text_tools()
    return ' '.join(
        stemmer.stem(word) for word in tokens if word not in stop_words
    )

# Read a CSV file and attach a preprocessed copy of its text column
def load_and_preprocess_dataframe(file_path):
    """Load ``file_path`` with pandas and add a ``processed_text`` column.

    The new column holds the result of ``preprocess_text`` applied to each
    value of the ``text`` column. Returns the resulting DataFrame.
    """
    frame = pd.read_csv(file_path)
    raw_text = frame['text']
    frame['processed_text'] = raw_text.apply(preprocess_text)
    return frame

# Build the Document-Term Matrix (DTM) from the preprocessed text
def create_dtm(dataframe):
    """Fit a ``CountVectorizer`` on ``dataframe['processed_text']``.

    Returns a ``(dtm, vectorizer)`` tuple: the matrix produced by
    ``fit_transform`` and the fitted vectorizer itself.
    """
    count_vectorizer = CountVectorizer()
    documents = dataframe['processed_text']
    term_matrix = count_vectorizer.fit_transform(documents)
    return term_matrix, count_vectorizer

# Function to plot word frequency
def plot_word_frequency(dtm, vectorizer):
    """Render a bar chart of the 20 most frequent words in the Streamlit app.

    Args:
        dtm: Document-term matrix whose columns correspond to the
            vectorizer's feature names (as returned by ``create_dtm``).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.

    Returns:
        None. The chart is drawn on its own figure, handed to ``st.pyplot``,
        and the figure is closed afterwards.
    """
    # Sum word frequencies: column sums give each word's total count
    word_freq = np.ravel(dtm.sum(axis=0))
    words = vectorizer.get_feature_names_out()
    # Create DataFrame and keep the 20 most frequent words
    word_freq_df = pd.DataFrame({'word': words, 'frequency': word_freq})
    top_words = word_freq_df.sort_values(by='frequency', ascending=False).head(20)

    # Plot on an explicit figure instead of pyplot's global one, so the figure
    # passed to Streamlit is unambiguous and can be released afterwards
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(x='word', y='frequency', data=top_words, ax=ax)
        ax.set_title('Top 20 Words Frequency')
        ax.set_xlabel('Words')
        ax.set_ylabel('Frequency')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # Figures created through pyplot stay registered until closed
        plt.close(fig)

# Streamlit app
def main():
    """Run the Streamlit page: upload a CSV, optionally preprocess and analyze it.

    Without the "Preprocess DataFrame" option the first rows of the uploaded
    file are shown. With it, the ``text`` column is preprocessed, a
    document-term matrix is built, and the most frequent words are plotted.
    Problems with the upload (unreadable CSV, missing ``text`` column, no
    words left after preprocessing) are reported in the UI instead of raising.
    """
    st.title("Tweet Data Analysis")
    st.sidebar.header("Upload CSV file")

    uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    # Load DataFrame
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.sidebar.error(f"Could not read the uploaded CSV file: {exc}")
        return

    st.sidebar.success('File uploaded successfully!')
    st.sidebar.markdown("### Preprocessing Options")
    preprocess_checkbox = st.sidebar.checkbox("Preprocess DataFrame")

    if not preprocess_checkbox:
        st.subheader("Original DataFrame")
        st.write(df.head())
        return

    # Preprocessing reads the 'text' column, so it has to exist
    if 'text' not in df.columns:
        st.error("The uploaded CSV must contain a 'text' column to preprocess.")
        return

    # Preprocess DataFrame
    df['processed_text'] = df['text'].apply(preprocess_text)
    st.subheader("Preprocessed DataFrame")
    st.write(df.head())

    # Create DTM
    st.subheader("Document-Term Matrix (DTM)")
    try:
        dtm, vectorizer = create_dtm(df)
    except ValueError as exc:
        # CountVectorizer raises ValueError when no vocabulary can be built,
        # e.g. every document is empty after preprocessing
        st.warning(f"Could not build the document-term matrix: {exc}")
        return
    st.write(dtm)

    # Plot word frequency
    st.subheader("Word Frequency Plot")
    plot_word_frequency(dtm, vectorizer)

# Start the app only when this file is run as a script, not when it is imported
if __name__ == "__main__":
    main()