import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import gradio as gr
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Download the NLTK tokenizer models used by word_tokenize
nltk.download('punkt')

# Load the dataset
df = pd.read_csv("Twitter_Data.csv")

# Handle missing values and duplicates: fill missing text with an empty string,
# drop rows with no sentiment label, and remove duplicate rows
df['clean_text'] = df['clean_text'].fillna('')
df.dropna(subset=['category'], inplace=True)
df.drop_duplicates(inplace=True)

# Tokenize each tweet into lowercase words
tokenized_text = [word_tokenize(text.lower()) for text in df['clean_text']]

# Train a Word2Vec model on the tokenized tweets
# (note: these embeddings are not wired into the Keras model below;
# the Embedding layer learns its own vectors from scratch)
w2v_model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Define input and target variables
X = df['clean_text']
y = df['category']

# Encode the target variable: LabelEncoder maps the sorted category values
# (e.g. -1, 0, 1) to integer classes (0, 1, 2), which are then one-hot encoded
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = utils.to_categorical(y)

# Tokenize the text and convert it to integer sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Vocabulary size (+1 because Keras reserves index 0 for padding)
vocab_size = len(tokenizer.word_index) + 1

# Maximum sequence length
max_seq_length = max(len(seq) for seq in sequences)

# Pad all sequences to the same length
X_pad = pad_sequences(sequences, maxlen=max_seq_length)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_seq_length))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=3, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks: reduce the learning rate when validation loss plateaus
# and stop early if it does not improve
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-5)
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.1,
    callbacks=[reduce_lr, early_stop]
)

# Save the trained model
model_path = 'sentiment_analysis_model.h5'
model.save(model_path)


# Classify the sentiment of a single piece of text
def classify_sentiment(text):
    # Preprocess the text: convert it to an integer sequence and pad it
    text_sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(text_sequence, maxlen=max_seq_length)

    # Make a prediction with the trained model
    prediction = model.predict(padded_sequence)

    # Convert the class probabilities to a class label
    predicted_label = np.argmax(prediction)

    # Map the class label to a sentiment string
    sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return sentiment_mapping[predicted_label]


# Wrapper function for the Gradio interface
def gradio_sentiment_analysis(text):
    return classify_sentiment(text)


# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_sentiment_analysis,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs="text",
    title="Sentiment Analysis",
    description="Enter a sentence to classify its sentiment as Positive, Neutral, or Negative."
)

# Launch the Gradio app
iface.launch()
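

# Optional sketch (not part of the original script): visualize the training curves
# recorded in `history` using matplotlib, which is already imported above.
# If used, call plot_training_history(history) before iface.launch(), since
# launch() blocks the script while the Gradio server is running.
def plot_training_history(history):
    plt.figure(figsize=(10, 4))

    # Accuracy over epochs for the training and validation splits
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='validation')
    plt.title('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()

    # Loss over epochs for the training and validation splits
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='validation')
    plt.title('Loss')
    plt.xlabel('Epoch')
    plt.legend()

    plt.tight_layout()
    plt.show()

# Example usage (commented out so the Gradio app stays the script's entry point):
# plot_training_history(history)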