import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import gradio as gr
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Download the NLTK tokenizer models used by word_tokenize
nltk.download('punkt')

# Load the dataset
df = pd.read_csv("Twitter_Data.csv")

# Handle missing values and duplicates: fill missing text with an empty string,
# drop rows with no sentiment label, and remove duplicate rows
df['clean_text'] = df['clean_text'].fillna('')
df.dropna(subset=['category'], inplace=True)
df.drop_duplicates(inplace=True)

# Tokenize each tweet into lowercase words
tokenized_text = [word_tokenize(text.lower()) for text in df['clean_text']]

# Train a Word2Vec model on the tokenized tweets
# (note: these embeddings are not wired into the Keras model below;
# the Embedding layer learns its own vectors from scratch)
w2v_model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Define input and target variables
X = df['clean_text']
y = df['category']

# Encode the target variable: LabelEncoder maps the sorted category values
# (e.g. -1, 0, 1) to integer classes (0, 1, 2), which are then one-hot encoded
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = utils.to_categorical(y)

# Tokenize the text and convert it to integer sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Vocabulary size (+1 because Keras reserves index 0 for padding)
vocab_size = len(tokenizer.word_index) + 1

# Maximum sequence length
max_seq_length = max(len(seq) for seq in sequences)

# Pad all sequences to the same length
X_pad = pad_sequences(sequences, maxlen=max_seq_length)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_seq_length))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=3, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define callbacks: reduce the learning rate when validation loss plateaus
# and stop early if it does not improve
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-5)
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train, y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.1,
    callbacks=[reduce_lr, early_stop]
)

# Save the trained model
model_path = 'sentiment_analysis_model.h5'
model.save(model_path)


# Classify the sentiment of a single piece of text
def classify_sentiment(text):
    # Preprocess the text: convert it to an integer sequence and pad it
    text_sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(text_sequence, maxlen=max_seq_length)

    # Make a prediction with the trained model
    prediction = model.predict(padded_sequence)

    # Convert the class probabilities to a class label
    predicted_label = np.argmax(prediction)

    # Map the class label to a sentiment string
    sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return sentiment_mapping[predicted_label]


# Wrapper function for the Gradio interface
def gradio_sentiment_analysis(text):
    return classify_sentiment(text)


# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_sentiment_analysis,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs="text",
    title="Sentiment Analysis",
    description="Enter a sentence to classify its sentiment as Positive, Neutral, or Negative."
)

# Launch the Gradio app
iface.launch()
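

# Optional sketch (not part of the original script): visualize the training curves
# recorded in `history` using matplotlib, which is already imported above.
# If used, call plot_training_history(history) before iface.launch(), since
# launch() blocks the script while the Gradio server is running.
def plot_training_history(history):
    plt.figure(figsize=(10, 4))

    # Accuracy over epochs for the training and validation splits
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='validation')
    plt.title('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()

    # Loss over epochs for the training and validation splits
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='validation')
    plt.title('Loss')
    plt.xlabel('Epoch')
    plt.legend()

    plt.tight_layout()
    plt.show()

# Example usage (commented out so the Gradio app stays the script's entry point):
# plot_training_history(history)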