import numpy as np
import matplotlib.pyplot as plt
import streamlit as st
import tensorflow as tf
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification

# Load the IMDb dataset
dataset = load_dataset("imdb")

# Split the training portion into training and testing sets
train_data, test_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization and padding
max_length = 128

def tokenize_and_pad(text):
    tokens = tokenizer.encode_plus(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='np'
    )
    # Index [0] drops the batch dimension so the stacked arrays end up (N, max_length)
    return tokens['input_ids'][0], tokens['attention_mask'][0]

# Preprocess the dataset
def preprocess_data(data):
    input_ids = []
    attention_masks = []
    labels = []
    for review, label in zip(data['text'], data['label']):
        ids, mask = tokenize_and_pad(review)
        input_ids.append(ids)
        attention_masks.append(mask)
        labels.append(label)
    return np.array(input_ids), np.array(attention_masks), np.array(labels)

X_train_ids, X_train_mask, y_train = preprocess_data(train_data)
X_test_ids, X_test_mask, y_test = preprocess_data(test_data)

# Load the pre-trained BERT model
bert = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Build the Keras model
input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
bert_outputs = bert(input_ids, attention_mask=attention_mask)
# Collapse the two-class logits to a single sigmoid probability for binary cross-entropy
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(bert_outputs.logits)
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)
model.summary()

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(
    [X_train_ids, X_train_mask], y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=32
)

# Evaluate the model
loss, accuracy = model.evaluate([X_test_ids, X_test_mask], y_test)
st.write(f'Test Accuracy: {accuracy}')

# Plot training & validation accuracy values
st.subheader("Training and Validation Accuracy")
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Training Accuracy')
ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
st.pyplot(fig)

# Plot training & validation loss values
st.subheader("Training and Validation Loss")
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
st.pyplot(fig)
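
# --- Optional: interactive single-review inference ---
# A minimal sketch, assuming the fine-tuned `model` and `tokenize_and_pad` defined
# above; the widget label, "Try the model" heading, and the 0.5 decision threshold
# are illustrative choices, not part of the original script.
st.subheader("Try the model")
user_review = st.text_area("Enter a movie review:")
if user_review:
    ids, mask = tokenize_and_pad(user_review)
    # Add a batch dimension of 1 for the single review
    prob = model.predict([ids[np.newaxis, :], mask[np.newaxis, :]])[0][0]
    label = "Positive" if prob >= 0.5 else "Negative"
    st.write(f"Prediction: {label} (probability of positive: {prob:.2f})")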