import streamlit as st
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datasets import load_dataset
# Load the IMDb dataset
dataset = load_dataset("imdb")
# Hold out 20% of the train split for evaluation
train_data, test_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)
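# Note: load_dataset("imdb") also ships a dedicated 25k-review test split
# (dataset['test']); splitting the train split, as above, just keeps the
# evaluation set smaller and faster for this demo.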
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenization and padding
max_length = 128
def tokenize_and_pad(text):
    # encode_plus pads/truncates to max_length and returns plain Python lists,
    # which stack cleanly into (num_reviews, max_length) arrays below
    # (return_tensors='tf' would add a batch dimension and break that stacking)
    tokens = tokenizer.encode_plus(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    return tokens['input_ids'], tokens['attention_mask']
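# Quick sanity check (illustrative only; the review text is made up):
# ids, mask = tokenize_and_pad("A surprisingly good film.")
# assert len(ids) == max_length and len(mask) == max_length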
# Preprocess the dataset
def preprocess_data(data):
    input_ids = []
    attention_masks = []
    labels = []
    for review, label in zip(data['text'], data['label']):
        ids, mask = tokenize_and_pad(review)
        input_ids.append(ids)
        attention_masks.append(mask)
        labels.append(label)
    return np.array(input_ids), np.array(attention_masks), np.array(labels)
X_train_ids, X_train_mask, y_train = preprocess_data(train_data)
X_test_ids, X_test_mask, y_test = preprocess_data(test_data)
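# Resulting shapes: X_*_ids and X_*_mask are (num_reviews, max_length) int
# arrays; y_* is a (num_reviews,) array of 0/1 labels.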
# Load the pre-trained BERT model
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Build the Keras model
input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
bert_outputs = bert_model(input_ids, attention_mask=attention_mask)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(bert_outputs.logits)
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)
model.summary()
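# Design note: the Dense(1, sigmoid) head treats BERT's two-way logits as
# features and squashes them into a single positive-class probability, which
# is why binary cross-entropy is used below; an alternative would be to use
# the two logits directly with a softmax and sparse categorical cross-entropy.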
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])
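# 3e-5 sits in the small fine-tuning range (2e-5 to 5e-5) recommended in the
# original BERT paper; larger rates tend to destabilize fine-tuning.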
# Train the model
history = model.fit(
    [X_train_ids, X_train_mask],
    y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=32
)
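# Note: Streamlit reruns the whole script on every interaction, so as written
# the model retrains on each page load. In a deployed Space you would
# typically train once offline and load saved weights, or wrap the expensive
# steps in a cached helper (e.g. st.cache_resource).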
# Evaluate the model
loss, accuracy = model.evaluate([X_test_ids, X_test_mask], y_test)
st.write(f'Test Accuracy: {accuracy:.4f}')
# Plot training & validation accuracy values
st.subheader("Training and Validation Accuracy")
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Training Accuracy')
ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
st.pyplot(fig)
st.subheader("Training and Validation Loss") | |
fig, ax = plt.subplots() | |
ax.plot(history.history['loss'], label='Training Loss') | |
ax.plot(history.history['val_loss'], label='Validation Loss') | |
ax.set_xlabel('Epoch') | |
ax.set_ylabel('Loss') | |
ax.legend() | |
st.pyplot(fig) | |
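# Illustrative inference sketch (commented out; the review text and the 0.5
# threshold are assumptions, not part of the original app):
# ids, mask = tokenize_and_pad("I loved this movie!")
# prob = float(model.predict([np.array([ids]), np.array([mask])])[0][0])
# st.write("Positive" if prob >= 0.5 else "Negative", prob)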