import numpy as np
import matplotlib.pyplot as plt
import streamlit as st
import tensorflow as tf
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification

# Load the IMDb dataset
dataset = load_dataset("imdb")

# Split the training portion into training and testing sets
train_data, test_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization and padding
max_length = 128

def tokenize_and_pad(text):
    tokens = tokenizer.encode_plus(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='np'
    )
    # Index [0] drops the batch dimension so the stacked arrays end up (N, max_length)
    return tokens['input_ids'][0], tokens['attention_mask'][0]

# Preprocess the dataset
def preprocess_data(data):
    input_ids = []
    attention_masks = []
    labels = []
    for review, label in zip(data['text'], data['label']):
        ids, mask = tokenize_and_pad(review)
        input_ids.append(ids)
        attention_masks.append(mask)
        labels.append(label)
    return np.array(input_ids), np.array(attention_masks), np.array(labels)

X_train_ids, X_train_mask, y_train = preprocess_data(train_data)
X_test_ids, X_test_mask, y_test = preprocess_data(test_data)

# Load the pre-trained BERT model
bert = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Build the Keras model
input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
bert_outputs = bert(input_ids, attention_mask=attention_mask)
# Collapse the two-class logits to a single sigmoid probability for binary cross-entropy
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(bert_outputs.logits)
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)
model.summary()

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(
    [X_train_ids, X_train_mask], y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=32
)

# Evaluate the model
loss, accuracy = model.evaluate([X_test_ids, X_test_mask], y_test)
st.write(f'Test Accuracy: {accuracy}')

# Plot training & validation accuracy values
st.subheader("Training and Validation Accuracy")
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Training Accuracy')
ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
st.pyplot(fig)

# Plot training & validation loss values
st.subheader("Training and Validation Loss")
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
st.pyplot(fig)
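
# --- Optional: interactive single-review inference ---
# A minimal sketch, assuming the fine-tuned `model` and `tokenize_and_pad` defined
# above; the widget label, "Try the model" heading, and the 0.5 decision threshold
# are illustrative choices, not part of the original script.
st.subheader("Try the model")
user_review = st.text_area("Enter a movie review:")
if user_review:
    ids, mask = tokenize_and_pad(user_review)
    # Add a batch dimension of 1 for the single review
    prob = model.predict([ids[np.newaxis, :], mask[np.newaxis, :]])[0][0]
    label = "Positive" if prob >= 0.5 else "Negative"
    st.write(f"Prediction: {label} (probability of positive: {prob:.2f})")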