# pages/21_NLP.py
# Fine-tune BERT for binary sentiment classification on IMDb and
# visualize the training curves in a Streamlit page.
import streamlit as st
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datasets import load_dataset
# Load the IMDb dataset from the Hugging Face Hub
dataset = load_dataset("imdb")
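# The dataset ships "train", "test", and "unsupervised" splits; labels are
# 0 (negative) and 1 (positive).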
# Re-split the 25k-example train split 80/20 rather than using the
# provided test split
train_data, test_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization and padding
max_length = 128

def tokenize_and_pad(text):
    tokens = tokenizer.encode_plus(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    # encode_plus returns tensors of shape (1, max_length); drop the batch
    # dimension so the arrays stack to (num_examples, max_length) below
    return tokens['input_ids'][0], tokens['attention_mask'][0]
# Preprocess the dataset
def preprocess_data(data):
    input_ids = []
    attention_masks = []
    labels = []
    for review, label in zip(data['text'], data['label']):
        ids, mask = tokenize_and_pad(review)
        input_ids.append(ids)
        attention_masks.append(mask)
        labels.append(label)
    return np.array(input_ids), np.array(attention_masks), np.array(labels)
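# Note: tokenizing every review one at a time is slow. For a quicker demo
# you could subsample first, e.g. train_data = train_data.sample(2000).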
X_train_ids, X_train_mask, y_train = preprocess_data(train_data)
X_test_ids, X_test_mask, y_test = preprocess_data(test_data)
# Load the pre-trained BERT model (named bert_model so the Keras wrapper
# below does not shadow it)
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Build the Keras model
input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
bert_outputs = bert_model(input_ids, attention_mask=attention_mask)
# Map BERT's two class logits to a single sigmoid probability so the model
# can train with binary cross-entropy (num_labels=1 would be the more
# conventional alternative)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(bert_outputs.logits)
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)

model.summary()
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Train the model; validation_split holds out the last 10% of the training
# arrays for per-epoch validation
history = model.fit(
    [X_train_ids, X_train_mask],
    y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=32
)
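# history.history holds the per-epoch 'loss', 'accuracy', 'val_loss', and
# 'val_accuracy' series that the plots below draw from.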
# Evaluate the model
loss, accuracy = model.evaluate([X_test_ids, X_test_mask], y_test)
st.write(f'Test Accuracy: {accuracy:.4f}')
# Plot training & validation accuracy values
st.subheader("Training and Validation Accuracy")
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Training Accuracy')
ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
st.pyplot(fig)
st.subheader("Training and Validation Loss")
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
st.pyplot(fig)