import streamlit as st
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datasets import load_dataset
# Load the IMDb dataset
dataset = load_dataset("imdb")
# Hold out 20% of the train split for evaluation
train_data, test_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)
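# Note: load_dataset("imdb") also ships a dedicated 25k-review test split
# (dataset['test']); splitting the train split, as above, just keeps the
# evaluation set smaller and faster for this demo.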
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenization and padding
max_length = 128
def tokenize_and_pad(text):
    # encode_plus pads/truncates to max_length and returns plain Python lists,
    # which stack cleanly into (num_reviews, max_length) arrays below
    # (return_tensors='tf' would add a batch dimension and break that stacking)
    tokens = tokenizer.encode_plus(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    return tokens['input_ids'], tokens['attention_mask']
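# Quick sanity check (illustrative only; the review text is made up):
# ids, mask = tokenize_and_pad("A surprisingly good film.")
# assert len(ids) == max_length and len(mask) == max_length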
# Preprocess the dataset
def preprocess_data(data):
    input_ids = []
    attention_masks = []
    labels = []
    for review, label in zip(data['text'], data['label']):
        ids, mask = tokenize_and_pad(review)
        input_ids.append(ids)
        attention_masks.append(mask)
        labels.append(label)
    return np.array(input_ids), np.array(attention_masks), np.array(labels)
X_train_ids, X_train_mask, y_train = preprocess_data(train_data)
X_test_ids, X_test_mask, y_test = preprocess_data(test_data)
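# Resulting shapes: X_*_ids and X_*_mask are (num_reviews, max_length) int
# arrays; y_* is a (num_reviews,) array of 0/1 labels.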
# Load the pre-trained BERT model
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Build the Keras model
input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
bert_outputs = bert_model(input_ids, attention_mask=attention_mask)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(bert_outputs.logits)
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)
model.summary()
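# Design note: the Dense(1, sigmoid) head treats BERT's two-way logits as
# features and squashes them into a single positive-class probability, which
# is why binary cross-entropy is used below; an alternative would be to use
# the two logits directly with a softmax and sparse categorical cross-entropy.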
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])
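# 3e-5 sits in the small fine-tuning range (2e-5 to 5e-5) recommended in the
# original BERT paper; larger rates tend to destabilize fine-tuning.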
# Train the model
history = model.fit(
    [X_train_ids, X_train_mask],
    y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=32
)
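# Note: Streamlit reruns the whole script on every interaction, so as written
# the model retrains on each page load. In a deployed Space you would
# typically train once offline and load saved weights, or wrap the expensive
# steps in a cached helper (e.g. st.cache_resource).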
# Evaluate the model
loss, accuracy = model.evaluate([X_test_ids, X_test_mask], y_test)
st.write(f'Test Accuracy: {accuracy:.4f}')
# Plot training & validation accuracy values
st.subheader("Training and Validation Accuracy")
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Training Accuracy')
ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
st.pyplot(fig)
st.subheader("Training and Validation Loss") | |
fig, ax = plt.subplots() | |
ax.plot(history.history['loss'], label='Training Loss') | |
ax.plot(history.history['val_loss'], label='Validation Loss') | |
ax.set_xlabel('Epoch') | |
ax.set_ylabel('Loss') | |
ax.legend() | |
st.pyplot(fig) | |
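# Illustrative inference sketch (commented out; the review text and the 0.5
# threshold are assumptions, not part of the original app):
# ids, mask = tokenize_and_pad("I loved this movie!")
# prob = float(model.predict([np.array([ids]), np.array([mask])])[0][0])
# st.write("Positive" if prob >= 0.5 else "Negative", prob)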