Spaces:
Sleeping
Sleeping
eaglelandsonce
committed on
Update pages/21_NLP.py
Browse files - pages/21_NLP.py +40 -53
pages/21_NLP.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import streamlit as st
|
2 |
import tensorflow as tf
|
3 |
-
from transformers import BertTokenizer, TFBertForSequenceClassification
|
4 |
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
5 |
import numpy as np
|
6 |
import matplotlib.pyplot as plt
|
@@ -15,67 +14,47 @@ dataset = load_dataset("imdb")
|
|
15 |
# Split dataset into training and testing
|
16 |
train_data, test_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)
|
17 |
|
18 |
-
#
|
19 |
-
|
20 |
-
|
21 |
-
# Tokenization and padding
|
22 |
max_length = 128
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
#
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
|
51 |
-
|
52 |
-
# Build the Keras model
|
53 |
-
input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
|
54 |
-
attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
|
55 |
-
|
56 |
-
bert_outputs = model(input_ids, attention_mask=attention_mask)
|
57 |
-
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(bert_outputs.logits)
|
58 |
-
|
59 |
-
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)
|
60 |
|
61 |
model.summary()
|
62 |
|
63 |
# Compile the model
|
64 |
-
model.compile(optimizer=
|
65 |
-
loss='binary_crossentropy',
|
66 |
-
metrics=['accuracy'])
|
67 |
|
68 |
# Train the model
|
69 |
-
history = model.fit(
|
70 |
-
[X_train_ids, X_train_mask],
|
71 |
-
y_train,
|
72 |
-
validation_split=0.1,
|
73 |
-
epochs=3,
|
74 |
-
batch_size=32
|
75 |
-
)
|
76 |
|
77 |
# Evaluate the model
|
78 |
-
loss, accuracy = model.evaluate(
|
79 |
st.write(f'Test Accuracy: {accuracy}')
|
80 |
|
81 |
# Plot training & validation accuracy values
|
@@ -96,3 +75,11 @@ ax.set_xlabel('Epoch')
|
|
96 |
ax.set_ylabel('Loss')
|
97 |
ax.legend()
|
98 |
st.pyplot(fig)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
import tensorflow as tf
|
|
|
3 |
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
4 |
import numpy as np
|
5 |
import matplotlib.pyplot as plt
|
|
|
14 |
# Split dataset into training and testing
|
15 |
train_data, test_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)
|
16 |
|
17 |
+
# Tokenizer parameters
|
18 |
+
vocab_size = 10000
|
|
|
|
|
19 |
max_length = 128
|
20 |
+
embedding_dim = 128
|
21 |
+
|
22 |
+
# Tokenize the data
|
23 |
+
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<OOV>")
|
24 |
+
tokenizer.fit_on_texts(train_data['text'].values)
|
25 |
+
word_index = tokenizer.word_index
|
26 |
+
|
27 |
+
# Convert text to sequences
|
28 |
+
X_train = tokenizer.texts_to_sequences(train_data['text'].values)
|
29 |
+
X_test = tokenizer.texts_to_sequences(test_data['text'].values)
|
30 |
+
|
31 |
+
# Pad sequences
|
32 |
+
X_train = pad_sequences(X_train, maxlen=max_length, padding='post', truncating='post')
|
33 |
+
X_test = pad_sequences(X_test, maxlen=max_length, padding='post', truncating='post')
|
34 |
+
|
35 |
+
# Labels
|
36 |
+
y_train = train_data['label'].values
|
37 |
+
y_test = test_data['label'].values
|
38 |
+
|
39 |
+
# Build the LSTM model
|
40 |
+
model = tf.keras.Sequential([
|
41 |
+
tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
|
42 |
+
tf.keras.layers.LSTM(64, return_sequences=True),
|
43 |
+
tf.keras.layers.LSTM(32),
|
44 |
+
tf.keras.layers.Dense(24, activation='relu'),
|
45 |
+
tf.keras.layers.Dense(1, activation='sigmoid')
|
46 |
+
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
model.summary()
|
49 |
|
50 |
# Compile the model
|
51 |
+
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
|
|
|
|
|
52 |
|
53 |
# Train the model
|
54 |
+
history = model.fit(X_train, y_train, epochs=3, validation_split=0.1, batch_size=32)
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
# Evaluate the model
|
57 |
+
loss, accuracy = model.evaluate(X_test, y_test)
|
58 |
st.write(f'Test Accuracy: {accuracy}')
|
59 |
|
60 |
# Plot training & validation accuracy values
|
|
|
75 |
ax.set_ylabel('Loss')
|
76 |
ax.legend()
|
77 |
st.pyplot(fig)
|
78 |
+
|
79 |
+
# Convert the model to TensorFlow.js format
|
80 |
+
import tensorflowjs as tfjs
|
81 |
+
|
82 |
+
tfjs_target_dir = 'tfjs_model'
|
83 |
+
model.save('model.h5')
|
84 |
+
tfjs.converters.save_keras_model(model, tfjs_target_dir)
|
85 |
+
st.write("Model saved and converted to TensorFlow.js format.")
|