eaglelandsonce committed on
Commit
08cf096
·
verified ·
1 Parent(s): 79ac5ce

Update pages/21_NLP.py

Browse files
Files changed (1) hide show
  1. pages/21_NLP.py +40 -53
pages/21_NLP.py CHANGED
@@ -1,6 +1,5 @@
1
  import streamlit as st
2
  import tensorflow as tf
3
- from transformers import BertTokenizer, TFBertForSequenceClassification
4
  from tensorflow.keras.preprocessing.sequence import pad_sequences
5
  import numpy as np
6
  import matplotlib.pyplot as plt
@@ -15,67 +14,47 @@ dataset = load_dataset("imdb")
15
  # Split dataset into training and testing
16
  train_data, test_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)
17
 
18
- # Initialize the tokenizer
19
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
20
-
21
- # Tokenization and padding
22
  max_length = 128
23
-
24
- def tokenize_and_pad(text):
25
- tokens = tokenizer.encode_plus(
26
- text,
27
- max_length=max_length,
28
- padding='max_length',
29
- truncation=True,
30
- return_tensors='tf'
31
- )
32
- return tokens['input_ids'], tokens['attention_mask']
33
-
34
- # Preprocess the dataset
35
- def preprocess_data(data):
36
- input_ids = []
37
- attention_masks = []
38
- labels = []
39
- for review, label in zip(data['text'], data['label']):
40
- ids, mask = tokenize_and_pad(review)
41
- input_ids.append(ids)
42
- attention_masks.append(mask)
43
- labels.append(label)
44
- return np.array(input_ids), np.array(attention_masks), np.array(labels)
45
-
46
- X_train_ids, X_train_mask, y_train = preprocess_data(train_data)
47
- X_test_ids, X_test_mask, y_test = preprocess_data(test_data)
48
-
49
- # Load the pre-trained BERT model
50
- model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
51
-
52
- # Build the Keras model
53
- input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
54
- attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
55
-
56
- bert_outputs = model(input_ids, attention_mask=attention_mask)
57
- outputs = tf.keras.layers.Dense(1, activation='sigmoid')(bert_outputs.logits)
58
-
59
- model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)
60
 
61
  model.summary()
62
 
63
  # Compile the model
64
- model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
65
- loss='binary_crossentropy',
66
- metrics=['accuracy'])
67
 
68
  # Train the model
69
- history = model.fit(
70
- [X_train_ids, X_train_mask],
71
- y_train,
72
- validation_split=0.1,
73
- epochs=3,
74
- batch_size=32
75
- )
76
 
77
  # Evaluate the model
78
- loss, accuracy = model.evaluate([X_test_ids, X_test_mask], y_test)
79
  st.write(f'Test Accuracy: {accuracy}')
80
 
81
  # Plot training & validation accuracy values
@@ -96,3 +75,11 @@ ax.set_xlabel('Epoch')
96
  ax.set_ylabel('Loss')
97
  ax.legend()
98
  st.pyplot(fig)
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import tensorflow as tf
 
3
  from tensorflow.keras.preprocessing.sequence import pad_sequences
4
  import numpy as np
5
  import matplotlib.pyplot as plt
 
14
  # Split dataset into training and testing
15
  train_data, test_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)
16
 
17
+ # Tokenizer parameters
18
+ vocab_size = 10000
 
 
19
  max_length = 128
20
+ embedding_dim = 128
21
+
22
+ # Tokenize the data
23
+ tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<OOV>")
24
+ tokenizer.fit_on_texts(train_data['text'].values)
25
+ word_index = tokenizer.word_index
26
+
27
+ # Convert text to sequences
28
+ X_train = tokenizer.texts_to_sequences(train_data['text'].values)
29
+ X_test = tokenizer.texts_to_sequences(test_data['text'].values)
30
+
31
+ # Pad sequences
32
+ X_train = pad_sequences(X_train, maxlen=max_length, padding='post', truncating='post')
33
+ X_test = pad_sequences(X_test, maxlen=max_length, padding='post', truncating='post')
34
+
35
+ # Labels
36
+ y_train = train_data['label'].values
37
+ y_test = test_data['label'].values
38
+
39
+ # Build the LSTM model
40
+ model = tf.keras.Sequential([
41
+ tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
42
+ tf.keras.layers.LSTM(64, return_sequences=True),
43
+ tf.keras.layers.LSTM(32),
44
+ tf.keras.layers.Dense(24, activation='relu'),
45
+ tf.keras.layers.Dense(1, activation='sigmoid')
46
+ ])
 
 
 
 
 
 
 
 
 
 
47
 
48
  model.summary()
49
 
50
  # Compile the model
51
+ model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
 
 
52
 
53
  # Train the model
54
+ history = model.fit(X_train, y_train, epochs=3, validation_split=0.1, batch_size=32)
 
 
 
 
 
 
55
 
56
  # Evaluate the model
57
+ loss, accuracy = model.evaluate(X_test, y_test)
58
  st.write(f'Test Accuracy: {accuracy}')
59
 
60
  # Plot training & validation accuracy values
 
75
  ax.set_ylabel('Loss')
76
  ax.legend()
77
  st.pyplot(fig)
78
+
79
+ # Convert the model to TensorFlow.js format
80
+ import tensorflowjs as tfjs
81
+
82
+ tfjs_target_dir = 'tfjs_model'
83
+ model.save('model.h5')
84
+ tfjs.converters.save_keras_model(model, tfjs_target_dir)
85
+ st.write("Model saved and converted to TensorFlow.js format.")