Upload stress_categorization_using_bert_transformer.py
stress_categorization_using_bert_transformer.py
ADDED
@@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
"""Stress Categorization Using BERT Transformer.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1JZTLCAUBN6XkcQpAWukUsx7dJ5VyC_KR
"""

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
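
# NOTE: numpy and the Keras text/sequence/model imports above (Tokenizer,
# pad_sequences, Sequential, the layer classes, to_categorical) are not used by
# the BERT workflow below; only train_test_split and LabelEncoder are referenced.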

!pip install transformers

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf

# 1. Load and inspect the data
data = pd.read_excel('stress_data.xlsx')

# 2. Clean and preprocess the data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+|\W+', ' ', text)
    return text

data['Cleaned_Posts'] = data['Posts'].apply(clean_text)
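
# Illustrative only -- a hypothetical post showing what clean_text does:
#   clean_text("Lost my job!!! See https://example.com, owe 5000 now")
#   -> roughly "lost my job see owe now"
# (lowercased; URLs, digits, and punctuation runs replaced by spaces)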

# Convert string labels to integer indices
label_encoder = LabelEncoder()

data['LabelIndices'] = label_encoder.fit_transform(data['Labels'])

# 3. Tokenize data using BERT's tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Split the data into train and test
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Convert data to InputExample format
def convert_data_to_input_example(data):
    return data.apply(lambda x: InputExample(guid=None, text_a=x['Cleaned_Posts'], text_b=None, label=x['LabelIndices']), axis=1)

train_InputExamples = convert_data_to_input_example(train)
test_InputExamples = convert_data_to_input_example(test)

# Convert to features for BERT input
def convert_input_example_to_feature(example):
    # padding='max_length' and truncation=True replace the deprecated pad_to_max_length=True
    return tokenizer.encode_plus(example.text_a, add_special_tokens=True, max_length=128, padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=False)

train_features = train_InputExamples.apply(convert_input_example_to_feature)
test_features = test_InputExamples.apply(convert_input_example_to_feature)

# Convert features to tensorflow dataset
def convert_features_to_tf_dataset(features, labels):
    def gen():
        for f, l in zip(features, labels):
            yield ({'input_ids': f['input_ids'], 'attention_mask': f['attention_mask']}, l)
    return tf.data.Dataset.from_generator(gen, ({'input_ids': tf.int32, 'attention_mask': tf.int32}, tf.int64), ({'input_ids': tf.TensorShape([None]), 'attention_mask': tf.TensorShape([None])}, tf.TensorShape([])))
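
# Each generated element is a (features, label) pair: a dict holding 'input_ids'
# and 'attention_mask' (each a length-128 integer sequence after padding) plus a
# scalar integer label, which is what TFBertForSequenceClassification's fit() expects.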

def decode_predictions(predictions):
    # Extract predicted indices (assuming predictions is a list of dicts with 'label' keys)
    predicted_indices = [int(pred['label'].split('_')[-1]) for pred in predictions]
    # Decode the indices to original labels
    decoded_labels = label_encoder.inverse_transform(predicted_indices)
    return decoded_labels

train_dataset = convert_features_to_tf_dataset(train_features, train['LabelIndices']).shuffle(100).batch(32).repeat(2)
test_dataset = convert_features_to_tf_dataset(test_features, test['LabelIndices']).batch(32)
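
# shuffle(100) uses a 100-element shuffle buffer, batch(32) sets the batch size,
# and repeat(2) means fit() below sees two passes over the training data even
# though it is called with epochs=1.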

# 4. Fine-tune BERT on the dataset
model_new = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(data['Labels'].unique()))
model_new.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
model_new.fit(train_dataset, epochs=1, validation_data=test_dataset)

# The raw model cannot predict on a bare string; encode the text first, then map
# the argmax of the logits back through the label encoder.
sample = tokenizer("I am financially broken", return_tensors="tf", padding='max_length', truncation=True, max_length=128)
sample_logits = model_new(dict(sample)).logits
print(label_encoder.inverse_transform([int(tf.argmax(sample_logits, axis=1)[0])]))

# 5. Evaluate the model
loss, accuracy = model_new.evaluate(test_dataset)
print(f"Test accuracy: {accuracy}")
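
# A minimal sketch of persisting the fine-tuned weights so they can be reloaded;
# the hosted "NeuEraAI/Stress_Classifier_BERT" checkpoint used below was presumably
# produced by a step like this (the local path here is hypothetical):
# model_new.save_pretrained("stress_classifier_bert")
# tokenizer.save_pretrained("stress_classifier_bert")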

"""# New Section"""

!pip install transformers

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf

# 1. Load and inspect the data
data = pd.read_excel('stress_data.xlsx')

# 2. Clean and preprocess the data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+|\W+', ' ', text)
    return text

data['Cleaned_Posts'] = data['Posts'].apply(clean_text)

# Convert string labels to integer indices
label_encoder = LabelEncoder()

data['LabelIndices'] = label_encoder.fit_transform(data['Labels'])

# 3. Tokenize data using BERT's tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Bare expressions below are Colab cell outputs: they display the tokenizer and
# the dataframe when run interactively.
tokenizer

data

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf

# 1. Load and inspect the data
data = pd.read_excel('stress_data.xlsx')

# 2. Clean and preprocess the data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+|\W+', ' ', text)
    return text

data['Cleaned_Posts'] = data['Posts'].apply(clean_text)

# Convert string labels to integer indices
label_encoder = LabelEncoder()

data['LabelIndices'] = label_encoder.fit_transform(data['Labels'])

data["Labels"]

le_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(le_name_mapping)
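
# Prints the class-to-index mapping, e.g. {'Anxiety': 0, 'Depression': 1, ...} --
# the exact keys depend on the label set in stress_data.xlsx (illustrative values).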

import joblib

# Assuming 'label_encoder' is your LabelEncoder instance
joblib.dump(label_encoder, 'label_encoder.joblib')

label_encoder = joblib.load("/content/label_encoder.joblib")

def decode_predictions(predictions):
    # Extract predicted indices (assuming predictions is a list of dicts with 'label' keys)
    predicted_indices = [int(pred['label'].split('_')[-1]) for pred in predictions]
    # Decode the indices to original labels
    decoded_labels = label_encoder.inverse_transform(predicted_indices)
    return decoded_labels

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="NeuEraAI/Stress_Classifier_BERT")
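
# The text-classification pipeline returns a list of dicts such as
# [{'label': 'LABEL_2', 'score': 0.98}] (score illustrative); decode_predictions
# parses the index out of the default 'LABEL_<k>' name and inverts the encoder.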
decode_predictions(pipe.predict("I am in huge debts. I have taken huge loans and I can't repay."))