Upload stress_categorization_using_bert_transformer.py
stress_categorization_using_bert_transformer.py
ADDED
@@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
"""Stress Categorization Using BERT Transformer.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1JZTLCAUBN6XkcQpAWukUsx7dJ5VyC_KR
"""

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
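
# NOTE: numpy and the Keras text/sequence/model imports above (Tokenizer,
# pad_sequences, Sequential, the layer classes, to_categorical) are not used by
# the BERT workflow below; only train_test_split and LabelEncoder are referenced.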

!pip install transformers

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf

# 1. Load and inspect the data
data = pd.read_excel('stress_data.xlsx')

# 2. Clean and preprocess the data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+|\W+', ' ', text)
    return text

data['Cleaned_Posts'] = data['Posts'].apply(clean_text)
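
# Illustrative only -- a hypothetical post showing what clean_text does:
#   clean_text("Lost my job!!! See https://example.com, owe 5000 now")
#   -> roughly "lost my job see owe now"
# (lowercased; URLs, digits, and punctuation runs replaced by spaces)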

# Convert string labels to integer indices
label_encoder = LabelEncoder()

data['LabelIndices'] = label_encoder.fit_transform(data['Labels'])

# 3. Tokenize data using BERT's tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Split the data into train and test
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Convert data to InputExample format
def convert_data_to_input_example(data):
    return data.apply(lambda x: InputExample(guid=None, text_a=x['Cleaned_Posts'], text_b=None, label=x['LabelIndices']), axis=1)

train_InputExamples = convert_data_to_input_example(train)
test_InputExamples = convert_data_to_input_example(test)

# Convert to features for BERT input
def convert_input_example_to_feature(example):
    # padding='max_length' and truncation=True replace the deprecated pad_to_max_length=True
    return tokenizer.encode_plus(example.text_a, add_special_tokens=True, max_length=128, padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=False)

train_features = train_InputExamples.apply(convert_input_example_to_feature)
test_features = test_InputExamples.apply(convert_input_example_to_feature)

# Convert features to tensorflow dataset
def convert_features_to_tf_dataset(features, labels):
    def gen():
        for f, l in zip(features, labels):
            yield ({'input_ids': f['input_ids'], 'attention_mask': f['attention_mask']}, l)
    return tf.data.Dataset.from_generator(gen, ({'input_ids': tf.int32, 'attention_mask': tf.int32}, tf.int64), ({'input_ids': tf.TensorShape([None]), 'attention_mask': tf.TensorShape([None])}, tf.TensorShape([])))
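
# Each generated element is a (features, label) pair: a dict holding 'input_ids'
# and 'attention_mask' (each a length-128 integer sequence after padding) plus a
# scalar integer label, which is what TFBertForSequenceClassification's fit() expects.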

def decode_predictions(predictions):
    # Extract predicted indices (assuming predictions is a list of dicts with 'label' keys)
    predicted_indices = [int(pred['label'].split('_')[-1]) for pred in predictions]
    # Decode the indices to original labels
    decoded_labels = label_encoder.inverse_transform(predicted_indices)
    return decoded_labels

train_dataset = convert_features_to_tf_dataset(train_features, train['LabelIndices']).shuffle(100).batch(32).repeat(2)
test_dataset = convert_features_to_tf_dataset(test_features, test['LabelIndices']).batch(32)
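
# shuffle(100) uses a 100-element shuffle buffer, batch(32) sets the batch size,
# and repeat(2) means fit() below sees two passes over the training data even
# though it is called with epochs=1.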

# 4. Fine-tune BERT on the dataset
model_new = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(data['Labels'].unique()))
model_new.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
model_new.fit(train_dataset, epochs=1, validation_data=test_dataset)

# The raw model cannot predict on a bare string; encode the text first, then map
# the argmax of the logits back through the label encoder.
sample = tokenizer("I am financially broken", return_tensors="tf", padding='max_length', truncation=True, max_length=128)
sample_logits = model_new(dict(sample)).logits
print(label_encoder.inverse_transform([int(tf.argmax(sample_logits, axis=1)[0])]))

# 5. Evaluate the model
loss, accuracy = model_new.evaluate(test_dataset)
print(f"Test accuracy: {accuracy}")
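
# A minimal sketch of persisting the fine-tuned weights so they can be reloaded;
# the hosted "NeuEraAI/Stress_Classifier_BERT" checkpoint used below was presumably
# produced by a step like this (the local path here is hypothetical):
# model_new.save_pretrained("stress_classifier_bert")
# tokenizer.save_pretrained("stress_classifier_bert")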

"""# New Section"""

!pip install transformers

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf

# 1. Load and inspect the data
data = pd.read_excel('stress_data.xlsx')

# 2. Clean and preprocess the data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+|\W+', ' ', text)
    return text

data['Cleaned_Posts'] = data['Posts'].apply(clean_text)

# Convert string labels to integer indices
label_encoder = LabelEncoder()

data['LabelIndices'] = label_encoder.fit_transform(data['Labels'])

# 3. Tokenize data using BERT's tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Bare expressions below are Colab cell outputs: they display the tokenizer and
# the dataframe when run interactively.
tokenizer

data

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf

# 1. Load and inspect the data
data = pd.read_excel('stress_data.xlsx')

# 2. Clean and preprocess the data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+|\W+', ' ', text)
    return text

data['Cleaned_Posts'] = data['Posts'].apply(clean_text)

# Convert string labels to integer indices
label_encoder = LabelEncoder()

data['LabelIndices'] = label_encoder.fit_transform(data['Labels'])

data["Labels"]

le_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(le_name_mapping)
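
# Prints the class-to-index mapping, e.g. {'Anxiety': 0, 'Depression': 1, ...} --
# the exact keys depend on the label set in stress_data.xlsx (illustrative values).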

import joblib

# Assuming 'label_encoder' is your LabelEncoder instance
joblib.dump(label_encoder, 'label_encoder.joblib')

label_encoder = joblib.load("/content/label_encoder.joblib")

def decode_predictions(predictions):
    # Extract predicted indices (assuming predictions is a list of dicts with 'label' keys)
    predicted_indices = [int(pred['label'].split('_')[-1]) for pred in predictions]
    # Decode the indices to original labels
    decoded_labels = label_encoder.inverse_transform(predicted_indices)
    return decoded_labels

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="NeuEraAI/Stress_Classifier_BERT")
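
# The text-classification pipeline returns a list of dicts such as
# [{'label': 'LABEL_2', 'score': 0.98}] (score illustrative); decode_predictions
# parses the index out of the default 'LABEL_<k>' name and inverts the encoder.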
decode_predictions(pipe.predict("I am in huge debts. I have taken huge loans and I can't repay."))