dp92 committed on
Commit
7e3e24e
·
1 Parent(s): 966eeae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -38
app.py CHANGED
@@ -1,38 +1,56 @@
1
- import gradio as gr
2
- import tensorflow as tf
3
- from tensorflow.keras.layers import TextVectorization
4
-
5
-
6
- # Load the saved model
7
- model = tf.keras.models.load_model('toxicity.h5')
8
-
9
- # Columns names
10
- columns = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
11
-
12
- # Specifying the maximum number of words in the vocabulary
13
- MAX_FEATURES=200000
14
-
15
- # Creating a TextVectorization layer with the specified parameters
16
- vectorizer = TextVectorization(max_tokens=MAX_FEATURES,output_sequence_length=1800,output_mode="int")
17
-
18
- # Define a function to score a comment
19
- def score_comment(comment):
20
- # Vectorize the comment using the vectorizer
21
- vectorized_comment = vectorizer([comment])
22
-
23
- # Get the prediction results from the model
24
- results = model.predict(vectorized_comment)
25
-
26
- # Create a string to return the prediction results for each class
27
- text = ''
28
- for idx,col in enumerate(columns):
29
- text += '{}: {}\n'.format(col,results[0][idx]>0.5)
30
-
31
- return text
32
-
33
-
34
- # Create a Gradio interface for the score_comment function
35
- interface = gr.Interface(fn=score_comment,inputs=gr.inputs.Textbox(lines=2,placeholder="Comment to score"),outputs="text",title='Comment Toxicity Classifier')
36
-
37
- # Launch the Gradio interface
38
- interface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fine-tune BERT for multi-label toxic-comment classification.

Reads ``train.csv`` and ``test.csv`` from the working directory, fine-tunes
``bert-base-uncased`` on the six toxicity label columns, evaluates on a
held-out slice of the training data and prints the evaluation metrics.
"""
# NOTE: the original first line was `!pip install transformers`. That is
# IPython/notebook syntax and a SyntaxError in a plain .py file; declare
# `transformers` in requirements.txt (or install it beforehand) instead.
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Label columns read from train.csv; each one is an independent 0/1 target.
LABEL_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Comments longer than this many tokens are truncated.
MAX_LENGTH = 128
# Fraction of the labelled data held out for evaluation.
VALIDATION_FRACTION = 0.1

# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# problem_type selects a per-label sigmoid/BCE loss. Without it the model
# treats integer labels as single-label classes and the (batch, 6) label
# matrix does not fit the cross-entropy loss.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(LABEL_COLUMNS),
    problem_type="multi_label_classification",
)


def preprocess(text):
    """Tokenize ``text`` and return ``(input_ids, attention_mask)`` tensors.

    ``text`` is a string or a list of strings. Sequences are padded to the
    longest one in the call and truncated to ``MAX_LENGTH`` tokens.
    """
    inputs = tokenizer(text, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
    return inputs["input_ids"], inputs["attention_mask"]


class ToxicCommentDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields dict examples for ``Trainer``.

    ``Trainer`` passes each batch to the model as keyword arguments, so every
    example must be a mapping with named fields; a ``TensorDataset`` yields
    positional tuples and cannot be collated by the default collator.
    """

    def __init__(self, input_ids, attention_mask, labels=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels  # None for unlabelled data

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        item = {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
        }
        if self.labels is not None:
            item["labels"] = self.labels[idx]
        return item


def _comments(frame):
    """Return the ``comment_text`` column as a list of plain strings."""
    # The tokenizer only accepts strings; empty cells parsed as NaN would fail.
    return frame["comment_text"].fillna("").astype(str).tolist()


# Load the train and test data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# The test set is used without label columns, so it cannot produce an
# evaluation loss. Hold out part of the labelled data for evaluation instead.
train_split, val_split = train_test_split(
    train_data, test_size=VALIDATION_FRACTION, random_state=42
)

# Preprocess the train, validation and test data
X_train = _comments(train_split)
y_train = train_split[LABEL_COLUMNS].values.tolist()
# Multi-label BCE loss requires float targets.
train_dataset = ToxicCommentDataset(
    *preprocess(X_train), labels=torch.tensor(y_train, dtype=torch.float)
)

X_val = _comments(val_split)
y_val = val_split[LABEL_COLUMNS].values.tolist()
val_dataset = ToxicCommentDataset(
    *preprocess(X_val), labels=torch.tensor(y_val, dtype=torch.float)
)

# Unlabelled; kept available for `trainer.predict(test_dataset)`.
X_test = _comments(test_data)
test_dataset = ToxicCommentDataset(*preprocess(X_test))

# Define the training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Evaluate the model on the held-out validation split
eval_results = trainer.evaluate()

# Print the evaluation results
print(eval_results)