Spaces:

dp92
/

Toxiclassifier

Build error

App Files Files Community

dp92 committed on Apr 26, 2023

Commit

7e3e24e

1 Parent(s): 966eeae

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -38

app.py CHANGED Viewed

@@ -1,38 +1,56 @@
-import gradio as gr
-import tensorflow as tf
-from tensorflow.keras.layers import TextVectorization
-# Load the saved model
-model = tf.keras.models.load_model('toxicity.h5')
-# Columns names
-columns = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
-# Specifying the maximum number of words in the vocabulary
-MAX_FEATURES=200000
-# Creating a TextVectorization layer with the specified parameters
-vectorizer = TextVectorization(max_tokens=MAX_FEATURES,output_sequence_length=1800,output_mode="int")
-# Define a function to score a comment
-def score_comment(comment):
-  # Vectorize the comment using the vectorizer
-  vectorized_comment = vectorizer([comment])
-  # Get the prediction results from the model
-  results = model.predict(vectorized_comment)
-  # Create a string to return the prediction results for each class
-  text = ''
-  for idx,col in enumerate(columns):
-    text += '{}: {}\n'.format(col,results[0][idx]>0.5)
-  return text
-# Create a Gradio interface for the score_comment function
-interface = gr.Interface(fn=score_comment,inputs=gr.inputs.Textbox(lines=2,placeholder="Comment to score"),outputs="text",title='Comment Toxicity Classifier')
-# Launch the Gradio interface
-interface.launch()

+# NOTE: "!pip install ..." is notebook (IPython) syntax and is a SyntaxError in
+# a plain Python file such as app.py. Declare the dependencies this script
+# imports (transformers, torch, pandas, scikit-learn) in requirements.txt.
+import pandas as pd
+import torch
+from sklearn.model_selection import train_test_split
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+# Load the pre-trained BERT model and tokenizer
+model_name = "bert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Each comment is scored against six 0/1 label columns at once, so the
+# classification head is configured for multi-label targets explicitly
+# instead of relying on the library inferring the loss from the label dtype.
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_name,
+    num_labels=6,
+    problem_type="multi_label_classification",
+)
+# Load the train and test data
+train_data = pd.read_csv("train.csv")
+test_data = pd.read_csv("test.csv")
+# Define the function to preprocess the text
+def preprocess(text):
+    """Tokenize `text` with the module-level tokenizer.
+
+    The input is padded, truncated to at most 128 tokens and returned as
+    PyTorch tensors.
+
+    Returns:
+        A `(input_ids, attention_mask)` tuple taken from the tokenizer output.
+    """
+    encoded = tokenizer(
+        text,
+        padding=True,
+        truncation=True,
+        max_length=128,
+        return_tensors="pt",
+    )
+    input_ids = encoded["input_ids"]
+    attention_mask = encoded["attention_mask"]
+    return input_ids, attention_mask
+# Preprocess the train and test data
+LABEL_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
+
+
+class EncodedCommentDataset(torch.utils.data.Dataset):
+    """Serve tokenized comments as dict-shaped examples for `Trainer`.
+
+    `Trainer` collates examples by key and passes them to the model as keyword
+    arguments, so each item must be a mapping (`input_ids`, `attention_mask`
+    and, when available, `labels`) rather than the positional tuples that
+    `TensorDataset` yields.
+
+    Args:
+        encodings: Tokenizer output holding `input_ids` and `attention_mask`
+            tensors with one row per comment.
+        labels: Optional float tensor with one row of label values per
+            comment; omit it for unlabeled data.
+    """
+
+    def __init__(self, encodings, labels=None):
+        self.encodings = encodings
+        self.labels = labels
+
+    def __len__(self):
+        return len(self.encodings["input_ids"])
+
+    def __getitem__(self, idx):
+        item = {
+            "input_ids": self.encodings["input_ids"][idx],
+            "attention_mask": self.encodings["attention_mask"][idx],
+        }
+        if self.labels is not None:
+            item["labels"] = self.labels[idx]
+        return item
+
+
+# Hold out part of the labelled data for evaluation: the test set below is
+# built without labels, so it cannot produce an evaluation loss.
+X_train, X_val, y_train, y_val = train_test_split(
+    train_data["comment_text"].tolist(),
+    train_data[LABEL_COLUMNS].values.tolist(),
+    test_size=0.1,
+    random_state=42,  # fixed seed keeps the split reproducible between runs
+)
+train_encodings = tokenizer(X_train, padding=True, truncation=True, max_length=128, return_tensors="pt")
+val_encodings = tokenizer(X_val, padding=True, truncation=True, max_length=128, return_tensors="pt")
+# Multi-label training uses a binary cross-entropy loss, which needs float
+# targets; integer targets would be treated as single-label class indices.
+train_dataset = EncodedCommentDataset(train_encodings, torch.tensor(y_train, dtype=torch.float))
+val_dataset = EncodedCommentDataset(val_encodings, torch.tensor(y_val, dtype=torch.float))
+# The unlabeled test set is kept for inference only (e.g. trainer.predict).
+X_test = test_data["comment_text"].tolist()
+test_encodings = tokenizer(X_test, padding=True, truncation=True, max_length=128, return_tensors="pt")
+test_dataset = EncodedCommentDataset(test_encodings)
+# Define the training arguments
+training_args = TrainingArguments(
+    output_dir='./results',
+    evaluation_strategy="epoch",
+    num_train_epochs=3,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=64,
+    logging_dir='./logs',
+)
+# Define the trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,  # labelled hold-out split, not the unlabeled test set
+)
+# Train the model
+trainer.train()
+# Evaluate the model on the held-out validation split
+eval_results = trainer.evaluate()
+# Print the evaluation results
+print(eval_results)