File size: 1,260 Bytes
3630984
6176775
4c30733
6176775
4c30733
6176775
 
4c30733
 
 
6176775
 
4c30733
6176775
4c30733
6176775
 
4c30733
6176775
4c30733
 
 
 
230173f
 
4c30733
6176775
 
 
 
 
 
24e7e9e
6176775
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the pre-trained BERT model and tokenizer.
# NOTE(review): "bert-base-uncased" is a base checkpoint, so the 6-output
# classification head requested via num_labels=6 is presumably newly
# initialised rather than fine-tuned -- confirm whether a fine-tuned
# checkpoint was intended before trusting the predictions.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

# Load the data. The loop at the bottom of this script reads the
# "comment_text" column and the columns named in `labels` from this frame.
data = pd.read_csv("toxic_comments.csv")

# Define the function to preprocess the text
def preprocess(text):
    """Tokenize *text* and return its ``(input_ids, attention_mask)`` tensors.

    The module-level ``tokenizer`` is called with padding enabled and
    truncation to at most 128 tokens, returning PyTorch tensors.
    """
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    token_ids = encoded["input_ids"]
    mask = encoded["attention_mask"]
    return token_ids, mask

# Define the function to classify a text input
def classify(text):
    """Score a single comment against every entry in ``labels``.

    Args:
        text: The comment to classify; it is passed to ``preprocess``.

    Returns:
        A dict mapping each name in the module-level ``labels`` list to the
        sigmoid of the corresponding model logit.
    """
    input_ids, attention_mask = preprocess(text)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
    # Drop only the leading batch axis. A bare squeeze() would also collapse
    # any other size-1 axis and silently change the result's shape.
    preds = torch.sigmoid(logits).squeeze(0).tolist()
    # Pair names and scores positionally: labels[i] <-> output index i.
    return dict(zip(labels, preds))

# Define the labels.
# The order matters: classify() pairs labels[i] with the i-th model output,
# and the reporting loop uses these same names as column keys on each row.
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Classify the comments and print the results
for _, row in data.iterrows():
    text = row["comment_text"]
    # Run the model on this comment. Previously `preds` was printed without
    # ever being assigned, which raised NameError on the first row.
    preds = classify(text)

    print("Comment: ", text)
    print("Predictions: ", preds)
    # Ground-truth values for the same label columns, for comparison.
    print("Labels: ", row[labels].to_dict())