dp92 committed on
Commit
6176775
·
1 Parent(s): 3fca404

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -41
app.py CHANGED
@@ -1,51 +1,22 @@
 
 
1
  import torch
2
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
3
 
4
- # Load the pretrained BERT model and tokenizer
5
- model_name = 'bert-base-uncased'
6
  tokenizer = AutoTokenizer.from_pretrained(model_name)
7
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
8
 
9
- # Define the training data and labels
10
- train_texts = [...] # List of training text inputs
11
- train_labels = [...] # List of training labels (one-hot encoded)
12
 
13
- # Define a function to preprocess the text input
14
  def preprocess(text):
15
- inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
16
- return inputs['input_ids'], inputs['attention_mask']
17
 
18
- # Define a function to encode the labels as one-hot vectors
19
- def encode_labels(labels):
20
- return torch.tensor(labels, dtype=torch.float)
21
-
22
- # Define the training data and labels as PyTorch tensors
23
- train_inputs = [preprocess(text) for text in train_texts]
24
- train_labels = encode_labels(train_labels)
25
-
26
- # Define the training arguments
27
- training_args = TrainingArguments(
28
- output_dir='./results',
29
- num_train_epochs=3,
30
- per_device_train_batch_size=32,
31
- per_device_eval_batch_size=64,
32
- warmup_steps=500,
33
- weight_decay=0.01,
34
- logging_dir='./logs',
35
- logging_steps=10
36
- )
37
-
38
- # Define the trainer object
39
- trainer = Trainer(
40
- model=model,
41
- args=training_args,
42
- train_dataset=list(zip(train_inputs, train_labels))
43
- )
44
-
45
- # Train the model
46
- trainer.train()
47
-
48
- # Define a function to classify a text input
49
  def classify(text):
50
  input_ids, attention_mask = preprocess(text)
51
  with torch.no_grad():
@@ -53,4 +24,13 @@ def classify(text):
53
  preds = torch.sigmoid(logits).squeeze().tolist()
54
  return {labels[i]: preds[i] for i in range(len(labels))}
55
 
56
- # Example usage
 
 
 
 
 
 
 
 
 
 
1
+ pip install transformers
2
+ import pandas as pd
3
  import torch
4
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
 
6
+ # Load the pre-trained BERT model and tokenizer
7
+ model_name = "bert-base-uncased"
8
  tokenizer = AutoTokenizer.from_pretrained(model_name)
9
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
10
 
11
+ # Load the data
12
+ data = pd.read_csv("toxic_comments.csv")
 
13
 
14
+ # Define the function to preprocess the text
15
  def preprocess(text):
16
+ inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")
17
+ return inputs["input_ids"], inputs["attention_mask"]
18
 
19
+ # Define the function to classify a text input
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def classify(text):
21
  input_ids, attention_mask = preprocess(text)
22
  with torch.no_grad():
 
24
  preds = torch.sigmoid(logits).squeeze().tolist()
25
  return {labels[i]: preds[i] for i in range(len(labels))}
26
 
27
+ # Define the labels
28
+ labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
29
+
30
+ # Classify the comments and print the results
31
+ for i, row in data.iterrows():
32
+ text = row["comment_text"]
33
+ preds = classify(text)
34
+ print("Comment: ", text)
35
+ print("Predictions: ", preds)
36
+ print("Labels: ", row[labels].to_dict())