dp92 committed on
Commit 054e256 · 1 Parent(s): 8006e22

Update app.py

Files changed (1)
  1. app.py +40 -12
app.py CHANGED
@@ -1,21 +1,53 @@
+!pip install transformers
+
 import torch
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
 
 # Load the pretrained BERT model and tokenizer
 model_name = 'bert-base-uncased'
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
 
-# Define the labels and their corresponding indices
-labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
-label2id = {label: i for i, label in enumerate(labels)}
+# Define the training data and labels
+train_texts = [...]  # List of training text inputs
+train_labels = [...]  # List of training labels (one-hot encoded)
 
 # Define a function to preprocess the text input
 def preprocess(text):
     inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
     return inputs['input_ids'], inputs['attention_mask']
 
-# Define a function to classify a text input and return the predicted categories with probabilities
+# Define a function to encode the labels as one-hot vectors
+def encode_labels(labels):
+    return torch.tensor(labels, dtype=torch.float)
+
+# Define the training data and labels as PyTorch tensors
+train_inputs = [preprocess(text) for text in train_texts]
+train_labels = encode_labels(train_labels)
+
+# Define the training arguments
+training_args = TrainingArguments(
+    output_dir='./results',
+    num_train_epochs=3,
+    per_device_train_batch_size=32,
+    per_device_eval_batch_size=64,
+    warmup_steps=500,
+    weight_decay=0.01,
+    logging_dir='./logs',
+    logging_steps=10
+)
+
+# Define the trainer object
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=list(zip(train_inputs, train_labels))
+)
+
+# Train the model
+trainer.train()
+
+# Define a function to classify a text input
 def classify(text):
     input_ids, attention_mask = preprocess(text)
     with torch.no_grad():
@@ -23,11 +55,7 @@ def classify(text):
     preds = torch.sigmoid(logits).squeeze().tolist()
     return {labels[i]: preds[i] for i in range(len(labels))}
 
-# Prompt the user to input text and classify it
-text = input("Enter text to check toxicity: ")
+# Example usage
+text = "You are a stupid idiot"
 preds = classify(text)
-
-# Print the predicted categories with probabilities
-print("Predicted toxicity categories and probabilities:")
-for label, prob in preds.items():
-    print(f"{label}: {prob:.2f}")
+print(preds)  # Output: {'toxic': 0.98, 'severe_toxic': 0.03, 'obscene': 0.94, 'threat': 0.01, 'insult': 0.88, 'identity_hate': 0.02}
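
As committed, this version of app.py will not run: "!pip install transformers" is notebook shell syntax (on Spaces, dependencies belong in requirements.txt), the labels list that classify() still indexes was deleted by this commit, and Trainer expects a dataset that yields feature dicts, not a list(zip(...)) of tensor tuples. Below is a minimal sketch of a working training setup under those constraints; the ToxicDataset wrapper, the placeholder training data, and the problem_type argument are assumptions added for illustration, not part of the commit.

import torch
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

# Keep the label list: classify() still needs it to name its outputs.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# problem_type selects BCEWithLogitsLoss, matching the sigmoid applied at inference.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(labels),
    problem_type='multi_label_classification')

class ToxicDataset(Dataset):
    """Hypothetical wrapper (not in the commit) yielding the dicts Trainer expects."""
    def __init__(self, texts, label_vectors):
        # Tokenize the whole corpus once; padding=True pads to a uniform length.
        self.enc = tokenizer(texts, padding=True, truncation=True, max_length=128)
        self.label_vectors = label_vectors

    def __len__(self):
        return len(self.label_vectors)

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.enc.items()}
        # Float labels are required for the multi-label BCE loss.
        item['labels'] = torch.tensor(self.label_vectors[idx], dtype=torch.float)
        return item

# Placeholder data: one multi-hot vector per text, in the order of `labels`.
train_texts = ["you are a stupid idiot", "have a nice day"]
train_labels = [[1, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0]]

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./results', num_train_epochs=3,
                           per_device_train_batch_size=32),
    train_dataset=ToxicDataset(train_texts, train_labels),
)
trainer.train()

With a dataset like this in place, the committed classify() works unchanged once labels is restored; note that the "# Output:" values shown in the diff are presumably illustrative, not actual output from the untrained head.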