gautamnancy committed on
Commit
53c4a01
·
verified ·
1 Parent(s): 977d821

Delete spam_detection_pipeline.md

Browse files
Files changed (1) hide show
  1. spam_detection_pipeline.md +0 -151
spam_detection_pipeline.md DELETED
@@ -1,151 +0,0 @@
1
-
2
- # Sentiment Classification using DistilBERT and Quantization
3
-
4
- ## 🛠 Install Dependencies
5
-
6
- ```bash
7
- !pip install transformers datasets evaluate scikit-learn torch
8
- !pip install evaluate
9
- ```
10
-
11
- ## 📥 Step 1: Load and Reduce Dataset
12
-
13
- ```python
14
- from datasets import load_dataset
15
- dataset = load_dataset("yelp_polarity")
16
- dataset["train"] = dataset["train"].shuffle(seed=42).select(range(50000))
17
- dataset["test"] = dataset["test"].shuffle(seed=42).select(range(10000))
18
- ```
19
-
20
- ## ✂️ Step 2: Tokenization
21
-
22
- ```python
23
- from transformers import AutoTokenizer
24
- model_name = "distilbert-base-uncased"
25
- tokenizer = AutoTokenizer.from_pretrained(model_name)
26
-
27
- def tokenize_function(example):
28
- return tokenizer(example["text"], padding="max_length", truncation=True)
29
-
30
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
31
- ```
32
-
33
- ## 🏷 Step 3: Rename 'label' to 'labels' and Set Format
34
-
35
- ```python
36
- tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
37
- tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
38
- ```
39
-
40
- ## 🧠 Step 4: Load Model
41
-
42
- ```python
43
- from transformers import AutoModelForSequenceClassification
44
- model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
45
- ```
46
-
47
- ## 📊 Step 5: Define Metrics
48
-
49
- ```python
50
- import numpy as np
51
- from sklearn.metrics import accuracy_score, precision_recall_fscore_support
52
-
53
- def compute_metrics(eval_pred):
54
- logits, labels = eval_pred
55
- preds = np.argmax(logits, axis=-1)
56
- acc = accuracy_score(labels, preds)
57
- precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
58
- return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}
59
- ```
60
-
61
- ## ⚙️ Step 6: Training Setup
62
-
63
- ```python
64
- from transformers import TrainingArguments, Trainer
65
-
66
- training_args = TrainingArguments(
67
- output_dir="./results",
68
- eval_strategy="epoch",
69
- learning_rate=2e-5,
70
- per_device_train_batch_size=16,
71
- per_device_eval_batch_size=16,
72
- num_train_epochs=3,
73
- weight_decay=0.01,
74
- logging_dir="./logs",
75
- logging_steps=10,
76
- )
77
-
78
- trainer = Trainer(
79
- model=model,
80
- args=training_args,
81
- train_dataset=tokenized_datasets["train"],
82
- eval_dataset=tokenized_datasets["test"],
83
- compute_metrics=compute_metrics,
84
- )
85
- ```
86
-
87
- ## 🚀 Step 7: Train
88
-
89
- ```python
90
- trainer.train()
91
- trainer.save_model("./results")
92
- tokenizer.save_pretrained("./results")
93
- ```
94
-
95
- ## 🔍 Step 8: Inference on Sample Texts
96
-
97
- ```python
98
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
99
- import torch
100
-
101
- model = AutoModelForSequenceClassification.from_pretrained("./results")
102
- tokenizer = AutoTokenizer.from_pretrained("./results")
103
- model.eval()
104
-
105
- sample_texts = [
106
- "The food was absolutely wonderful!",
107
- "Terrible experience. I will never come back.",
108
- "Average service, but the food was decent.",
109
- "I loved the ambiance and the staff was super friendly!",
110
- "Worst food I've had in a long time.",
111
- "Highly recommend this place for a date night.",
112
- "The waiter was rude and the food was cold.",
113
- "Amazing pizza, will order again!",
114
- "They took too long to serve and it was overpriced.",
115
- "Best customer service and delicious desserts!"
116
- ]
117
-
118
- for text in sample_texts:
119
- inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
120
- with torch.no_grad():
121
- outputs = model(**inputs)
122
- prediction = torch.argmax(outputs.logits, dim=-1).item()
123
- sentiment = "Positive" if prediction == 1 else "Negative"
124
- print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")
125
- ```
126
-
127
- ## 📦 Step 9: Quantize the Model
128
-
129
- ```python
130
- import os
131
- import torch
132
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
133
-
134
- model = AutoModelForSequenceClassification.from_pretrained("./results")
135
-
136
- quantized_model = torch.quantization.quantize_dynamic(
137
- model,
138
- {torch.nn.Linear},
139
- dtype=torch.qint8
140
- )
141
-
142
- quantized_model_path = "./results/quantized_model"
143
- os.makedirs(quantized_model_path, exist_ok=True)
144
-
145
- torch.save(quantized_model.state_dict(), f"{quantized_model_path}/pytorch_model.bin")
146
- model.config.save_pretrained(quantized_model_path)
147
- tokenizer = AutoTokenizer.from_pretrained("./results")
148
- tokenizer.save_pretrained(quantized_model_path)
149
-
150
- print("✅ Quantized model saved at:", quantized_model_path)
151
- ```