ipatate committed
Commit 781432b · 1 Parent(s): f134b07
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
+ results
+ logs
+ .DS_Store
README.md CHANGED
@@ -1,3 +1,29 @@
 ---
 license: apache-2.0
 ---
+ # SPAM Mail Classifier
+
+ This model is fine-tuned from `microsoft/Multilingual-MiniLM-L12-H384` to classify email subjects as SPAM or NOSPAM.
+
+ ## Model Details
+
+ - **Base model**: `microsoft/Multilingual-MiniLM-L12-H384`
+ - **Fine-tuned for**: Text classification
+ - **Number of classes**: 2 (SPAM, NOSPAM)
+ - **Languages**: Multilingual
+
+ ## Usage
+
+ ```python
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ model_name = "Goodmotion/spam-mail-classifier"
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+ # French example: "Congratulations! You have won an iPhone."
+ text = "Félicitations ! Vous avez gagné un iPhone."
+ inputs = tokenizer(text, return_tensors="pt")
+ outputs = model(**inputs)
+
+ print(outputs.logits)
+ ```
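The final `print` shows raw logits. To turn them into a label, apply a softmax and take the argmax; a minimal sketch continuing the snippet above, assuming index 1 means SPAM and index 0 means NOSPAM (the mapping used by the training script later in this commit):

```python
import torch

# Assumed mapping: training.py encodes NOSPAM as 0 and SPAM as 1.
id2label = {0: "NOSPAM", 1: "SPAM"}

# Convert logits to probabilities and pick the most likely class.
probs = torch.softmax(outputs.logits, dim=-1)
pred = probs.argmax(dim=-1).item()
print(f"{id2label[pred]} ({probs[0, pred].item():.3f})")
```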
config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e88f8a89ca5b10b211317283d2d7e7eae566ac99298bc8b64c3ece9a8c62b1e
+ size 408
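Because of the new `*.json` rule in `.gitattributes`, this `config.json` (like the binary files below) is checked in as a Git LFS pointer rather than its real content. A minimal sketch of reading such a pointer, assuming the three `key value` lines shown above:

```python
# Hypothetical helper: parse a Git LFS pointer file into its fields
# (version, oid, size), one "key value" pair per line.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

# Example: parse_lfs_pointer("config.json")["size"] -> "408"
```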
spam-classifier/config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4da19d2b12d022afe7e2b6f0534f39c2087f8005e0251f603602dac87edcbf01
+ size 758
spam-classifier/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12d1982a746320f27454128dfe77b87347cfdb6471d4761d9a72bff8bb6e47bd
+ size 470641664
spam-classifier/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
spam-classifier/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06e405a36dfe4b9604f484f6a1e619af1a7f7d09e34a8555eb0b77b66318067f
+ size 280
spam-classifier/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b74659c780d49afad7a7b9799868f75cbd3014fb6c34956e85a793028d38094a
+ size 17098251
spam-classifier/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:511d98e87c7de2d3b649bea83a6cf1e028b9364457506277f114b57cdfb5b1a2
+ size 1201
special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2207e01f191626729e08582912c9bf23876883924839b2bbee97489f804e00e
+ size 125
tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eeb6851d685bd6fbc691b44583cb8a367d217341b8413204ae34ebcb2e0bdc92
+ size 206
training.py ADDED
@@ -0,0 +1,62 @@
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
+ from datasets import load_dataset
+
+ model_name = "microsoft/Multilingual-MiniLM-L12-H384"
+
+ dataset = load_dataset("Goodmotion/spam-mail")
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # map the string labels to integer ids
+ def encode_labels(data):
+     label_map = {"SPAM": 1, "NOSPAM": 0}
+     data["label"] = label_map[data["label"]]
+     return data
+
+ def tokenize_data(data):
+     return tokenizer(
+         data["text"],
+         padding="max_length",
+         truncation=True,
+         max_length=128
+     )
+
+ # encode the labels, then tokenize the dataset
+ dataset = dataset.map(encode_labels)
+ tokenized_dataset = dataset.map(tokenize_data, batched=True)
+
+ # define the model with a 2-way classification head
+ model = AutoModelForSequenceClassification.from_pretrained(
+     model_name,
+     num_labels=2
+ )
+ # re-initialize the classification head
+ model.classifier.weight.data.normal_(mean=0.0, std=0.02)
+ model.classifier.bias.data.zero_()
+
+ training_args = TrainingArguments(
+     output_dir="./results",
+     # fine-tuning learning rate
+     learning_rate=5e-5,
+     # 16 examples per device
+     per_device_train_batch_size=16,
+     # three passes over the training data
+     num_train_epochs=3,
+     # L2 regularization strength
+     weight_decay=0.01,
+     logging_dir="./logs"
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset["train"],
+ )
+
+ # train the model
+ trainer.train()
+
+ # save the model and tokenizer
+ model.save_pretrained("./spam-classifier")
+ tokenizer.save_pretrained("./spam-classifier")
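The script saves the checkpoint without recording label names, so a reloaded model reports generic `LABEL_0`/`LABEL_1`. A small optional addition before saving, assuming the `NOSPAM`=0 / `SPAM`=1 mapping from `encode_labels` above (not part of the original script):

```python
# Store human-readable label names in the config before saving, so that
# AutoModelForSequenceClassification.from_pretrained() exposes them.
model.config.id2label = {0: "NOSPAM", 1: "SPAM"}
model.config.label2id = {"NOSPAM": 0, "SPAM": 1}
model.save_pretrained("./spam-classifier")
```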