InVoS committed on
Commit
dba8b30
·
verified ·
1 Parent(s): 6e84507

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +89 -0
main.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Required libraries. Install them from a shell before running this script:
#     pip install transformers datasets scikit-learn torch
# `!pip install ...` only works inside IPython/Colab cells and is a
# SyntaxError in a plain .py file, so the command is kept as a comment.
3
+
4
+ # Import library yang diperlukan
5
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
6
+ from datasets import load_dataset
7
+ import torch
8
+ import numpy as np
9
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
10
+
11
# Load the tokenizer. The classification model is created further down, once
# the number of labels is known; loading it here as well would only build a
# head with the checkpoint's default label count that is replaced before use.
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")

# Load the dataset and pick its train/test splits.
ds = load_dataset("celikmus/symptom_text_to_disease_01")
train_dataset = ds['train']
test_dataset = ds['test']
19
+
20
# Example list of symptom names, keyed by integer label id.
symptom_mapping = {
    0: "emotional pain", 1: "hair falling out", 2: "heart hurts", 3: "infected wound",
    4: "foot ache", 5: "shoulder pain", 6: "injury from sports", 7: "skin issue",
    8: "stomach ache", 9: "knee pain", 10: "joint pain", 11: "hard to breath",
    12: "head ache", 13: "body feels weak", 14: "feeling dizzy", 15: "back pain",
    16: "open wound", 17: "internal pain", 18: "blurry vision", 19: "acne",
    20: "muscle pain", 21: "neck pain", 22: "cough", 23: "ear ache", 24: "feeling cold",
}

# Add a 'symptom_name' column holding the readable name of each label.
# Iterating over a datasets.Dataset yields fresh dicts, so assigning to
# `entry[...]` inside a for-loop never reaches the dataset; `map` returns a
# new dataset that actually carries the extra column.
# NOTE(review): assumes every value in 'labels' is a key of symptom_mapping;
# a label outside 0..24 raises KeyError here - confirm against the dataset.
train_dataset = train_dataset.map(
    lambda example: {'symptom_name': symptom_mapping[example['labels']]}
)

# Show one sample.
print(f"Teks: {train_dataset[0]['text']}, Nama Gejala: {train_dataset[0]['symptom_name']}")
36
+
37
# Tokenization step applied to the dataset
def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        examples: Mapping with a 'text' entry, as handed over by the
            ``map`` calls on the dataset splits.

    Returns:
        The output of the module-level ``tokenizer`` for those texts, with
        truncation enabled and every sequence padded to 256 tokens.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=256,
    )
    return encoded
40
+
41
# Run the tokenization over both splits (train first, then test)
train_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Sanity check: the columns should include input_ids, attention_mask and labels
print(train_dataset.column_names)
47
+
48
# Collect every label in the training split and derive the size of the
# classification head from it.
labels = train_dataset['labels']
unique_labels = set(labels)
# The head needs one output per class id up to the largest one. Counting the
# distinct ids would be too small if the ids are not contiguous from 0, and
# training would then fail on the out-of-range label.
# NOTE(review): assumes labels are non-negative integers - confirm against
# the dataset schema.
num_labels = max(unique_labels) + 1

# Load the model with the right number of labels.
model = AutoModelForSequenceClassification.from_pretrained("medicalai/ClinicalBERT", num_labels=num_labels)
55
+
56
# Training configuration.
import inspect

# Newer transformers releases call this option `eval_strategy`; older ones
# only know `evaluation_strategy`. Pick whichever the installed version
# accepts so the script runs on both.
_eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_arg: 'epoch'},  # evaluate at the end of every epoch
)
66
+
67
# Metric computation handed to the Trainer
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    Args:
        p: A ``(predictions, labels)`` pair; ``predictions`` holds per-class
            scores with the classes along axis 1, ``labels`` the true ids.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    # zero_division=0 keeps the score at 0 for classes that were never
    # predicted, without emitting an UndefinedMetricWarning for each of them.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}
74
+
75
# Assemble the Trainer from the model, arguments, splits and metric function
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
    'compute_metrics': compute_metrics,  # metrics reported on evaluation
}
trainer = Trainer(**trainer_kwargs)
83
+
84
# Kick off fine-tuning
trainer.train()

# Run the evaluation and report the resulting metrics
results = trainer.evaluate()
print(results)