kasim90 committed
Commit 9cb9d28 · verified · 1 Parent(s): b94fa53

Update app.py

Files changed (1):
  1. app.py +33 -42
app.py CHANGED
@@ -2,78 +2,69 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
 from peft import LoraConfig, get_peft_model
 from datasets import load_dataset
-import spaces
 import gradio as gr
+import os
 
 # === 1️⃣ MODEL AND TOKENIZER LOADING ===
-MODEL_NAME = "mistralai/Mistral-7B-v0.1"  # Hugging Face model name
+MODEL_NAME = "mistralai/Mistral-7B-v0.1"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
 # === 2️⃣ CPU OPTIMIZATION ===
-zero = torch.Tensor([0]).cuda()
-print(zero.device)  # <-- 'cpu' 🤔
-
-print(zero.device)  # <-- 'cuda:0' 🤗
-
-print(zero.device)  # <-- 'cuda:0' 🤗
-
-torch_dtype = torch.float32  # chose float32 because bf16 is usually not available on CPU
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch_dtype)
+torch_dtype = torch.float32  # float32 is the best choice for CPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch_dtype).to(device)
 
 # === 3️⃣ LoRA SETTINGS ===
 lora_config = LoraConfig(
     r=8,
     lora_alpha=32,
     lora_dropout=0.1,
     bias="none",
     target_modules=["q_proj", "v_proj"],
 )
 model = get_peft_model(model, lora_config)
 
-# === 4️⃣ DATASET ===
-dataset = load_dataset("oscar", "unshuffled_deduplicated_tr", trust_remote_code=True)  # 🔥 FIX: trust_remote_code=True
-train_data = dataset["train"].shuffle(seed=42).select(range(10000))  # small subset
-
-# === 5️⃣ TOKENIZATION FUNCTION ===
+# === 4️⃣ DATASET (OPTIMIZED) ===
+DATASET_PATH = "oscar_tr.parquet"
+
+if os.path.exists(DATASET_PATH):
+    print("📂 Saved dataset found, loading...")
+    from datasets import Dataset
+    dataset = Dataset.from_parquet(DATASET_PATH)
+else:
+    print("🌍 Downloading and saving the dataset...")
+    dataset = load_dataset("oscar", "unshuffled_deduplicated_tr", split="train")
+    dataset = dataset.shuffle(seed=42).select(range(10000))  # limit to a 10K-example subset
+    dataset.to_parquet(DATASET_PATH)  # cache the data on the first run
+
+# === 5️⃣ TOKENIZATION (OPTIMIZED) ===
 def tokenize_function(examples):
     return tokenizer(examples["text"], truncation=True, max_length=512)
 
-tokenized_datasets = train_data.map(tokenize_function, batched=True)
+tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4)  # 🔥 parallel processing
 
 # === 6️⃣ TRAINING SETTINGS ===
 training_args = TrainingArguments(
     output_dir="./mistral_lora_cpu",
     per_device_train_batch_size=1,
     gradient_accumulation_steps=16,
     learning_rate=5e-4,
     num_train_epochs=1,
     save_steps=500,
     save_total_limit=2,
     logging_dir="./logs",
     logging_steps=10,
-    optim="adamw_torch",  # 🔥 FIX: adamw_torch instead of bitsandbytes
+    optim="adamw_torch",
+    dataloader_pin_memory=True,  # 🔥 GPU memory optimization
 )
 
 # === 7️⃣ MODEL TRAINING ===
-@spaces.GPU
 def trainf():
-    v = Trainer(
+    trainer = Trainer(
         model=model,
         args=training_args,
         train_dataset=tokenized_datasets,
     )
-    return v
+    trainer.train()
 
-trainf().train()
+trainf()
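
The main functional change in this commit is the parquet cache for the OSCAR subset: the data is downloaded once, written to oscar_tr.parquet, and read back from disk on every later run. Below is a minimal, self-contained sketch of that load-or-build pattern using a tiny in-memory dummy dataset instead of OSCAR; the file name demo_cache.parquet and the helper load_or_build_dataset are illustrative only and are not part of app.py.

import os
from datasets import Dataset

CACHE_PATH = "demo_cache.parquet"  # illustrative path, not the one used in app.py

def load_or_build_dataset() -> Dataset:
    # Later runs: the cached parquet file already exists, so read it from disk.
    if os.path.exists(CACHE_PATH):
        print("📂 Saved dataset found, loading...")
        return Dataset.from_parquet(CACHE_PATH)
    # First run: build (or download) the dataset, then persist it as parquet.
    print("🌍 Building and saving the dataset...")
    ds = Dataset.from_dict({"text": ["merhaba dünya", "hello world", "lorem ipsum"]})
    ds.to_parquet(CACHE_PATH)
    return ds

first = load_or_build_dataset()   # builds and writes demo_cache.parquet
second = load_or_build_dataset()  # reads it back without rebuilding
assert first["text"] == second["text"]

If the caching in app.py works as intended, only the first start of the Space should hit the Hub for OSCAR; later restarts should load the 10K-row subset from the local parquet file.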