kasim90 committed on
Commit
7489b32
·
verified ·
1 Parent(s): c89668a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -21
app.py CHANGED
@@ -1,20 +1,20 @@
 
1
  import torch
 
 
2
  from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
3
  from peft import LoraConfig, get_peft_model
4
- from datasets import load_dataset
5
- import gradio as gr
6
- import os
7
 
8
  # === 1️⃣ MODEL VE TOKENIZER YÜKLEME ===
9
  MODEL_NAME = "mistralai/Mistral-7B-v0.1"
10
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
11
 
12
- # === 2️⃣ CPU/GPU OPTİMİZASYONU ===
13
- torch_dtype = torch.float32 # CPU için en iyi seçenek
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
- model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch_dtype).to(device)
16
 
17
- # === 3️⃣ LoRA AYARLARI ===
18
  lora_config = LoraConfig(
19
  r=8,
20
  lora_alpha=32,
@@ -24,26 +24,25 @@ lora_config = LoraConfig(
24
  )
25
  model = get_peft_model(model, lora_config)
26
 
27
- # === 4️⃣ VERİ SETİ YÜKLEME VE CACHE (OPTİMİZE) ===
28
- DATASET_PATH = "oscar_tr.parquet"
29
 
30
  if os.path.exists(DATASET_PATH):
31
- print("📂 Kaydedilmiş veri seti bulundu, yükleniyor...")
32
- from datasets import Dataset
33
  dataset = Dataset.from_parquet(DATASET_PATH)
34
  else:
35
  print("🌍 Veri seti indiriliyor ve kaydediliyor...")
36
- dataset = load_dataset("oscar", "unshuffled_deduplicated_tr", split="train", trust_remote_code=True)
37
- dataset = dataset.shuffle(seed=42).select(range(10000)) # Küçük subset alıyoruz
38
- dataset.to_parquet(DATASET_PATH) # İlk çalışmada veriyi kaydediyoruz
39
 
40
- # === 5️⃣ TOKENLEŞTİRME (OPTİMİZE) ===
41
  def tokenize_function(examples):
42
  return tokenizer(examples["text"], truncation=True, max_length=512)
43
 
44
- tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4) # 🔥 Paralel işlem
45
 
46
- # === 6️⃣ EĞİTİM AYARLARI ===
47
  training_args = TrainingArguments(
48
  output_dir="./mistral_lora_cpu",
49
  per_device_train_batch_size=1,
@@ -55,16 +54,45 @@ training_args = TrainingArguments(
55
  logging_dir="./logs",
56
  logging_steps=10,
57
  optim="adamw_torch",
58
- dataloader_pin_memory=True, # 🔥 GPU bellek optimizasyonu
59
  )
60
 
61
- # === 7️⃣ MODEL EĞİTİMİ ===
62
- def trainf():
 
63
  trainer = Trainer(
64
  model=model,
65
  args=training_args,
66
  train_dataset=tokenized_datasets,
67
  )
68
  trainer.train()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- trainf()
 
 
 
 
 
1
+ import os
2
  import torch
3
+ import gradio as gr
4
+ import spaces
5
  from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
6
  from peft import LoraConfig, get_peft_model
7
+ from datasets import load_dataset, Dataset
8
+ from huggingface_hub import notebook_login, HfApi
 
9
 
10
  # === 1️⃣ MODEL VE TOKENIZER YÜKLEME ===
11
  MODEL_NAME = "mistralai/Mistral-7B-v0.1"
12
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
13
 
 
 
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32).to(device)
16
 
17
+ # === 2️⃣ LoRA AYARLARI ===
18
  lora_config = LoraConfig(
19
  r=8,
20
  lora_alpha=32,
 
24
  )
25
  model = get_peft_model(model, lora_config)
26
 
27
# === 3️⃣ DATASET ===
# Location of the cached 10,000-row subset; it is reused on later runs.
DATASET_PATH = "/home/user/app/oscar_tr.parquet"

if os.path.exists(DATASET_PATH):
    print("📂 Kaydedilmiş dataset bulundu, yükleniyor...")
    dataset = Dataset.from_parquet(DATASET_PATH)
else:
    print("🌍 Veri seti indiriliyor ve kaydediliyor...")
    streamed = load_dataset(
        "oscar",
        "unshuffled_deduplicated_tr",
        split="train",
        streaming=True,
        trust_remote_code=True,
    )
    # With streaming=True the result is an IterableDataset, which cannot be
    # indexed with .select(). Draw the subset with .take() and materialise it
    # into a regular Dataset so it can be cached and mapped eagerly.
    subset = streamed.shuffle(seed=42).take(10000)
    dataset = Dataset.from_list(list(subset))
    dataset.to_parquet(DATASET_PATH)  # Cache the subset on the first run.
38
 
39
# === 4️⃣ TOKENIZATION ===
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch, truncated to 512 tokens.

    Args:
        examples: A batch mapping that provides a ``"text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=512)


# Apply the tokenizer to the dataset in batches.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
44
 
45
+ # === 5️⃣ EĞİTİM AYARLARI ===
46
  training_args = TrainingArguments(
47
  output_dir="./mistral_lora_cpu",
48
  per_device_train_batch_size=1,
 
54
  logging_dir="./logs",
55
  logging_steps=10,
56
  optim="adamw_torch",
 
57
  )
58
 
59
# === 6️⃣ START TRAINING ON GPU ===
@spaces.GPU
def train_model():
    """Fine-tune the module-level ``model`` on ``tokenized_datasets``.

    Builds a ``Trainer`` from the module-level ``model``, ``training_args``
    and ``tokenized_datasets``, runs training, and saves the result into
    ``training_args.output_dir``.

    Returns:
        str: A status message reporting that training has finished.
    """
    # Imported locally because the file's transformers import does not
    # bring this name into scope.
    from transformers import DataCollatorForLanguageModeling

    # The collator pads each batch, which requires a pad token to be set.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # mlm=False selects causal-LM collation: labels are derived from
    # input_ids, so the model can return a loss for the Trainer to optimise.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets,
        data_collator=collator,
    )
    trainer.train()
    # Write the final weights to the output directory explicitly rather than
    # relying on intermediate checkpoints.
    trainer.save_model(training_args.output_dir)
    return "✅ Model Eğitimi Tamamlandı!"
69
+
70
# === 7️⃣ UPLOAD THE MODEL TO THE HUGGING FACE HUB ===
def upload_model(
    folder_path="./mistral_lora_cpu",
    repo_id="kullanici_adin/mistral-lora-modeli",
):
    """Upload a local folder to a model repository on the Hugging Face Hub.

    Args:
        folder_path: Local directory whose contents are uploaded.
        repo_id: Target repository in ``"<namespace>/<name>"`` form.
            NOTE(review): the default looks like a placeholder
            ("kullanici_adin" is Turkish for "your username") - confirm the
            real repository before relying on it.

    Returns:
        str: A status message reporting that the upload has finished.
    """
    # notebook_login() only works inside a notebook front end, so read the
    # access token from the environment. When HF_TOKEN is unset, HfApi falls
    # back to any locally stored credentials.
    token = os.environ.get("HF_TOKEN")
    api = HfApi(token=token)

    # Create the repository if needed so the upload does not fail on a
    # missing repo; exist_ok makes this a no-op when it already exists.
    api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
    api.upload_folder(
        folder_path=folder_path,
        repo_id=repo_id,
        repo_type="model",
    )
    return "✅ Model Hugging Face Hub'a Yüklendi!"
80
+
81
# === 8️⃣ GRADIO INTERFACE ===
def generate_text(prompt):
    """Generate a continuation of *prompt* with the module-level model.

    Args:
        prompt: Text typed by the user. ``None`` and blank strings are
            accepted and produce an empty result.

    Returns:
        str: The decoded output sequence with special tokens removed, or
        ``""`` when there is no prompt to continue.
    """
    # Skip tokenization and generation when there is nothing to continue.
    if prompt is None or not prompt.strip():
        return ""

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # max_new_tokens limits only the generated part, so a long prompt cannot
    # use up the whole length budget the way max_length allows.
    output = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(output[0], skip_special_tokens=True)
86
+
87
# Two-line prompt box shown to the user.
_prompt_box = gr.Textbox(lines=2, placeholder="Buraya bir şeyler yaz...")

# Text-in / text-out UI around generate_text.
# NOTE(review): live=True presumably re-runs generate_text whenever the input
# changes - confirm that is intended for a model of this size.
iface = gr.Interface(fn=generate_text, inputs=_prompt_box, outputs="text", live=True)
93
 
94
# === 9️⃣ STARTUP ===
if __name__ == "__main__":
    # Run the stages in order: train, upload to the Hub, then start the
    # Gradio UI. The status strings returned by the first two are discarded.
    for stage in (train_model, upload_model, iface.launch):
        stage()