kasim90 committed on
Commit
c139ec7
·
verified ·
1 Parent(s): 536a6d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -55
app.py CHANGED
@@ -1,19 +1,13 @@
1
- import os
2
  import torch
3
- import gradio as gr
4
  import spaces
5
  from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
6
  from peft import LoraConfig, get_peft_model
7
- from datasets import load_dataset, Dataset
8
- from huggingface_hub import notebook_login, HfApi
9
 
10
  # === 1️⃣ MODEL VE TOKENIZER YÜKLEME ===
11
  MODEL_NAME = "mistralai/Mistral-7B-v0.1"
12
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
13
 
14
- device = "cuda" if torch.cuda.is_available() else "cpu"
15
- model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32).to(device)
16
-
17
  # === 2️⃣ LoRA AYARLARI ===
18
  lora_config = LoraConfig(
19
  r=8,
@@ -22,35 +16,19 @@ lora_config = LoraConfig(
22
  bias="none",
23
  target_modules=["q_proj", "v_proj"],
24
  )
25
- model = get_peft_model(model, lora_config)
26
 
27
  # === 3️⃣ VERİ SETİ ===
28
- DATASET_PATH = "/home/user/app/oscar_tr.parquet"
29
-
30
- from datasets import Dataset
31
-
32
- DATASET_PATH = "/home/user/app/oscar_tr.parquet"
33
 
34
- if os.path.exists(DATASET_PATH):
35
- print("📂 Kaydedilmiş dataset bulundu, yükleniyor...")
36
- dataset = Dataset.from_parquet(DATASET_PATH)
37
- else:
38
- print("🌍 Veri seti indiriliyor ve kaydediliyor...")
39
- raw_dataset = load_dataset("oscar", "unshuffled_deduplicated_tr", split="train", streaming=True, trust_remote_code=True)
40
- dataset_list = list(raw_dataset.take(10000)) # İlk 10.000 veriyi listeye al
41
- dataset = Dataset.from_list(dataset_list) # Listeyi Dataset formatına çevir
42
- dataset.to_parquet(DATASET_PATH) # İlk çalışmada kaydet
43
-
44
-
45
- # === 4️⃣ TOKENLEŞTİRME ===
46
  def tokenize_function(examples):
47
  return tokenizer(examples["text"], truncation=True, max_length=512)
48
 
49
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
50
 
51
- # === 5️⃣ EĞİTİM AYARLARI ===
52
  training_args = TrainingArguments(
53
- output_dir="./mistral_lora_cpu",
54
  per_device_train_batch_size=1,
55
  gradient_accumulation_steps=16,
56
  learning_rate=5e-4,
@@ -62,9 +40,13 @@ training_args = TrainingArguments(
62
  optim="adamw_torch",
63
  )
64
 
65
- # === 6️⃣ GPU İLE EĞİTİM BAŞLATMA ===
66
  @spaces.GPU
67
  def train_model():
 
 
 
 
68
  trainer = Trainer(
69
  model=model,
70
  args=training_args,
@@ -73,32 +55,6 @@ def train_model():
73
  trainer.train()
74
  return "✅ Model Eğitimi Tamamlandı!"
75
 
76
- # === 7️⃣ MODELİ HUGGING FACE HUB'A YÜKLEME ===
77
- def upload_model():
78
- notebook_login() # Hugging Face hesabına giriş yap
79
- api = HfApi()
80
- api.upload_folder(
81
- folder_path="./mistral_lora_cpu",
82
- repo_id="kullanici_adin/mistral-lora-modeli",
83
- repo_type="model",
84
- )
85
- return "✅ Model Hugging Face Hub'a Yüklendi!"
86
-
87
- # === 8️⃣ GRADIO ARAYÜZÜ ===
88
- def generate_text(prompt):
89
- inputs = tokenizer(prompt, return_tensors="pt").to(device)
90
- output = model.generate(**inputs, max_length=100)
91
- return tokenizer.decode(output[0], skip_special_tokens=True)
92
-
93
- iface = gr.Interface(
94
- fn=generate_text,
95
- inputs=gr.Textbox(lines=2, placeholder="Buraya bir şeyler yaz..."),
96
- outputs="text",
97
- live=True
98
- )
99
-
100
- # === 9️⃣ BAŞLATMA ===
101
  if __name__ == "__main__":
102
  train_model() # Eğitimi başlat
103
- upload_model() # Modeli Hugging Face Hub'a yükle
104
- iface.launch() # Gradio UI başlat
 
 
1
  import torch
 
2
  import spaces
3
  from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
4
  from peft import LoraConfig, get_peft_model
5
+ from datasets import load_dataset
 
6
 
7
  # === 1️⃣ MODEL VE TOKENIZER YÜKLEME ===
8
  MODEL_NAME = "mistralai/Mistral-7B-v0.1"
9
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
10
 
 
 
 
11
  # === 2️⃣ LoRA AYARLARI ===
12
  lora_config = LoraConfig(
13
  r=8,
 
16
  bias="none",
17
  target_modules=["q_proj", "v_proj"],
18
  )
 
19
 
20
  # === 3️⃣ VERİ SETİ ===
21
+ dataset = load_dataset("oscar", "unshuffled_deduplicated_tr", split="train", streaming=True, trust_remote_code=True)
22
+ dataset = dataset.shuffle(seed=42).take(10000)
 
 
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def tokenize_function(examples):
25
  return tokenizer(examples["text"], truncation=True, max_length=512)
26
 
27
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
28
 
29
+ # === 4️⃣ EĞİTİM AYARLARI ===
30
  training_args = TrainingArguments(
31
+ output_dir="./mistral_lora",
32
  per_device_train_batch_size=1,
33
  gradient_accumulation_steps=16,
34
  learning_rate=5e-4,
 
40
  optim="adamw_torch",
41
  )
42
 
43
+ # === 5️⃣ GPU BAŞLATMA VE EĞİTİM ===
44
  @spaces.GPU
45
  def train_model():
46
+ device = "cuda" if torch.cuda.is_available() else "cpu" # CUDA'yı sadece burada başlat!
47
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32).to(device)
48
+ model = get_peft_model(model, lora_config)
49
+
50
  trainer = Trainer(
51
  model=model,
52
  args=training_args,
 
55
  trainer.train()
56
  return "✅ Model Eğitimi Tamamlandı!"
57
 
58
+ # === 6️⃣ BAŞLATMA ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  if __name__ == "__main__":
60
  train_model() # Eğitimi başlat