"""Fine-tune Mistral-7B with LoRA on a Turkish OSCAR subset and serve it with Gradio.

The script loads the base model, attaches LoRA adapters, trains on a
10,000-document sample, uploads the output folder to the Hugging Face Hub,
and finally launches a text-generation demo UI.
"""

import os
import torch
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login, HfApi

# === 1. LOAD MODEL AND TOKENIZER ===
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Batches can only be padded when the tokenizer has a pad token; reuse the
# end-of-sequence token when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): weights are loaded in full float32; for a 7B-parameter model
# that is presumably ~28 GB -- confirm the target hardware has that much memory.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32).to(device)

# === 2. LoRA SETTINGS ===
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    # Tell PEFT this is a causal language model so it builds the matching
    # wrapper instead of the generic one.
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# === 3. DATASET ===
DATASET_PATH = "/home/user/app/oscar_tr.parquet"
SUBSET_SIZE = 10000  # number of documents kept for fine-tuning

if os.path.exists(DATASET_PATH):
    # A previous run already cached the sampled subset on disk.
    print("📂 Kaydedilmiş dataset bulundu, yükleniyor...")
    dataset = Dataset.from_parquet(DATASET_PATH)
else:
    print("🌍 Veri seti indiriliyor ve kaydediliyor...")
    stream = load_dataset(
        "oscar",
        "unshuffled_deduplicated_tr",
        split="train",
        streaming=True,
        trust_remote_code=True,
    )
    # A streaming split is an IterableDataset: it has no select() or
    # to_parquet(). Shuffle through a buffer, take the first SUBSET_SIZE
    # examples, and materialise them into a regular Dataset before saving.
    sampled = stream.shuffle(seed=42, buffer_size=SUBSET_SIZE).take(SUBSET_SIZE)
    dataset = Dataset.from_list(list(sampled))
    # Make sure the cache directory exists before the first write.
    os.makedirs(os.path.dirname(DATASET_PATH), exist_ok=True)
    dataset.to_parquet(DATASET_PATH)
# === 4. TOKENIZATION ===
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Mapping that provides the raw documents under ``"text"``.

    Returns:
        The tokenizer output, with each sequence truncated to at most
        512 tokens.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, max_length=512, truncation=True)
    return encoded

# Run the tokenizer over the dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# === 5. TRAINING SETTINGS ===
training_args = TrainingArguments(
    # Optimisation: one example per step, gradients accumulated over 16 steps.
    optim="adamw_torch",
    learning_rate=5e-4,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    # Checkpointing: save every 500 steps and keep at most two checkpoints.
    output_dir="./mistral_lora_cpu",
    save_steps=500,
    save_total_limit=2,
    # Logging: report every 10 steps.
    logging_dir="./logs",
    logging_steps=10,
)

# === 6. START TRAINING ON GPU ===
# NOTE(review): @spaces.GPU is applied without a duration argument; confirm the
# default GPU allocation is long enough for a full training epoch.
@spaces.GPU
def train_model():
    """Fine-tune the LoRA-wrapped model and save the result.

    Builds a ``Trainer`` from the module-level ``model``, ``training_args``
    and ``tokenized_datasets``, runs training, then writes the final model
    and the tokenizer into ``training_args.output_dir``.

    Returns:
        A status message once training has finished.
    """
    # Local import: the file-level transformers import does not include the
    # collator class.
    from transformers import DataCollatorForLanguageModeling

    # Padding a batch requires a pad token; fall back to EOS if none is set.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # tokenize_function produces no "labels" column, so the model would not
    # return a loss. With mlm=False the collator pads each batch and derives
    # the labels from input_ids for causal language modelling.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets,
        data_collator=collator,
    )
    trainer.train()

    # Persist the final weights; otherwise the output directory only holds
    # whatever intermediate checkpoints save_steps happened to produce.
    trainer.save_model(training_args.output_dir)
    tokenizer.save_pretrained(training_args.output_dir)
    return "✅ Model Eğitimi Tamamlandı!"

# === 7. UPLOAD THE MODEL TO THE HUGGING FACE HUB ===
def upload_model(repo_id="kullanici_adin/mistral-lora-modeli", folder_path="./mistral_lora_cpu"):
    """Upload the training output folder to a model repository on the Hub.

    Args:
        repo_id: Target repository in ``"user/name"`` form.
            NOTE(review): the default looks like a placeholder user name --
            confirm or override before running.
        folder_path: Local directory whose contents are uploaded.

    Returns:
        A status message once the upload has completed.
    """
    # notebook_login() renders a Jupyter widget and cannot be used from a
    # plain script. Read the token from the environment instead; when it is
    # unset, HfApi falls back to locally saved credentials.
    api = HfApi(token=os.environ.get("HF_TOKEN"))
    # Create the repository on first use; a no-op when it already exists.
    api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
    api.upload_folder(
        folder_path=folder_path,
        repo_id=repo_id,
        repo_type="model",
    )
    return "✅ Model Hugging Face Hub'a Yüklendi!"

# === 8. GRADIO INTERFACE ===
# Request a GPU for inference the same way train_model does.
@spaces.GPU
def generate_text(prompt):
    """Generate a continuation of ``prompt`` with the fine-tuned model.

    Args:
        prompt: Text typed by the user.

    Returns:
        The decoded output (prompt plus up to 100 newly generated tokens),
        or an empty string when the prompt is empty or whitespace only.
    """
    # Nothing to continue: skip the expensive model call.
    if not prompt or not prompt.strip():
        return ""

    # Training leaves the model in train mode; switch to eval so LoRA
    # dropout is disabled while generating.
    model.eval()

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # max_new_tokens bounds only the generated part; max_length would also
    # count the prompt, leaving nothing to generate for long prompts.
    output = model.generate(
        **inputs,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Web UI: a two-line text box feeding generate_text, with plain-text output.
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Buraya bir şeyler yaz..."),
    outputs="text",
    # live=True would rerun generation on every input change, which is far
    # too expensive for a 7B model; require an explicit submit instead.
    live=False,
)

# === 9. ENTRY POINT ===
if __name__ == "__main__":
    # Train first, then publish, then serve. Both steps return a status
    # message; print it so progress is visible in the logs.
    print(train_model())
    print(upload_model())
    iface.launch()