Update app.py
app.py CHANGED
@@ -12,7 +12,7 @@ if not HF_TOKEN:
 login(HF_TOKEN)
 
 # === Settings ===
-BASE_MODEL = "Sakalti/template-4"
+BASE_MODEL = "Sakalti/template-4"
 HF_REPO = "Sakalti/template-16"
 
 # === Data loading ===
@@ -22,9 +22,9 @@ dataset = load_dataset("Verah/JParaCrawl-Filtered-English-Japanese-Parallel-Corp
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
 model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
 
-# ===
+# === Tokenization function (fixed version) ===
 def preprocess(examples):
-    texts = [
+    texts = [en + " " + ja for en, ja in zip(examples["en"], examples["ja"])]
     tokenized = tokenizer(texts, max_length=256, truncation=True)
     tokenized["labels"] = tokenized["input_ids"].copy()
     return tokenized
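The fixed preprocess joins each English sentence with its Japanese counterpart into a single training string and copies input_ids into labels, the standard target setup for causal-LM fine-tuning. Below is a minimal, self-contained sketch of how such a function is typically applied with datasets' batched map; the en/ja column names and the preprocess body come from the diff, while the tiny in-memory stand-in dataset and the remove_columns choice are illustrative assumptions (the actual script loads the Verah/JParaCrawl parallel corpus).

from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Sakalti/template-4")

def preprocess(examples):
    # Join each English/Japanese pair into one training string.
    texts = [en + " " + ja for en, ja in zip(examples["en"], examples["ja"])]
    tokenized = tokenizer(texts, max_length=256, truncation=True)
    # For causal-LM training, the labels are the inputs themselves.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tiny stand-in for the parallel corpus, just to show the call pattern
# (assumption: the real script maps over the loaded JParaCrawl dataset).
demo = Dataset.from_dict({
    "en": ["Hello.", "Good morning."],
    "ja": ["こんにちは。", "おはようございます。"],
})
# batched=True is required because preprocess zips over lists of examples.
tokenized = demo.map(preprocess, batched=True, remove_columns=demo.column_names)
print(tokenized[0]["input_ids"][:10], tokenized[0]["labels"][:10])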