Sakalti committed
Commit dae2dae · verified · 1 Parent(s): b7e4c73

Update app.py

Files changed (1):
app.py +3 -3
app.py CHANGED
@@ -12,7 +12,7 @@ if not HF_TOKEN:
 login(HF_TOKEN)
 
 # === Settings ===
-BASE_MODEL = "Sakalti/template-4"  # name of the model to fix
+BASE_MODEL = "Sakalti/template-4"
 HF_REPO = "Sakalti/template-16"
 
 # === Data loading ===
@@ -22,9 +22,9 @@ dataset = load_dataset("Verah/JParaCrawl-Filtered-English-Japanese-Parallel-Corp
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
 model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
 
-# === Ultra-simple tokenization function ===
+# === Fixed tokenization function ===
 def preprocess(examples):
-    texts = [ex["en"] + " " + ex["ja"] for ex in examples]
+    texts = [en + " " + ja for en, ja in zip(examples["en"], examples["ja"])]
     tokenized = tokenizer(texts, max_length=256, truncation=True)
     tokenized["labels"] = tokenized["input_ids"].copy()
     return tokenized
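
Note on the second hunk: with Hugging Face datasets' batched map, preprocess receives a dict of columns (lists), not a list of row dicts, so the old loop "for ex in examples" iterated over the column names "en" and "ja" (plain strings) and ex["en"] raised a TypeError; zipping examples["en"] with examples["ja"] is the idiomatic fix. A minimal runnable sketch of the fixed function under that assumption (the dataset.map call itself is not part of this diff, so the toy batch below only mimics the input shape batched=True would pass):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Sakalti/template-4")  # base model from the diff

def preprocess(examples):
    # datasets.map(..., batched=True) passes a dict of columns,
    # e.g. {"en": [...], "ja": [...]}, not a list of row dicts.
    texts = [en + " " + ja for en, ja in zip(examples["en"], examples["ja"])]
    tokenized = tokenizer(texts, max_length=256, truncation=True)
    # Causal-LM objective: labels are a copy of the input ids.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Toy batch in the same shape datasets.map(batched=True) would pass:
batch = {"en": ["Hello.", "Good morning."], "ja": ["こんにちは。", "おはよう。"]}
out = preprocess(batch)
print(len(out["input_ids"]), len(out["labels"]))  # -> 2 2

In the full script this function would presumably be applied with dataset.map(preprocess, batched=True); that call is outside this diff, so it is an assumption here.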