Gamoooo
/

llm-jp-3-13b-last

Safetensors

Model card · Files and versions · Community

Gamoooo committed on Dec 24, 2024

Commit

0161dc3

verified ·

1 Parent(s): b07fbc1

Update README.md

Browse files

Files changed (1) hide show

README.md +43 -203

README.md CHANGED Viewed

@@ -6,7 +6,6 @@ tags:
 - unsloth
 - llama
 - trl
-license: apache-2.0
 language:
 - en
 - ja
@@ -24,27 +23,27 @@ This llama model was trained 2x faster with [Unsloth](https://github.com/unsloth
 # 推論用コード
 # Hugging Faceにアップロードしたモデルを用いてELYZA-tasks-100-TVの出力を得るためのコードです。 このコードで生成されたjsonlファイルは課題の成果として提出可能なフォーマットになっております。
-!pip uninstall unsloth -y
-!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
-!pip install --upgrade torch
-!pip install --upgrade xformers
-# Install Flash Attention 2 for softcapping support
-import torch
-if torch.cuda.get_device_capability()[0] >= 8:
-    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from unsloth import FastLanguageModel
 import torch
-max_seq_length = 512
-dtype = None
-load_in_4bit = True
 model_id = "llm-jp/llm-jp-3-13b"
-new_model_id = "llm-jp-3-13b-last"
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_id,
@@ -53,203 +52,44 @@ model, tokenizer = FastLanguageModel.from_pretrained(
     trust_remote_code=True,
 )
-# SFT用のモデルを用意
-model = FastLanguageModel.get_peft_model(
-    model,
-    r=32,
-    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
-                    "gate_proj", "up_proj", "down_proj"],
-    lora_alpha=32,
-    lora_dropout=0.05,
-    bias="none",
-    use_gradient_checkpointing="unsloth",
-    random_state=3407,
-    use_rslora=False,
-    loftq_config=None,
-    max_seq_length=max_seq_length,
-)
-# https://huggingface.co/settings/tokens
-HF_TOKEN = "your-token"  # @param {type:"string"}
-from datasets import load_dataset, concatenate_datasets
-# データセットのロード
-ichikara_dataset = load_dataset("json", data_files="/content/ichikara-instruction-003-001-1.json")
-elyza_dataset = load_dataset("elyza/ELYZA-tasks-100")
-EOS_TOKEN = tokenizer.eos_token  #
-# 学習時のプロンプトフォーマットの定義
-prompt = """### 指示
-{}
-### 回答
-{}"""
-"""
-formatting_prompts_func: 各データをプロンプトに合わせた形式に合わせる
-"""
-def formatting_prompts_func(examples):
-    input = examples["text"]
-    output = examples["output"]
-    text = prompt.format(input, output) + EOS_TOKEN
-    return {"formatted_text": text}
-# ichikara-instruction のデータフォーマット
-ichikara_dataset = ichikara_dataset.map(
-    formatting_prompts_func,
-    num_proc=4,
-)
-# ELYZA-tasks-100 データセットのフォーマット関数
-def elyza_formatting_prompts_func(examples):
-    input = examples["input"]
-    output = examples["output"]
-    text = prompt.format(input, output) + EOS_TOKEN
-    return {"formatted_text": text}
-# ELYZA-tasks-100 のデータフォーマット
-elyza_dataset = elyza_dataset.map(
-    elyza_formatting_prompts_func,
-    num_proc=4
-)
-from datasets import concatenate_datasets
-# ichikara-instruction と ELYZA-tasks-100 を統合
-combined_dataset = concatenate_datasets([
-    ichikara_dataset["train"],
-    elyza_dataset["test"]
-])
-# データ品質チェック
-# 1. ランダムサンプルを確認
-import random
-sample_indices = random.sample(range(len(combined_dataset)), 10)
-for idx in sample_indices:
-    print(combined_dataset[idx]["formatted_text"])
-# 2. 自動検査ルール
-# 短すぎるデータをチェック（Noneチェックを追加）
-short_data = combined_dataset.filter(
-    lambda x: x["input"] is not None and x["output"] is not None and (len(x["input"]) < 5 or len(x["output"]) < 5)
-)
-print(f"\n短すぎるデータ数: {len(short_data)}")
-# 指示と回答が同一のデータ（Noneチェックを追加）
-duplicate_data = combined_dataset.filter(
-    lambda x: x["input"] is not None and x["output"] is not None and x["input"].strip() == x["output"].strip()
-)
-print(f"\n指示と回答が同一のデータ数: {len(duplicate_data)}")
-# 問題のあるデータをフィルタリング（Noneチェックを追加）
-filtered_dataset = combined_dataset.filter(
-    lambda x: x["input"] is not None and x["output"] is not None and len(x["input"]) > 5 and len(x["output"]) > 5 and x["input"].strip() != x["output"].strip()
-)
-print(f"元のデータ数: {len(combined_dataset)}")
-print(f"フィルタリング後のデータ数: {len(filtered_dataset)}")
-print(f"除外されたデータ数: {len(combined_dataset) - len(filtered_dataset)}")
-# フィルタリング後のデータの例を確認
-print(filtered_dataset[0])
-"""
-training_arguments: 学習の設定
-"""
-from trl import SFTTrainer
-from transformers import TrainingArguments
-from unsloth import is_bfloat16_supported
-trainer = SFTTrainer(
-    model=model,
-    tokenizer=tokenizer,
-    train_dataset=filtered_dataset,
-    max_seq_length=max_seq_length,
-    dataset_text_field="formatted_text",
-    packing=False,
-    args=TrainingArguments(
-        per_device_train_batch_size=2,
-        gradient_accumulation_steps=4,
-        num_train_epochs=3,
-        logging_steps=10,
-        warmup_steps=10,
-        save_steps=50,
-        save_total_limit=2,
-        max_steps=200,
-        learning_rate=2e-4,
-        fp16=not is_bfloat16_supported(),
-        bf16=is_bfloat16_supported(),
-        group_by_length=True,
-        seed=3407,
-        output_dir="outputs",
-        report_to="none",
-    ),
-)
-#@title 学習実行
-trainer_stats = trainer.train()
-import json
-from datasets import load_dataset
-dataset = load_dataset("json", data_files="/content/elyza-tasks-100-TV_0.jsonl", split="train")
-datasets = []
-with open("/content/elyza-tasks-100-TV_0.jsonl", "r", encoding="utf-8") as f:
     item = ""
     for line in f:
-        line = line.strip()
-        item += line
-        if item.endswith("}"):
-            datasets.append(json.loads(item))
-            item = ""
-from tqdm import tqdm
-import json
-# 推論するためにモデルのモードを変更
 FastLanguageModel.for_inference(model)
 results = []
 for dt in tqdm(datasets):
-    try:
-        input_text = dt["input"]
-        # プロンプトを生成
-        prompt = f"### 指示\n{input_text}\n次の要件を満たしてください：\n1. 簡潔に回答する。\n2. 必要なら箇条書きを使用して要点を整理する。\n3. 指示された内容に忠実に答える。\n### 回答\n"
-        # トークナイズ
-        inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
-        # 推論
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=512,
-            use_cache=True,
-            do_sample=False,
-            repetition_penalty=1.2,
-        )
-        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).split('\n### 回答')[-1]
-        # 結果を保存
-        results.append({"task_id": dt["task_id"], "input": input_text, "output": prediction})
-    except Exception as e:
-        print(f"Error processing task_id {dt.get('task_id', 'Unknown')}: {e}")
-        results.append({"task_id": dt.get("task_id", "Unknown"), "input": dt.get("input", ""), "output": "Error"})
-# 結果をJSONL形式で保存
-output_file_jsonl = "/content/llm-jp-3-13b-last.jsonl"
-with open(output_file_jsonl, "w", encoding="utf-8") as f:
     for result in results:
-        f.write(json.dumps(result, ensure_ascii=False) + "\n")
-model.push_to_hub_merged(
-    new_model_id,
-    tokenizer=tokenizer,
-    save_method="lora",
-    token=HF_TOKEN,
-    private=True
-)

 - unsloth
 - llama
 - trl
 language:
 - en
 - ja
 # 推論用コード
 # Hugging Faceにアップロードしたモデルを用いてELYZA-tasks-100-TVの出力を得るためのコードです。 このコードで生成されたjsonlファイルは課題の成果として提出可能なフォーマットになっております。
+# セットアップ
+!pip install unsloth
+!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
+!pip install -U torch
+!pip install -U peft
+# モデル・トークナイザの読み込み
 from unsloth import FastLanguageModel
+from peft import PeftModel
 import torch
+import json
+from tqdm import tqdm
+import re
 model_id = "llm-jp/llm-jp-3-13b"
+adapter_id = "Gamoooo/llm-jp-3-13b-last"
+HF_TOKEN = "your-token" #@param {type:"string"}
+dtype = None
+load_in_4bit = True
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_id,
     trust_remote_code=True,
 )
+model = PeftModel.from_pretrained(model, adapter_id, token = HF_TOKEN)
+# 入力データの準備
+datasets = []
+with open("./elyza-tasks-100-TV_0.jsonl", "r") as f:
     item = ""
     for line in f:
+      line = line.strip()
+      item += line
+      if item.endswith("}"):
+        datasets.append(json.loads(item))
+        item = ""
+# 推論実行
 FastLanguageModel.for_inference(model)
 results = []
 for dt in tqdm(datasets):
+    input = dt["input"]
+    prompt = f"### 指示\n{input}\n次の要件を満たしてください：\n1. 簡潔に回答する。\n2. 必要なら箇条書きを使用して要点を整理する。\n3. 指示された内容に忠実に答える。\n### 回答\n"
+    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
+    outputs = model.generate(
+        **inputs,
+        max_new_tokens=512,
+        use_cache=True,
+        do_sample=False,
+        repetition_penalty=1.2,
+    )
+    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).split('\n### 回答')[-1]
+    results.append({"task_id": dt["task_id"], "input": input, "output": prediction})
+# 出力の保存
+json_file_id = re.sub(".*/", "", adapter_id)
+with open(f"/content/{json_file_id}_output.jsonl", 'w', encoding='utf-8') as f:
     for result in results:
+        json.dump(result, f, ensure_ascii=False)
+        f.write('\n')