Uploaded model

  • Developed by: kmd2525
  • License: apache-2.0
  • Finetuned from model: llm-jp/llm-jp-3-13b

This Llama-based model was trained 2x faster with Unsloth and Hugging Face's TRL library.

Usage

Execute the following code in Google Colab on an L4 GPU.

  • System configuration (a quick way to verify the runtime is shown below):

    • System RAM: 53.0 GB
    • GPU RAM: 22.5 GB
    • Disk: 235.7 GB
  • The full run takes about 35 minutes.
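Before running the notebook, you can optionally confirm that the runtime actually provides an L4 GPU and comparable memory. These are standard Colab shell commands and are not part of the original notebook.

# Optional runtime check
!nvidia-smi --query-gpu=name,memory.total --format=csv   # should report an NVIDIA L4 with roughly 23 GB
!free -h                                                 # system RAM
!df -h /content                                          # available disk space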

# Install libraries (the second command reinstalls the latest Unsloth directly from GitHub)
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Import libraries
import os
import json
import re
from datetime import datetime, timedelta, timezone
import torch
from tqdm.notebook import tqdm
from datasets import load_dataset
from unsloth import FastLanguageModel
from google.colab import userdata

# Hugging Face authentication
# Note: obtain an access token from Hugging Face and register it as a Colab secret named HF_TOKEN beforehand.
# On the first run a permission dialog appears; grant notebook access to the secret via the GUI.
HF_TOKEN = userdata.get("HF_TOKEN")
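# Optional sanity check (not in the original notebook): fail early with a clear message
# if the Colab secret is missing, rather than hitting an authentication error later.
assert HF_TOKEN, "HF_TOKEN secret not found; add it in Colab's 'Secrets' panel and allow notebook access."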


# Load the model
MODEL_NAME = "kmd2525/llm-jp-3-13b-it-r64-ichikara-fix"  # the fine-tuned LoRA model (Hugging Face model ID)
MAX_SEQ_LENGTH = 1024
DTYPE = None  # None selects the dtype automatically
LOAD_IN_4BIT = True  # True because we are loading a 13B model

os.environ["llm_int8_enable_fp32_cpu_offload"] = "True"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
    token=HF_TOKEN,
    device_map="auto",
)
FastLanguageModel.for_inference(model)
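
# Optional check (not in the original notebook): confirm where the weights were placed
# and how much GPU memory the 4-bit model occupies.
print(model.device)
print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.1f} GB")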


# Load the evaluation data.
# Note: upload the data file directly under /content in Colab beforehand.
DATASET_PATH = "./elyza-tasks-100-TV_0.jsonl"

datasets = []
with open(DATASET_PATH, "r") as f:
    item = ""
    for line in f:
        line = line.strip()
        item += line
        if item.endswith("}"):
            datasets.append(json.loads(item))
            item = ""

# Inference
MAX_NEW_TOKENS = 1024

category_prompts = {
    "creative": "創作的な回答になるよう表現を工夫してください。",
    "summarize": "与えられた情報を正確かつ簡潔にまとめることが求められます。必要な情報を的確に抽出してください。",
    "knowledge": "事実ベースの情報や知識提供が求められます。一般的に認められた情報に基づき、回答してください。",
    "advice": "実用的なアドバイスや問題解決の提案が求められます。実用性のある回答になるよう工夫してください。",
    "analysis": "論理的思考や推論・分析力が問われます。指示の情報を注意深く読んで、結論を導いてください。",
    "format": "指定形式や特定のスタイルに従う回答が求められます。指示にある回答形式を守って回答してください。",
    "evaluation": "評価・採点・判定が求められます。与えられた基準に沿って、公平かつ一貫した評価を行ってください。"
}

def get_category_prompt(task_id: int) -> str:
    # Map the task ID to a category
    creative_ids = [1, 32, 68, 84, 91, 92, 93, 98]
    summarize_ids = [24, 73]
    knowledge_ids = [2, 7, 9, 14, 19, 20, 21, 22, 27, 28, 30, 39, 50, 52, 53, 69]
    advice_ids = [0, 4, 13, 15, 31, 35, 36, 85, 86, 87, 96]
    analysis_ids = [3, 6, 8, 17, 25, 26, 33, 34, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47,
                    48, 49, 51, 54, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 71,
                    74, 76, 77, 79, 80, 81, 82, 89, 94, 95, 97, 99]
    format_ids = [10, 11, 12, 88, 90]
    evaluation_ids = [5]

    if task_id in creative_ids:
        return category_prompts["creative"]
    elif task_id in summarize_ids:
        return category_prompts["summarize"]
    elif task_id in knowledge_ids:
        return category_prompts["knowledge"]
    elif task_id in advice_ids:
        return category_prompts["advice"]
    elif task_id in analysis_ids:
        return category_prompts["analysis"]
    elif task_id in format_ids:
        return category_prompts["format"]
    elif task_id in evaluation_ids:
        return category_prompts["evaluation"]
    else:
        return "なし"

def generate_response(task_id: int, input_text: str) -> str:
    category_prompt = get_category_prompt(task_id)
    prompt = f"""
    以下の指示の内容を読み、回答せよ。方針も適宜、参考にせよ。
    ### 指示
    {input_text}

    ### 方針
    {category_prompt}

    ### 回答
    """
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    outputs = model.generate(
      **inputs,
      max_new_tokens=MAX_NEW_TOKENS,
      use_cache=True,
      do_sample=False,
      repetition_penalty=1.2,
    )
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).split("### 回答")[-1]
    return prediction

results = []
for i in tqdm(range(len(datasets))):
    task_id = datasets[i]["task_id"]
    input_text = datasets[i]["input"]
    category_prompt = get_category_prompt(task_id)
    prediction = generate_response(task_id, input_text)
    results.append({"task_id": task_id, "input": input_text, "attention": category_prompt, "output": prediction})

# Post-process the outputs
def remove_invalid_chars(text: str) -> str:
  text = re.sub(r'[\u3000\u0000-\u001F\u007F-\u009F]', '', text)  # remove ideographic spaces and other control characters
  text = re.sub(r"[*#]", "", text)  # remove * and #
  text = text.strip()  # strip leading/trailing whitespace
  return text
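
# Quick illustration (not in the original notebook): ideographic spaces, Markdown markers,
# and surrounding whitespace are removed from a raw model output.
print(remove_invalid_chars("  ### 回答\u3000**東京**です。  "))  # -> 回答東京です。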

cleaned_results = []
for result in results:
  cleaned_result = {
      "task_id": result["task_id"],
      "input": remove_invalid_chars(result["input"]),
      "attention": result["attention"],
      "output": remove_invalid_chars(result["output"])
  }
  cleaned_results.append(cleaned_result)

# Save the results as JSONL directly under /content.
model_name = re.sub(".*/", "", MODEL_NAME)
current_datetime = datetime.now(timezone(timedelta(hours=+9))).strftime("%Y%m%d_%H%M")
SUBMIT_PATH = f"/content/{model_name}_{current_datetime}_outputs.jsonl"

with open(SUBMIT_PATH, "w", encoding="utf-8") as f:
    for cleaned_result in cleaned_results:
        json.dump(cleaned_result, f, ensure_ascii=False)  # ensure_ascii=False for handling non-ASCII characters
        f.write("\n")

Datasets

Instruction tuning

The model has been fine-tuned on the following datasets.

Language    Dataset                 Description
Japanese    Ichikara Instruction    A manually constructed instruction dataset based on Ichikara Instruction
Japanese    Elyza-tasks-100
