モデル概要

0.3Bクラスの日本語LLM(Mistralモデル)をフルスクラッチで開発しました．

使用方法

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

# Model identifier passed to from_pretrained() for both the tokenizer and the model.
MODEL_NAME = "asaoka/japanese-mistral-300m-clone"
# Set PyTorch's float32 matmul precision mode to 'high'.
# NOTE(review): presumably trades a little precision for speed on supported GPUs — confirm.
torch.set_float32_matmul_precision('high')

# Use CUDA when available, otherwise fall back to the CPU; print the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# NOTE(review): trust_remote_code=True presumably permits running code shipped
# with the model repository — only use with a repository you trust.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True).to(device)

# Both limits feed generate_response(), which passes
# max_length=min(MAX_INPUT_LENGTH, prompt_tokens + MAX_ASSISTANT_LENGTH).
MAX_ASSISTANT_LENGTH = 100
MAX_INPUT_LENGTH = 128
# Prompt templates filled in by prepare_input() via str.format().
# They are raw strings, so each `\n` is a literal backslash followed by "n",
# not a newline character; format_output() turns that sequence back into
# real newlines after decoding.
INPUT_PROMPT = r'<s>\n以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n[SEP]\n指示:\n{instruction}\n[SEP]\n入力:\n{input}\n[SEP]\n応答:\n'
NO_INPUT_PROMPT = r'<s>\n以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。\n[SEP]\n指示:\n{instruction}\n[SEP]\n応答:\n'

def prepare_input(instruction, input_text):
    """Build the prompt string for one request.

    Args:
        instruction: Text substituted for ``{instruction}`` in the template.
        input_text: Text substituted for ``{input}``. When it equals the empty
            string, the template without an input section is used instead.

    Returns:
        The formatted prompt string.
    """
    # Guard clause: no input text means the shorter, instruction-only template.
    if input_text == "":
        return NO_INPUT_PROMPT.format(instruction=instruction)
    return INPUT_PROMPT.format(instruction=instruction, input=input_text)

def format_output(output):
    r"""Clean decoded model text for display.

    Removes one leading ``<s>`` marker and one trailing ``</s>`` marker,
    deletes every ``[SEP]`` marker, and converts each literal two-character
    sequence ``\n`` (backslash + "n") into a real newline.

    Args:
        output: Decoded text to clean.

    Returns:
        The cleaned string.
    """
    bos_marker = "<s>"
    eos_marker = "</s>"
    # str.lstrip()/str.rstrip() treat their argument as a *set of characters*,
    # so lstrip("<s>") would also eat leading "s", "<" or ">" characters that
    # belong to the actual text (e.g. "<s>sushi" -> "ushi"). Slice off the
    # exact markers instead.
    if output.startswith(bos_marker):
        output = output[len(bos_marker):]
    if output.endswith(eos_marker):
        output = output[:-len(eos_marker)]
    return output.replace("[SEP]", "").replace("\\n", "\n")

def generate_response(instruction, input_text):
    """Generate a reply for an instruction and optional input text.

    Builds the prompt with ``prepare_input``, encodes it with the module-level
    ``tokenizer``, generates a continuation with the module-level ``model``
    and cleans the decoded text with ``format_output``.

    Args:
        instruction: Instruction text inserted into the prompt template.
        input_text: Input text, forwarded unchanged to ``prepare_input``.

    Returns:
        A ``(formatted_output_all, response)`` tuple: the cleaned full decoded
        text, and ``"Assistant:"`` followed by the stripped text that comes
        after the last ``応答:`` marker in it.
    """
    encoded_prompt = tokenizer.encode(
        prepare_input(instruction, input_text),
        add_special_tokens=False,
        return_tensors="pt",
    )
    prompt_len = len(encoded_prompt[0])

    # NOTE(review): max_length is capped at MAX_INPUT_LENGTH while min_length
    # is the prompt length, so a prompt of MAX_INPUT_LENGTH tokens or more
    # yields min_length >= max_length — presumably leaving no room for new
    # tokens; confirm this cap is intended.
    generation_kwargs = dict(
        min_length=prompt_len,
        max_length=min(MAX_INPUT_LENGTH, prompt_len + MAX_ASSISTANT_LENGTH),
        top_p=0.95,
        top_k=50,
        temperature=0.4,
        do_sample=True,
        no_repeat_ngram_size=2,
        num_beams=3,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        # Forbid the unknown token from being generated.
        bad_words_ids=[[tokenizer.unk_token_id]],
    )

    # Gradients are not needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(
            encoded_prompt.to(model.device), **generation_kwargs
        )

    # Only the first returned sequence is decoded.
    decoded_text = tokenizer.decode(generated_ids.tolist()[0])
    full_text = format_output(decoded_text)
    answer = full_text.split('応答:')[-1].strip()

    return full_text, f"Assistant:{answer}"

# Demo driver: one fixed instruction is paired with each question below, and
# the question is passed to generate_response() as the input text.
instruction = "あなたは何でも正確に答えられるAIです。"
questions = [
    "日本で一番高い山は？",
    "日本で一番広い湖は？",
    "世界で一番高い山は？",
    "世界で一番広い湖は？",
    "冗談を言ってください。",
]

# Generate and print a response for each question.
for question in questions:
    # Only the short "Assistant:..." string is printed; the full cleaned
    # text is returned as well but not used here.
    formatted_output_all, response = generate_response(instruction, question)
    print(response)

使用方法は，ce-lery/japanese-mistral-300m-instructionを参照しました．

トレーニング

1. 事前学習パイプライン

学習データ
- CC-100データセット(Japanese)（展開後のサイズ：70.9 GB）
- Wikipediaデータセット(jawiki-latest-pages-articles.xml.bz2)（展開後のサイズ：16.2 GB）
データ前処理
トークナイザー学習
事前学習

データ前処理・トークナイザー学習・事前学習は，ce-lery/japanese-mistral-300m-baseを参照しました．

2. ファインチューニング・パイプライン

学習データ
- databricks-dolly-15k-ja（データサイズ：17.1 MB）
データ前処理
インストラクションチューニング

データ前処理・インストラクションチューニングは，ce-lery/japanese-mistral-300m-instructionを参照しました．

JGLUEスコア

タスク	スコア
jsquad-1.2-0.6(exact_match/f1)	0.0/0.8599
marc_ja-1.1-0.6(acc)	0.8428
jcommonsenseqa-1.1-0.6(acc)	0.2672
jnli-1.3-0.6(acc)	0.2482

JGLUEスコアは，Stability AI社のlm-evaluation-harnessを用いて算出しました．算出に用いたコマンドを下記に示します．

# Evaluate the model on four JGLUE tasks on a CUDA device and write the
# scores to ./results.json.
# NOTE(review): main.py is presumably the lm-evaluation-harness entry point
# (run from that repository's root), and the --num_fewshot values are
# presumably matched positionally to the --tasks list — confirm.
python main.py \
  --model hf-causal-experimental \
  --model_args "pretrained=asaoka/japanese-mistral-300m-clone" \
  --tasks jsquad-1.2-0.6,jcommonsenseqa-1.1-0.6,jnli-1.3-0.6,marc_ja-1.1-0.6 \
  --num_fewshot 2,3,3,3 \
  --device cuda \
  --output_path "./results.json"