Uploaded model

  • Developed by: CLRafaelR
  • License: apache-2.0
  • Finetuned from model: llm-jp/llm-jp-3-13b

This model was trained 2x faster with Unsloth and Hugging Face's TRL library.

License

cc-by-nc-sa

How to run

Installing and loading the required packages

get_ipython().system("pip install torch==2.2.1+cu121 torchvision --index-url https://download.pytorch.org/whl/cu121")
get_ipython().system(
    'pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"'
)
get_ipython().system('pip install --no-deps "xformers<0.0.26" --force-reinstall')
get_ipython().system('pip install flash-attn==2.6.3')
get_ipython().system("pip install schedulefree")
get_ipython().system("pip install ipywidgets --upgrade")
get_ipython().system("pip install langchain langchain-community langchain-huggingface faiss-cpu jq polars")


from unsloth import FastLanguageModel
from peft import PeftModel
import torch
import json
from tqdm import tqdm
import re
import gc
import datetime
from transformers.trainer_utils import set_seed
from datasets import load_dataset
import os
import getpass
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import polars as pl
from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain_community.vectorstores import FAISS
from pprint import pprint
from typing import List
from langchain_core.documents import Document
from langchain_core.runnables import chain
import time
from transformers import TextStreamer


if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass(
        "Enter your Hugging Face API key: ",
    )
# Keep a local reference to the token for the adapter download below,
# regardless of whether it was already set in the environment
HF_TOKEN = os.environ["HF_TOKEN"]


def flush():
    # Release cached GPU memory and reset peak-memory statistics between generations
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


set_seed(2024)

Loading the base model

model_id = "llm-jp/llm-jp-3-13b"
adapter_id = "CLRafaelR/llm-jp-3-13b-ogawa-brewery"

dtype = None  # None lets the dtype be chosen automatically
load_in_4bit = True  # True because we are working with a 13B model

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    trust_remote_code=True,
)

model = PeftModel.from_pretrained(model, adapter_id, token=HF_TOKEN)
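
A minimal, optional sanity check (not part of the original notebook) to confirm the LoRA adapter is attached:

# List the adapter names registered on the PeftModel (e.g. "default")
print(model.peft_config.keys())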

Building the RAG pipeline

The following was used as a reference:

chapter13/13-3-2-rag-instruct-langchain.ipynb in ghmagazine/llm-book: https://github.com/ghmagazine/llm-book/blob/main/chapter13/13-3-2-rag-instruct-langchain.ipynb

Loading the embedding model

# Specify the model name on the Hugging Face Hub
embedding_model_name = "pkshatech/GLuCoSE-base-ja-v2"

# Initialize the embedding model from the model name.
# The outer model_kwargs is passed to SentenceTransformer, whose own
# model_kwargs is forwarded to the underlying transformers model
# (used here to load the weights in float16).
embedding_model = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={
        "model_kwargs": {
            "torch_dtype": torch.float16,
            # "device": "cuda",
        }
    },
    encode_kwargs={"normalize_embeddings": False},
)
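
As a quick, optional check that the embedding model loads and produces vectors (the sample sentence below is arbitrary):

# Embed one query string and inspect the vector dimensionality
sample_vector = embedding_model.embed_query("これはテスト用の文です。")
print(len(sample_vector))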

Building the vector database

data_name = "elyza/ELYZA-tasks-100"

ELYZA_tasks_100 = load_dataset(data_name)["test"]

# Load the dataset as LangChain documents, using the "input" column as page_content
loader = HuggingFaceDatasetLoader(
    data_name,
    "input",
)

documents = loader.load()

# Build a FAISS index over the task inputs with L2-normalized vectors
vectorstore = FAISS.from_documents(
    documents,
    embedding_model,
    normalize_L2=True,
)
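
Before wrapping the vector store in a retriever, you can query it directly to see the (document, relevance score) pairs it returns; the query string below is arbitrary:

for doc, score in vectorstore.similarity_search_with_relevance_scores("料理のレシピ", k=2):
    print(round(score, 3), doc.page_content[:50])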


@chain
def retriever(
    query: str,
    k: int = 4,
    score_threshold: float = 0.8,
) -> List[Document]:
    # Retrieve the top-k documents together with their relevance scores,
    # then keep only those whose score exceeds the threshold
    docs_and_scores = vectorstore.similarity_search_with_relevance_scores(
        query,
        k=k,
    )
    filtered_docs = []
    for doc, score in docs_and_scores:
        if score > score_threshold:
            doc.metadata["score"] = score
            print(round(score, 3))
            filtered_docs.append(doc)
    return filtered_docs


retrieved_documents = retriever.invoke(
    "IMEとして機能してください",
    # k=1,
    score_threshold=0.45,
)
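
The retrieved documents and their scores can be inspected with pprint (already imported above); this step is optional:

pprint(retrieved_documents)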

Loading the evaluation dataset

# Read the evaluation JSONL file; records may span multiple lines,
# so lines are accumulated until a closing brace completes a JSON object
datasets = []
with open("../confidential/data/elyza-tasks-100-TV_0.jsonl", "r") as f:
    item = ""
    for line in f:
        line = line.strip()
        item += line
        if item.endswith("}"):
            datasets.append(json.loads(item))
            item = ""

Inference

# Switch the model to inference mode
FastLanguageModel.for_inference(model)

streamer = TextStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)

results = []
start_time = time.time()
for dt in tqdm(datasets):
    input = dt["input"]

    raw_shots = retriever.invoke(
        input,
        k=1,
        score_threshold=0.45,
    )

    if not raw_shots:
        # Case: no task similar to the current problem was found in the original ELYZA-tasks-100
        prompt_inst_answer = f"""### 指示\n\n{input}\n\n### 回答\n\n"""
        # prompt_inst_answer = f"""### 指示\n\n下のタスクへの最終回答に必要な<思考過程>を順序だてて考え、3つの番号付き箇条書きだけで出力してください。\n\nその後で、<最終回答>を出力してください。\n\n### タスク\n\n{input}\n\n### 回答\n\n<思考過程><最終回答>に必要な思考過程3点です。\n\n1. """
    else:
        # Case: similar tasks to the current problem were found in the original ELYZA-tasks-100
        shots = []
        for i, raw_shot in enumerate(raw_shots):
            # page_content comes back JSON-escaped (\uXXXX) from HuggingFaceDatasetLoader, so decode it
            shot = f"""### タスク{i + 1}\n\n{raw_shot.page_content.encode().decode('unicode-escape')}\n\n### タスク{i + 1}の回答\n\n{raw_shot.metadata['output']}"""
            shots.append(shot)
        formatted_shots = "\n\n".join(shots)
        num_shots = len(shots)
        # print(formatted_shots, "\n\n", "=" * 10)
        prompt_inst_answer = f"""### 指示\n\n{input}\n\n### 回答\n\n"""
        # prompt_inst_answer = f"""### 指示\n\n以下の類似したタスクを解いてください。\n\n{formatted_shots}\n\n### タスク{num_shots + 1}\n\n{input}\n\n### タスク{num_shots + 1}の回答\n\n先に解いたタスクと同じ方法で、順序立てて考えます。"""

    print(
        "=" * 16,
        "\n\n",
        prompt_inst_answer,
    )

    inputs = tokenizer(
        [prompt_inst_answer],
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        use_cache=True,
        do_sample=False,
        # do_sample=True,
        # num_beams=5,
        repetition_penalty=1.2,
        streamer=streamer,
    )

    prediction = tokenizer.decode(
        outputs[0][inputs.input_ids.shape[-1] :],
        skip_special_tokens=True,
    )

    results.append(
        {
            "task_id": dt["task_id"],
            "input": input,
            "output": prediction,
        }
    )

    flush()

    print("-" * 16)
end_time = time.time()

elapsed_time = datetime.timedelta(seconds=end_time - start_time)
print(f"{elapsed_time} elapsed.")

Saving the results as a JSONL file

file_name = f"./{adapter_id.split('/')[1]}_output_{datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=9))).strftime('%Y%m%d_%H%M')}"

with open(
    f"{file_name}.jsonl",
    "w",
    encoding="utf-8",
) as f:
    for result in results:
        json.dump(result, f, ensure_ascii=False)
        f.write("\n")

# Create a polars DataFrame from the results
df = pl.DataFrame(results)

# Write the DataFrame to an xlsx file
df.write_excel(f"{file_name}.xlsx")
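
Optionally, the saved JSONL can be read back with polars to confirm it round-trips:

# pl.read_ndjson reads newline-delimited JSON
print(pl.read_ndjson(f"{file_name}.jsonl").head())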