R-Nakamoto
/

gemma-2-9b-it

Inference Endpoints

Model card Files Files and versions Community

gemma-2-9b-it / README.md

R-Nakamoto's picture

Update README.md

be48767 verified 26 days ago

|

3.65 kB

	---
	library_name: transformers
	license: gemma
	base_model:
	- google/gemma-2-9b
	---
	# 概要
	松尾研大規模言語モデル講座2024のコンペ用の提出モデル作成の一環として作成・公開しています。

	# 推論方法

	以下は、Google Colaboratory上で elyza-tasks-100-TV_0.jsonl の各タスクへの回答を生成するためのコードです。

	```python
	%%capture
	!pip install unsloth
	!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
	!pip install -U torch
	!pip install -U peft

	from peft import PeftModel
	import torch
	import json
	from tqdm import tqdm
	import re

	import os
	# Replace 'YOUR_TOKEN' with your own access token before running;
	# the value is exported through the HF_TOKEN environment variable.
	HF_TOKEN = 'YOUR_TOKEN'
	os.environ["HF_TOKEN"] = HF_TOKEN

	from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

	# Repository ID shared by the model and the tokenizer
	new_model_id = "R-Nakamoto/gemma-2-9b-it"

	# 4-bit NF4 quantization, computing in bfloat16
	bnb_config = BitsAndBytesConfig(
	    load_in_4bit=True,
	    bnb_4bit_quant_type="nf4",
	    bnb_4bit_compute_dtype=torch.bfloat16,
	)

	# Load the quantized model; device placement is left to device_map="auto"
	# and the eager attention implementation is requested explicitly.
	model = AutoModelForCausalLM.from_pretrained(
	    new_model_id,
	    quantization_config=bnb_config,
	    device_map="auto",
	    attn_implementation="eager",
	)

	# Load the matching (fast) tokenizer from the same repository
	tokenizer = AutoTokenizer.from_pretrained(
	    new_model_id,
	    trust_remote_code=True,
	    use_fast=True,
	)


	# Load the evaluation tasks. A record may span several physical lines, so
	# stripped lines are accumulated until the buffer ends with "}" and only
	# then parsed as one JSON object.
	datasets = []
	# Read explicitly as UTF-8 (the tasks are Japanese text) instead of relying
	# on the platform default; this matches how output.jsonl is written.
	with open("elyza-tasks-100-TV_0.jsonl", "r", encoding="utf-8") as f:
	    item = ""
	    for line in f:
	        line = line.strip()
	        item += line
	        if item.endswith("}"):
	            datasets.append(json.loads(item))
	            item = ""

	import torch

	# Ask PyTorch to release its cached CUDA memory before inference starts
	torch.cuda.empty_cache()

	# Put the model in evaluation (inference) mode and set use_cache on its config
	model.eval()
	model.config.use_cache = True

	# Tokenizer setting: pad on the left
	tokenizer.padding_side = 'left'

	# Batch size, maximum tokenized prompt length, and the list collecting results
	batch_size, max_length = 8, 512
	results = []

	# Prompt template pieces
	# (template_text is not referenced by the visible code; kept as-is)
	template_text = '以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい。\n\n'
	kaitou = '回答'

	# Generation settings. They are identical for every batch, so they are
	# defined once here rather than re-assigned on each loop iteration.
	max_new_tokens = 512
	repetition_penalty = 1.2
	length_penalty = 1.0
	use_cache = True
	num_beams = 3  # beam width for beam search

	# Run inference batch by batch
	for i in tqdm(range(0, len(datasets), batch_size)):
	    # Take the next slice of tasks
	    batch = datasets[i:i + batch_size]

	    # Build one prompt per task
	    prompts = [
	        f"### 指示\n{dt['input']}\n### 回答\n"
	        for dt in batch
	    ]

	    # Tokenize the whole batch with padding and truncation to max_length
	    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

	    # Move the tensors to the model's device
	    inputs = {key: value.to(model.device, non_blocking=True) for key, value in inputs.items()}

	    # All rows of the padded batch share this length, so every generated
	    # sequence starts with exactly this many prompt tokens.
	    prompt_length = inputs["input_ids"].shape[1]

	    # Deterministic generation: beam search with sampling disabled
	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            max_new_tokens=max_new_tokens,
	            repetition_penalty=repetition_penalty,
	            length_penalty=length_penalty,
	            num_beams=num_beams,  # enable beam search
	            do_sample=False,  # disable sampling
	            use_cache=use_cache,
	        )

	    # Collect the answers. Only the newly generated tokens are decoded:
	    # splitting the full decoded text on the "### 回答" marker returns the
	    # wrong text when the prompt was truncated (marker missing) or when the
	    # model emits the marker itself.
	    for dt, output in zip(batch, outputs):
	        prediction = tokenizer.decode(output[prompt_length:], skip_special_tokens=True)
	        results.append({
	            "task_id": dt["task_id"],
	            "input": dt["input"],
	            "output": prediction,
	        })

	# Write the collected results as JSON Lines: one object per line, with
	# non-ASCII characters kept as-is rather than escaped.
	with open("output.jsonl", 'w', encoding='utf-8') as out_file:
	    for record in results:
	        out_file.write(json.dumps(record, ensure_ascii=False) + '\n')
	```