satoyutaka committed on
Commit
6b93d17
·
verified ·
1 Parent(s): 71503f7

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +32 -19
README.md CHANGED
@@ -37,15 +37,18 @@ from transformers import (
37
 
38
  )
39
 
40
-
41
  import torch
42
  from tqdm import tqdm
43
  import json
44
 
45
 
46
- HF_TOKEN = "Hugging Face Token"
 
 
47
 
48
- model_name = "satoyutaka/llm-jp-3-13b-ftELZ-2"
 
49
 
50
 
51
  bnb_config = BitsAndBytesConfig(
@@ -60,10 +63,12 @@ bnb_config = BitsAndBytesConfig(
60
  )
61
 
62
 
 
 
63
  datasets = []
64
 
65
 
66
- with open("「elyza-tasks-100-TV_0.jsonl」のパスをご指定ください。", "r") as f:
67
 
68
  item = ""
69
 
@@ -81,36 +86,39 @@ with open("「elyza-tasks-100-TV_0.jsonl」のパスをご指定ください。"
81
 
82
  results = []
83
 
84
- for data in tqdm(datasets):
85
 
86
- input = data["input"]
87
 
88
- prompt = f"""### 指示
89
- {input}
 
 
 
 
 
 
90
 
91
- """
92
 
93
- tokenized_input = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
94
 
95
- with torch.no_grad():
96
 
97
- outputs = model.generate(
98
 
99
- tokenized_input,
100
 
101
- max_new_tokens=100,
102
 
103
- do_sample=False,
104
 
105
- repetition_penalty=1.2
 
106
 
107
- )[0]
 
108
 
109
- output = tokenizer.decode(outputs[tokenized_input.size(1):], skip_special_tokens=True)
110
 
 
111
 
112
- results.append({"task_id": data["task_id"], "input": input, "output": output})
113
 
 
114
 
115
 
116
  import re
@@ -118,10 +126,15 @@ import re
118
 
119
  model_name = re.sub(".*/", "", model_name)
120
 
 
121
  with open(f"./{model_name}-outputs.jsonl", 'w', encoding='utf-8') as f:
122
 
 
123
  for result in results:
124
 
 
125
  json.dump(result, f, ensure_ascii=False) # ensure_ascii=False for handling non-ASCII characters
126
 
 
127
  f.write('\n')
 
 
37
 
38
  )
39
 
40
+ ## モデルのロード
41
  import torch
42
  from tqdm import tqdm
43
  import json
44
 
45
 
46
+ HF_TOKEN = "Hugging Face Token" #Hugging Face のAPIキーを入力(read)
47
+
48
+ model_name = "satoyutaka/llm-jp-3-13b-ftELZ-2" #作成したモデル名
49
 
50
+
51
+ ## 量子化パラメータの設定
52
 
53
 
54
  bnb_config = BitsAndBytesConfig(
 
63
  )
64
 
65
 
66
+ ## 問題文の読み込み
67
+
68
  datasets = []
69
 
70
 
71
+ with open("elyza-tasks-100-TV_0.jsonl", "r") as f: #ファイルを格納したパスに書き換えてください。
72
 
73
  item = ""
74
 
 
86
 
87
  results = []
88
 
 
89
 
 
90
 
91
+ ## 推論
92
+
93
+
94
+ from tqdm import tqdm
95
+
96
+
97
+ results = []
98
+
99
 
100
+ for dt in tqdm(datasets):
101
 
 
102
 
103
+ input = dt["input"]
104
 
 
105
 
106
+ prompt = f"""### 指示\n{input}\n### 回答\n"""
107
 
 
108
 
109
+ inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
110
 
111
+
112
+ outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True, do_sample=False, repetition_penalty=1.2)
113
 
114
+
115
+ prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).split('\n### 回答')[-1]
116
 
 
117
 
118
+ results.append({"task_id": dt["task_id"], "input": input, "output": prediction})
119
 
 
120
 
121
+ ## 提出ファイルの作成
122
 
123
 
124
  import re
 
126
 
127
  model_name = re.sub(".*/", "", model_name)
128
 
129
+
130
  with open(f"./{model_name}-outputs.jsonl", 'w', encoding='utf-8') as f:
131
 
132
+
133
  for result in results:
134
 
135
+
136
  json.dump(result, f, ensure_ascii=False) # ensure_ascii=False for handling non-ASCII characters
137
 
138
+
139
  f.write('\n')
140
+