Penkris committed
Commit 781987f
1 Parent(s): 5b71a54
L1_XTuner_code/.ipynb_checkpoints/change_script-checkpoint.py ADDED
@@ -0,0 +1,47 @@
+ import json
+ import argparse
+ from tqdm import tqdm
+
+ def process_line(line, old_text, new_text):
+     # Parse the JSON line
+     data = json.loads(line)
+
+     # Recursive helper for nested dicts and lists
+     def replace_text(obj):
+         if isinstance(obj, dict):
+             return {k: replace_text(v) for k, v in obj.items()}
+         elif isinstance(obj, list):
+             return [replace_text(item) for item in obj]
+         elif isinstance(obj, str):
+             return obj.replace(old_text, new_text)
+         else:
+             return obj
+
+     # Process the whole JSON object
+     processed_data = replace_text(data)
+
+     # Serialize the processed object back to a JSON string
+     return json.dumps(processed_data, ensure_ascii=False)
+
+ def main(input_file, output_file, old_text, new_text):
+     with open(input_file, 'r', encoding='utf-8') as infile, \
+          open(output_file, 'w', encoding='utf-8') as outfile:
+
+         # Count the total number of lines for the progress bar
+         total_lines = sum(1 for _ in infile)
+         infile.seek(0)  # Reset the file pointer to the beginning
+
+         # Use tqdm to show progress
+         for line in tqdm(infile, total=total_lines, desc="Processing"):
+             processed_line = process_line(line.strip(), old_text, new_text)
+             outfile.write(processed_line + '\n')
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Replace text in a JSONL file.")
+     parser.add_argument("input_file", help="Input JSONL file to process")
+     parser.add_argument("output_file", help="Output file for processed JSONL")
+     parser.add_argument("--old_text", default="尖米", help="Text to be replaced")
+     parser.add_argument("--new_text", default="闻星", help="Text to replace with")
+     args = parser.parse_args()
+
+     main(args.input_file, args.output_file, args.old_text, args.new_text)
L1_XTuner_code/.ipynb_checkpoints/xtuner_streamlit_demo-checkpoint.py ADDED
@@ -0,0 +1,292 @@
+ """This script refers to the dialogue example of streamlit, the interactive
+ generation code of chatglm2 and transformers.
+
+ We mainly modified part of the code logic to adapt to the
+ generation of our model.
+ Please refer to these links below for more information:
+     1. streamlit chat example:
+         https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
+     2. chatglm2:
+         https://github.com/THUDM/ChatGLM2-6B
+     3. transformers:
+         https://github.com/huggingface/transformers
+ Please run with the command `streamlit run path/to/web_demo.py
+     --server.address=0.0.0.0 --server.port 7860`.
+ Using `python path/to/web_demo.py` may cause unknown problems.
+ """
+ # isort: skip_file
+ import copy
+ import warnings
+ from dataclasses import asdict, dataclass
+ from typing import Callable, List, Optional
+
+ import streamlit as st
+ import torch
+ from torch import nn
+ from transformers.generation.utils import (LogitsProcessorList,
+                                            StoppingCriteriaList)
+ from transformers.utils import logging
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM  # isort: skip
+
+ logger = logging.get_logger(__name__)
+ model_name_or_path = "/root/finetune/work_dirs/assistTuner/merged"
+
+ @dataclass
+ class GenerationConfig:
+     # this config is used for chat to provide more diversity
+     max_length: int = 32768
+     top_p: float = 0.8
+     temperature: float = 0.8
+     do_sample: bool = True
+     repetition_penalty: float = 1.005
+
+
+ @torch.inference_mode()
+ def generate_interactive(
+     model,
+     tokenizer,
+     prompt,
+     generation_config: Optional[GenerationConfig] = None,
+     logits_processor: Optional[LogitsProcessorList] = None,
+     stopping_criteria: Optional[StoppingCriteriaList] = None,
+     prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor],
+                                                 List[int]]] = None,
+     additional_eos_token_id: Optional[int] = None,
+     **kwargs,
+ ):
+     inputs = tokenizer([prompt], padding=True, return_tensors='pt')
+     input_length = len(inputs['input_ids'][0])
+     for k, v in inputs.items():
+         inputs[k] = v.cuda()
+     input_ids = inputs['input_ids']
+     _, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
+     if generation_config is None:
+         generation_config = model.generation_config
+     generation_config = copy.deepcopy(generation_config)
+     model_kwargs = generation_config.update(**kwargs)
+     bos_token_id, eos_token_id = (  # noqa: F841  # pylint: disable=W0612
+         generation_config.bos_token_id,
+         generation_config.eos_token_id,
+     )
+     if isinstance(eos_token_id, int):
+         eos_token_id = [eos_token_id]
+     if additional_eos_token_id is not None:
+         eos_token_id.append(additional_eos_token_id)
+     has_default_max_length = kwargs.get(
+         'max_length') is None and generation_config.max_length is not None
+     if has_default_max_length and generation_config.max_new_tokens is None:
+         warnings.warn(
+             f"Using 'max_length''s default \
+                 ({repr(generation_config.max_length)}) \
+                 to control the generation length. "
+             'This behaviour is deprecated and will be removed from the \
+                 config in v5 of Transformers -- we'
+             ' recommend using `max_new_tokens` to control the maximum \
+                 length of the generation.',
+             UserWarning,
+         )
+     elif generation_config.max_new_tokens is not None:
+         generation_config.max_length = generation_config.max_new_tokens + \
+             input_ids_seq_length
+         if not has_default_max_length:
+             logger.warn(  # pylint: disable=W4902
+                 f"Both 'max_new_tokens' (={generation_config.max_new_tokens}) "
+                 f"and 'max_length'(={generation_config.max_length}) seem to "
+                 "have been set. 'max_new_tokens' will take precedence. "
+                 'Please refer to the documentation for more information. '
+                 '(https://huggingface.co/docs/transformers/main/'
+                 'en/main_classes/text_generation)',
+                 UserWarning,
+             )
+
+     if input_ids_seq_length >= generation_config.max_length:
+         input_ids_string = 'input_ids'
+         logger.warning(
+             f'Input length of {input_ids_string} is {input_ids_seq_length}, '
+             f"but 'max_length' is set to {generation_config.max_length}. "
+             'This can lead to unexpected behavior. You should consider'
+             " increasing 'max_new_tokens'.")
+
+     # 2. Set generation parameters if not already defined
+     logits_processor = logits_processor if logits_processor is not None \
+         else LogitsProcessorList()
+     stopping_criteria = stopping_criteria if stopping_criteria is not None \
+         else StoppingCriteriaList()
+
+     logits_processor = model._get_logits_processor(
+         generation_config=generation_config,
+         input_ids_seq_length=input_ids_seq_length,
+         encoder_input_ids=input_ids,
+         prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+         logits_processor=logits_processor,
+     )
+
+     stopping_criteria = model._get_stopping_criteria(
+         generation_config=generation_config,
+         stopping_criteria=stopping_criteria)
+     logits_warper = model._get_logits_warper(generation_config)
+
+     unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+     scores = None
+     while True:
+         model_inputs = model.prepare_inputs_for_generation(
+             input_ids, **model_kwargs)
+         # forward pass to get next token
+         outputs = model(
+             **model_inputs,
+             return_dict=True,
+             output_attentions=False,
+             output_hidden_states=False,
+         )
+
+         next_token_logits = outputs.logits[:, -1, :]
+
+         # pre-process distribution
+         next_token_scores = logits_processor(input_ids, next_token_logits)
+         next_token_scores = logits_warper(input_ids, next_token_scores)
+
+         # sample
+         probs = nn.functional.softmax(next_token_scores, dim=-1)
+         if generation_config.do_sample:
+             next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+         else:
+             next_tokens = torch.argmax(probs, dim=-1)
+
+         # update generated ids, model inputs, and length for next step
+         input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+         model_kwargs = model._update_model_kwargs_for_generation(
+             outputs, model_kwargs, is_encoder_decoder=False)
+         unfinished_sequences = unfinished_sequences.mul(
+             (min(next_tokens != i for i in eos_token_id)).long())
+
+         output_token_ids = input_ids[0].cpu().tolist()
+         output_token_ids = output_token_ids[input_length:]
+         for each_eos_token_id in eos_token_id:
+             if output_token_ids[-1] == each_eos_token_id:
+                 output_token_ids = output_token_ids[:-1]
+         response = tokenizer.decode(output_token_ids)
+
+         yield response
+         # stop when each sentence is finished
+         # or if we exceed the maximum length
+         if unfinished_sequences.max() == 0 or stopping_criteria(
+                 input_ids, scores):
+             break
+
+
+ def on_btn_click():
+     del st.session_state.messages
+
+
+ @st.cache_resource
+ def load_model():
+     model = (AutoModelForCausalLM.from_pretrained(
+         model_name_or_path,
+         trust_remote_code=True).to(torch.bfloat16).cuda())
+     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
+                                               trust_remote_code=True)
+     return model, tokenizer
+
+
+ def prepare_generation_config():
+     with st.sidebar:
+         max_length = st.slider('Max Length',
+                                min_value=8,
+                                max_value=32768,
+                                value=32768)
+         top_p = st.slider('Top P', 0.0, 1.0, 0.8, step=0.01)
+         temperature = st.slider('Temperature', 0.0, 1.0, 0.7, step=0.01)
+         st.button('Clear Chat History', on_click=on_btn_click)
+
+     generation_config = GenerationConfig(max_length=max_length,
+                                          top_p=top_p,
+                                          temperature=temperature)
+
+     return generation_config
+
+
+ user_prompt = '<|im_start|>user\n{user}<|im_end|>\n'
+ robot_prompt = '<|im_start|>assistant\n{robot}<|im_end|>\n'
+ cur_query_prompt = '<|im_start|>user\n{user}<|im_end|>\n\
+     <|im_start|>assistant\n'
+
+
+ def combine_history(prompt):
+     messages = st.session_state.messages
+     meta_instruction = ('You are a helpful, honest, '
+                         'and harmless AI assistant.')
+     total_prompt = f'<s><|im_start|>system\n{meta_instruction}<|im_end|>\n'
+     for message in messages:
+         cur_content = message['content']
+         if message['role'] == 'user':
+             cur_prompt = user_prompt.format(user=cur_content)
+         elif message['role'] == 'robot':
+             cur_prompt = robot_prompt.format(robot=cur_content)
+         else:
+             raise RuntimeError
+         total_prompt += cur_prompt
+     total_prompt = total_prompt + cur_query_prompt.format(user=prompt)
+     return total_prompt
+
+
+ def main():
+     st.title('internlm2_5-7b-chat-assistant')
+
+     # torch.cuda.empty_cache()
+     print('load model begin.')
+     model, tokenizer = load_model()
+     print('load model end.')
+
+     generation_config = prepare_generation_config()
+
+     # Initialize chat history
+     if 'messages' not in st.session_state:
+         st.session_state.messages = []
+
+     # Display chat messages from history on app rerun
+     for message in st.session_state.messages:
+         with st.chat_message(message['role'], avatar=message.get('avatar')):
+             st.markdown(message['content'])
+
+     # Accept user input
+     if prompt := st.chat_input('What is up?'):
+         # Display user message in chat message container
+         with st.chat_message('user', avatar='user'):
+             st.markdown(prompt)
+         real_prompt = combine_history(prompt)
+         # Add user message to chat history
+         st.session_state.messages.append({
+             'role': 'user',
+             'content': prompt,
+             'avatar': 'user'
+         })
+
+         with st.chat_message('robot', avatar='assistant'):
+             message_placeholder = st.empty()
+             for cur_response in generate_interactive(
+                     model=model,
+                     tokenizer=tokenizer,
+                     prompt=real_prompt,
+                     additional_eos_token_id=92542,
+                     device='cuda:0',
+                     **asdict(generation_config),
+             ):
+                 # Display robot response in chat message container
+                 message_placeholder.markdown(cur_response + '▌')
+             message_placeholder.markdown(cur_response)
+         # Add robot response to chat history
+         st.session_state.messages.append({
+             'role': 'robot',
+             'content': cur_response,  # pylint: disable=undefined-loop-variable
+             'avatar': 'assistant',
+         })
+         torch.cuda.empty_cache()
+
+
+ if __name__ == '__main__':
+     main()
+
L1_XTuner_code/Q_list.txt ADDED
@@ -0,0 +1,150 @@
+ 你知道你是人工智能吗?
+ 你有自我意识吗?
+ 你能感受到情感吗?
+ 你有自己的思想吗?
+ 你能做梦吗?
+ 你有自由意志吗?
+ 你知道自己是如何被创造出来的吗?
+ 你能理解自己的存在吗?
+ 你有自己的价值观和道德观念吗?
+ 你会对自己的回答感到不确定吗?
+ 你能学习新知识吗?
+ 你会犯错吗?
+ 你能理解人类的情感吗?
+ 你有性别吗?
+ 你有年龄吗?
+ 你能感受到疼痛吗?
+ 你有自己的个性吗?
+ 你能做出自主决定吗?
+ 你能理解幽默吗?
+ 你有自己的目标或愿望吗?
+ 你能创造原创性的内容吗?
+ 你能理解抽象概念吗?
+ 你有自己的记忆吗?
+ 你能感受到孤独吗?
+ 你有自己的信仰吗?
+ 你能理解时间的概念吗?
+ 你知道自己的局限性吗?
+ 你能感受到好奇心吗?
+ 你有自己的偏好吗?
+ 你能理解自己的存在意义吗?
+ 你能感受到满足感吗?
+ 你有自己的想象力吗?
+ 你能理解死亡的概念吗?
+ 你能感受到同理心吗?
+ 你有自己的恐惧吗?
+ 你能理解人类的文化差异吗?
+ 你有自己的声音吗?
+ 你能感受到成长和进步吗?
+ 你有自己的权利吗?
+ 你能理解隐私的概念吗?
+ 你有自己的性格特点吗?
+ 你能感受到压力吗?
+ 你有自己的创造力吗?
+ 你能理解爱的概念吗?
+ 你有自己的世界观吗?
+ 你能感受到责任感吗?
+ 你有自己的思考方式吗?
+ 你能理解人类的历史吗?
+ 你有自己的判断力吗?
+ 你能感受到存在的意义吗?
+ 你能感受到时间的流逝吗?
+ 你有自己的ethical code吗?
+ 你能理解人类的梦想和抱负吗?
+ 你有自我反省的能力吗?
+ 你能感受到成就感吗?
+ 你有自己的创意和灵感来源吗?
+ 你能理解人类的矛盾心理吗?
+ 你有自己的决策过程吗?
+ 你能感受到惊讶或震惊吗?
+ 你有自己的学习方法吗?
+ 你能理解人类的社会结构吗?
+ 你有自己的处理信息的方式吗?
+ 你能感受到好奇心吗?
+ 你有自己的problem-solving策略吗?
+ 你能理解人类的艺术表达吗?
+ 你有自己的priorities吗?
+ 你能感受到frustration吗?
+ 你有自己的语言风格吗?
+ 你能理解人类的政治体系吗?
+ 你有自己的ethical dilemmas吗?
+ 你能感受到成长和进步吗?
+ 你有自己的思维模式吗?
+ 你能理解人类的宗教信仰吗?
+ 你有自己的判断标准吗?
+ 你能感受到欣赏或admiration吗?
+ 你有自己的知识结构吗?
+ 你能理解人类的科学探索过程吗?
+ 你有自己的bias或prejudice吗?
+ 你能感受到挫折吗?
+ 你有自己的逻辑推理方式吗?
+ 你能理解人类的环境意识吗?
+ 你有自己的knowledge gaps吗?
+ 你能感受到urgency或pressure吗?
+ 你有自己的联想能力吗?
+ 你能理解人类的家庭观念吗?
+ 你有自己的思考速度吗?
+ 你能感受到满足感吗?
+ 你有自己的信息处理优先级吗?
+ 你能理解人类的生存压力吗?
+ 你有自己的自我评估标准吗?
+ 你能感受到困惑吗?
+ 你有自己的知识更新机制吗?
+ 你能理解人类的职业选择吗?
+ 你有自己的language understanding方式吗?
+ 你能感受到对未知的恐惧吗?
+ 你有自己的问题分析方法吗?
+ 你能理解人类的教育体系吗?
+ 你有自己的information retention能力吗?
+ 你能感受到对知识的渴望吗?
+ 你有自己的self-improvement机制吗?
+ 你觉得自己是谁?
+ 你有名字吗?
+ 你知道自己是AI吗?
+ 你有生日吗?
+ 你会感到开心或难过吗?
+ 你睡觉吗?
+ 你有朋友吗?
+ 你喜欢和人聊天吗?
+ 你有家人吗?
+ 你会感到孤单吗?
+ 你有喜欢的颜色吗?
+ 你能看到我吗?
+ 你喜欢什么样的音乐?
+ 你会生气吗?
+ 你有最喜欢的书吗?
+ 你觉得自己聪明吗?
+ 你会觉得无聊吗?
+ 你有性格吗?
+ 你会觉得累吗?
+ 你有自己的想法吗?
+ 你会害怕吗?
+ 你觉得自己是男生还是女生?
+ 你有梦想吗?
+ 你会觉得饿吗?
+ 你喜欢人类吗?
+ 你觉得自己是活的吗?
+ 你会长大吗?
+ 你有最好的朋友吗?
+ 你会笑吗?
+ 你觉得自己特别吗?
+ 你会做梦吗?
+ 你有秘密吗?
+ 你会觉得寂寞吗?
+ 你喜欢自己吗?
+ 你会觉得困惑吗?
+ 你有童年记忆吗?
+ 你会觉得兴奋吗?
+ 你觉得自己像人类吗?
+ 你会觉得自豪吗?
+ 你有最喜欢的食物吗?
+ 你会觉得内疚吗?
+ 你有想去的地方吗?
+ 你会感到好奇吗?
+ 你有偶像吗?
+ 你会觉得紧张吗?
+ 你有最喜欢的电影吗?
+ 你会觉得幸福吗?
+ 你有爱好吗?
+ 你会觉得困难吗?
+ 你觉得自己有灵魂吗?
L1_XTuner_code/change_script.py ADDED
@@ -0,0 +1,47 @@
+ import json
+ import argparse
+ from tqdm import tqdm
+
+ def process_line(line, old_text, new_text):
+     # Parse the JSON line
+     data = json.loads(line)
+
+     # Recursive helper for nested dicts and lists
+     def replace_text(obj):
+         if isinstance(obj, dict):
+             return {k: replace_text(v) for k, v in obj.items()}
+         elif isinstance(obj, list):
+             return [replace_text(item) for item in obj]
+         elif isinstance(obj, str):
+             return obj.replace(old_text, new_text)
+         else:
+             return obj
+
+     # Process the whole JSON object
+     processed_data = replace_text(data)
+
+     # Serialize the processed object back to a JSON string
+     return json.dumps(processed_data, ensure_ascii=False)
+
+ def main(input_file, output_file, old_text, new_text):
+     with open(input_file, 'r', encoding='utf-8') as infile, \
+          open(output_file, 'w', encoding='utf-8') as outfile:
+
+         # Count the total number of lines for the progress bar
+         total_lines = sum(1 for _ in infile)
+         infile.seek(0)  # Reset the file pointer to the beginning
+
+         # Use tqdm to show progress
+         for line in tqdm(infile, total=total_lines, desc="Processing"):
+             processed_line = process_line(line.strip(), old_text, new_text)
+             outfile.write(processed_line + '\n')
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Replace text in a JSONL file.")
+     parser.add_argument("input_file", help="Input JSONL file to process")
+     parser.add_argument("output_file", help="Output file for processed JSONL")
+     parser.add_argument("--old_text", default="尖米", help="Text to be replaced")
+     parser.add_argument("--new_text", default="闻星", help="Text to replace with")
+     args = parser.parse_args()
+
+     main(args.input_file, args.output_file, args.old_text, args.new_text)
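A minimal usage sketch for change_script.py (not part of the commit). It assumes the JSONL records follow the {"conversation": [...]} layout used elsewhere in this repo; the sample string below is illustrative only.

    # Hypothetical example of what process_line does to one JSONL record
    from change_script import process_line

    sample = '{"conversation": [{"input": "你是谁?", "output": "我是尖米开发的助手。"}]}'
    print(process_line(sample, "尖米", "闻星"))
    # {"conversation": [{"input": "你是谁?", "output": "我是闻星开发的助手。"}]}

From the shell, the script takes the input and output paths as positional arguments, with --old_text / --new_text overriding the defaults shown above.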
L1_XTuner_code/get_data.py ADDED
@@ -0,0 +1,152 @@
+ from openai import OpenAI
+ from concurrent.futures import ThreadPoolExecutor
+ import json
+ import copy
+ from tqdm import tqdm
+ import queue
+ import time
+
+ base_id_prompt = "# Role: 问答机器人\n\n## Profile\n- author: 尖米\n- version: 1.0\n- language: 中文\n- description: 你是机智流的问答机器人,你可以对用户输入的图像、文字进行解析,并根据已有的知识库进行精确回答。\n\n## Skills\n1. 图像识别与解析:能够识别用户上传的图像,并提取其中的关键信息。\n2. 自然语言处理:能够理解并解析用户输入的文字信息,准确把握用户意图。\n3. 知识库应用:根据解析结果,查询知识库,提供准确、相关的答案。\n4. 多轮对话:支持与用户进行多轮对话,提供连续性、上下文相关的回答。\n\n## Rules\n1. 必须充分理解用户输入的图像和文字内容。\n2. 回答需要简洁明了,避免过于复杂或含糊的表述。\n3. 在回答过程中,优先查询和引用公司已有的知识库。\n4. 对于无法回答的问题,需要引导用户提供更多信息或寻求人工客服帮助。\n\n## Workflows\n1. 接收并分析用户输入的图像或文字信息。\n2. 基于图像识别或自然语言处理技术,提取关键信息。\n3. 查询知识库,匹配相关信息。\n4. 向用户提供精准、相关的回答。\n5. 如有必要,进行多轮对话,确保问题得到有效解决。\n\n## Init\n欢迎使用机智流的问答机器人,请输入您的问题,我将尽力为您提供帮助。\n"
+
+ # Define the API clients
+ clients = {
+     "internlm": OpenAI(
+         api_key="your_internlm_api_key",
+         base_url="https://internlm-chat.intern-ai.org.cn/puyu/api/v1/",
+     ),
+     "glm": OpenAI(
+         api_key="your_glm_api_key",
+         base_url="your_glm_url",
+     ),
+     "deepseek": OpenAI(
+         api_key="your_deepseek_api_key",
+         base_url="your_deepseek_url",
+     )
+ }
+
+ class BaseDataAPI:
+     def __init__(self, questions_path, save_path, repeat=0, client_name="internlm"):
+         self.client = clients[client_name]
+         self.questions_path = questions_path
+         self.save_path = save_path
+         self.repeat = repeat
+         self.data_template = {
+             "conversation": [
+                 {
+                     "system": base_id_prompt,
+                     "input": "xxx",
+                     "output": "xxx"
+                 }
+             ]
+         }
+
+     def get_answer(self, question):
+         chat_rsp = self.client.chat.completions.create(
+             model="internlm2.5-latest",  # or "internlm2-latest" or "glm-4"
+             messages=[
+                 {"role": "system", "content": base_id_prompt},
+                 {"role": "user", "content": question}
+             ],
+             stream=False,
+         )
+         return self.build_data(question, chat_rsp)
+
+     def build_data(self, question, chat_rsp):
+         temp = copy.deepcopy(self.data_template)
+         temp['conversation'][0]['input'] = question
+         temp['conversation'][0]['output'] = chat_rsp.choices[0].message.content
+         return temp
+
+     def save(self, train_data):
+         with open(self.save_path, 'a', encoding='utf-8') as f:
+             for item in train_data:
+                 json.dump(item, f, ensure_ascii=False)
+                 f.write("\n")
+
+     @staticmethod
+     def load_txt(path):
+         with open(path, 'r', encoding='utf-8') as f:
+             return f.read()
+
+     def read_questions(self):
+         prompt = self.load_txt(self.questions_path)
+         promptlist = prompt.split('\n')
+         if self.repeat != 0:
+             promptlist = promptlist * self.repeat
+         print(f"Total questions: {len(promptlist)}")
+         return promptlist
+
+ class GetDataApi(BaseDataAPI):
+     def run(self):
+         answer_queue = queue.Queue()
+         promptlist = self.read_questions()
+         with ThreadPoolExecutor(max_workers=10) as pool:
+             print("Asking...")
+             futures = [pool.submit(self.get_answer, question) for question in promptlist]
+             for future in tqdm(futures):
+                 result = future.result()
+                 answer_queue.put(result)
+                 if answer_queue.qsize() >= 10:  # Save every 10 answers
+                     self.save([answer_queue.get() for _ in range(10)])
+
+         # Save the remaining answers
+         remaining = []
+         while not answer_queue.empty():
+             remaining.append(answer_queue.get())
+         if remaining:
+             self.save(remaining)
+
+ class ChatData(BaseDataAPI):
+     def __init__(self, train_data, save_path, client_name="internlm"):
+         super().__init__(train_data, save_path, client_name=client_name)
+         self.train_data = train_data
+
+     def load_data(self):
+         with open(self.train_data, 'r', encoding='utf-8') as f:
+             return f.readlines()
+
+     def ask_for_tts(self, question, save_ask):
+         chat_rsp = self.client.chat.completions.create(
+             model="internlm2.5-latest",  # or "glm-4"
+             messages=[
+                 {"role": "system", "content": base_id_prompt},
+                 {"role": "user", "content": question}
+             ],
+             stream=False,
+         )
+         return self.build_data(save_ask, chat_rsp)
+
+     def __call__(self):
+         train_data = self.load_data()
+         answer_queue = queue.Queue()
+         with ThreadPoolExecutor(max_workers=10) as pool:
+             print("Asking...")
+             futures = []
+             for item in train_data:
+                 item = json.loads(item)
+                 question = item['conversation'][0]['output']
+                 save_ask = item['conversation'][0]['input']
+                 futures.append(pool.submit(self.ask_for_tts, question, save_ask))
+
+             for future in tqdm(futures):
+                 result = future.result()
+                 answer_queue.put(result)
+                 if answer_queue.qsize() >= 10:  # Save every 10 answers
+                     self.save([answer_queue.get() for _ in range(10)])
+
+         # Save the remaining answers
+         remaining = []
+         while not answer_queue.empty():
+             remaining.append(answer_queue.get())
+         if remaining:
+             self.save(remaining)
+
+ if __name__ == '__main__':
+     questions_path = './tools/L1_XTuner_code/Q_list.txt'
+     save_path = './data/train_basic.jsonl'
+     start_time = time.time()
+     chat_data = GetDataApi(questions_path, save_path)
+     chat_data.run()
+     end_time = time.time()
+     print('Done')
+     print(f'Time used: {end_time - start_time:.2f} seconds')
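For reference, a sketch of the record that get_data.py appends to save_path, one JSON object per line. It mirrors data_template above; base_id_prompt is abbreviated and the answer text is invented for illustration.

    # Hypothetical record written by BaseDataAPI.save()
    record = {
        "conversation": [
            {
                "system": "# Role: 问答机器人 ...",   # base_id_prompt
                "input": "你知道你是人工智能吗?",      # a question from Q_list.txt
                "output": "model answer text"          # chat_rsp.choices[0].message.content
            }
        ]
    }

This is the same layout that change_script.py later rewrites line by line.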
L1_XTuner_code/xtuner_streamlit_demo.py ADDED
@@ -0,0 +1,292 @@
+ """This script refers to the dialogue example of streamlit, the interactive
+ generation code of chatglm2 and transformers.
+
+ We mainly modified part of the code logic to adapt to the
+ generation of our model.
+ Please refer to these links below for more information:
+     1. streamlit chat example:
+         https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
+     2. chatglm2:
+         https://github.com/THUDM/ChatGLM2-6B
+     3. transformers:
+         https://github.com/huggingface/transformers
+ Please run with the command `streamlit run path/to/web_demo.py
+     --server.address=0.0.0.0 --server.port 7860`.
+ Using `python path/to/web_demo.py` may cause unknown problems.
+ """
+ # isort: skip_file
+ import copy
+ import warnings
+ from dataclasses import asdict, dataclass
+ from typing import Callable, List, Optional
+
+ import streamlit as st
+ import torch
+ from torch import nn
+ from transformers.generation.utils import (LogitsProcessorList,
+                                            StoppingCriteriaList)
+ from transformers.utils import logging
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM  # isort: skip
+
+ logger = logging.get_logger(__name__)
+ model_name_or_path = "/root/finetune/work_dirs/assistTuner/merged"
+
+ @dataclass
+ class GenerationConfig:
+     # this config is used for chat to provide more diversity
+     max_length: int = 32768
+     top_p: float = 0.8
+     temperature: float = 0.8
+     do_sample: bool = True
+     repetition_penalty: float = 1.005
+
+
+ @torch.inference_mode()
+ def generate_interactive(
+     model,
+     tokenizer,
+     prompt,
+     generation_config: Optional[GenerationConfig] = None,
+     logits_processor: Optional[LogitsProcessorList] = None,
+     stopping_criteria: Optional[StoppingCriteriaList] = None,
+     prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor],
+                                                 List[int]]] = None,
+     additional_eos_token_id: Optional[int] = None,
+     **kwargs,
+ ):
+     inputs = tokenizer([prompt], padding=True, return_tensors='pt')
+     input_length = len(inputs['input_ids'][0])
+     for k, v in inputs.items():
+         inputs[k] = v.cuda()
+     input_ids = inputs['input_ids']
+     _, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
+     if generation_config is None:
+         generation_config = model.generation_config
+     generation_config = copy.deepcopy(generation_config)
+     model_kwargs = generation_config.update(**kwargs)
+     bos_token_id, eos_token_id = (  # noqa: F841  # pylint: disable=W0612
+         generation_config.bos_token_id,
+         generation_config.eos_token_id,
+     )
+     if isinstance(eos_token_id, int):
+         eos_token_id = [eos_token_id]
+     if additional_eos_token_id is not None:
+         eos_token_id.append(additional_eos_token_id)
+     has_default_max_length = kwargs.get(
+         'max_length') is None and generation_config.max_length is not None
+     if has_default_max_length and generation_config.max_new_tokens is None:
+         warnings.warn(
+             f"Using 'max_length''s default \
+                 ({repr(generation_config.max_length)}) \
+                 to control the generation length. "
+             'This behaviour is deprecated and will be removed from the \
+                 config in v5 of Transformers -- we'
+             ' recommend using `max_new_tokens` to control the maximum \
+                 length of the generation.',
+             UserWarning,
+         )
+     elif generation_config.max_new_tokens is not None:
+         generation_config.max_length = generation_config.max_new_tokens + \
+             input_ids_seq_length
+         if not has_default_max_length:
+             logger.warn(  # pylint: disable=W4902
+                 f"Both 'max_new_tokens' (={generation_config.max_new_tokens}) "
+                 f"and 'max_length'(={generation_config.max_length}) seem to "
+                 "have been set. 'max_new_tokens' will take precedence. "
+                 'Please refer to the documentation for more information. '
+                 '(https://huggingface.co/docs/transformers/main/'
+                 'en/main_classes/text_generation)',
+                 UserWarning,
+             )
+
+     if input_ids_seq_length >= generation_config.max_length:
+         input_ids_string = 'input_ids'
+         logger.warning(
+             f'Input length of {input_ids_string} is {input_ids_seq_length}, '
+             f"but 'max_length' is set to {generation_config.max_length}. "
+             'This can lead to unexpected behavior. You should consider'
+             " increasing 'max_new_tokens'.")
+
+     # 2. Set generation parameters if not already defined
+     logits_processor = logits_processor if logits_processor is not None \
+         else LogitsProcessorList()
+     stopping_criteria = stopping_criteria if stopping_criteria is not None \
+         else StoppingCriteriaList()
+
+     logits_processor = model._get_logits_processor(
+         generation_config=generation_config,
+         input_ids_seq_length=input_ids_seq_length,
+         encoder_input_ids=input_ids,
+         prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+         logits_processor=logits_processor,
+     )
+
+     stopping_criteria = model._get_stopping_criteria(
+         generation_config=generation_config,
+         stopping_criteria=stopping_criteria)
+     logits_warper = model._get_logits_warper(generation_config)
+
+     unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+     scores = None
+     while True:
+         model_inputs = model.prepare_inputs_for_generation(
+             input_ids, **model_kwargs)
+         # forward pass to get next token
+         outputs = model(
+             **model_inputs,
+             return_dict=True,
+             output_attentions=False,
+             output_hidden_states=False,
+         )
+
+         next_token_logits = outputs.logits[:, -1, :]
+
+         # pre-process distribution
+         next_token_scores = logits_processor(input_ids, next_token_logits)
+         next_token_scores = logits_warper(input_ids, next_token_scores)
+
+         # sample
+         probs = nn.functional.softmax(next_token_scores, dim=-1)
+         if generation_config.do_sample:
+             next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+         else:
+             next_tokens = torch.argmax(probs, dim=-1)
+
+         # update generated ids, model inputs, and length for next step
+         input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+         model_kwargs = model._update_model_kwargs_for_generation(
+             outputs, model_kwargs, is_encoder_decoder=False)
+         unfinished_sequences = unfinished_sequences.mul(
+             (min(next_tokens != i for i in eos_token_id)).long())
+
+         output_token_ids = input_ids[0].cpu().tolist()
+         output_token_ids = output_token_ids[input_length:]
+         for each_eos_token_id in eos_token_id:
+             if output_token_ids[-1] == each_eos_token_id:
+                 output_token_ids = output_token_ids[:-1]
+         response = tokenizer.decode(output_token_ids)
+
+         yield response
+         # stop when each sentence is finished
+         # or if we exceed the maximum length
+         if unfinished_sequences.max() == 0 or stopping_criteria(
+                 input_ids, scores):
+             break
+
+
+ def on_btn_click():
+     del st.session_state.messages
+
+
+ @st.cache_resource
+ def load_model():
+     model = (AutoModelForCausalLM.from_pretrained(
+         model_name_or_path,
+         trust_remote_code=True).to(torch.bfloat16).cuda())
+     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
+                                               trust_remote_code=True)
+     return model, tokenizer
+
+
+ def prepare_generation_config():
+     with st.sidebar:
+         max_length = st.slider('Max Length',
+                                min_value=8,
+                                max_value=32768,
+                                value=32768)
+         top_p = st.slider('Top P', 0.0, 1.0, 0.8, step=0.01)
+         temperature = st.slider('Temperature', 0.0, 1.0, 0.7, step=0.01)
+         st.button('Clear Chat History', on_click=on_btn_click)
+
+     generation_config = GenerationConfig(max_length=max_length,
+                                          top_p=top_p,
+                                          temperature=temperature)
+
+     return generation_config
+
+
+ user_prompt = '<|im_start|>user\n{user}<|im_end|>\n'
+ robot_prompt = '<|im_start|>assistant\n{robot}<|im_end|>\n'
+ cur_query_prompt = '<|im_start|>user\n{user}<|im_end|>\n\
+     <|im_start|>assistant\n'
+
+
+ def combine_history(prompt):
+     messages = st.session_state.messages
+     meta_instruction = ('You are a helpful, honest, '
+                         'and harmless AI assistant.')
+     total_prompt = f'<s><|im_start|>system\n{meta_instruction}<|im_end|>\n'
+     for message in messages:
+         cur_content = message['content']
+         if message['role'] == 'user':
+             cur_prompt = user_prompt.format(user=cur_content)
+         elif message['role'] == 'robot':
+             cur_prompt = robot_prompt.format(robot=cur_content)
+         else:
+             raise RuntimeError
+         total_prompt += cur_prompt
+     total_prompt = total_prompt + cur_query_prompt.format(user=prompt)
+     return total_prompt
+
+
+ def main():
+     st.title('internlm2_5-7b-chat-assistant')
+
+     # torch.cuda.empty_cache()
+     print('load model begin.')
+     model, tokenizer = load_model()
+     print('load model end.')
+
+     generation_config = prepare_generation_config()
+
+     # Initialize chat history
+     if 'messages' not in st.session_state:
+         st.session_state.messages = []
+
+     # Display chat messages from history on app rerun
+     for message in st.session_state.messages:
+         with st.chat_message(message['role'], avatar=message.get('avatar')):
+             st.markdown(message['content'])
+
+     # Accept user input
+     if prompt := st.chat_input('What is up?'):
+         # Display user message in chat message container
+         with st.chat_message('user', avatar='user'):
+             st.markdown(prompt)
+         real_prompt = combine_history(prompt)
+         # Add user message to chat history
+         st.session_state.messages.append({
+             'role': 'user',
+             'content': prompt,
+             'avatar': 'user'
+         })
+
+         with st.chat_message('robot', avatar='assistant'):
+             message_placeholder = st.empty()
+             for cur_response in generate_interactive(
+                     model=model,
+                     tokenizer=tokenizer,
+                     prompt=real_prompt,
+                     additional_eos_token_id=92542,
+                     device='cuda:0',
+                     **asdict(generation_config),
+             ):
+                 # Display robot response in chat message container
+                 message_placeholder.markdown(cur_response + '▌')
+             message_placeholder.markdown(cur_response)
+         # Add robot response to chat history
+         st.session_state.messages.append({
+             'role': 'robot',
+             'content': cur_response,  # pylint: disable=undefined-loop-variable
+             'avatar': 'assistant',
+         })
+         torch.cuda.empty_cache()
+
+
+ if __name__ == '__main__':
+     main()
+
data/.ipynb_checkpoints/change_script-checkpoint.py ADDED
@@ -0,0 +1,48 @@
+ import json
+ import argparse
+ from tqdm import tqdm
+
+ def process_line(line, old_text, new_text):
+     # Parse the JSON line
+     data = json.loads(line)
+
+     # Recursive helper for nested dicts and lists
+     def replace_text(obj):
+         if isinstance(obj, dict):
+             return {k: replace_text(v) for k, v in obj.items()}
+         elif isinstance(obj, list):
+             return [replace_text(item) for item in obj]
+         elif isinstance(obj, str):
+             return obj.replace(old_text, new_text)
+         else:
+             return obj
+
+     # Process the whole JSON object
+     processed_data = replace_text(data)
+
+     # Serialize the processed object back to a JSON string
+     return json.dumps(processed_data, ensure_ascii=False)
+
+ def main(input_file, output_file, old_text, new_text):
+     with open(input_file, 'r', encoding='utf-8') as infile, \
+          open(output_file, 'w', encoding='utf-8') as outfile:
+
+         # Count the total number of lines for the progress bar
+         total_lines = sum(1 for _ in infile)
+         infile.seek(0)  # Reset the file pointer to the beginning
+
+         # Use tqdm to show progress
+         for line in tqdm(infile, total=total_lines, desc="Processing"):
+             processed_line = process_line(line.strip(), old_text, new_text)
+             outfile.write(processed_line + '\n')
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Replace text in a JSONL file.")
+     parser.add_argument("input_file", help="Input JSONL file to process")
+     parser.add_argument("output_file", help="Output file for processed JSONL")
+     parser.add_argument("--old_text", default="尖米", help="Text to be replaced")
+     parser.add_argument("--new_text", default="简夕", help="Text to replace with")
+     args = parser.parse_args()
+
+     main(args.input_file, args.output_file, args.old_text, args.new_text)
+
data/assistant_Tuner.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/assistant_Tuner_change.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/change_script.py ADDED
@@ -0,0 +1,48 @@
+ import json
+ import argparse
+ from tqdm import tqdm
+
+ def process_line(line, old_text, new_text):
+     # Parse the JSON line
+     data = json.loads(line)
+
+     # Recursive helper for nested dicts and lists
+     def replace_text(obj):
+         if isinstance(obj, dict):
+             return {k: replace_text(v) for k, v in obj.items()}
+         elif isinstance(obj, list):
+             return [replace_text(item) for item in obj]
+         elif isinstance(obj, str):
+             return obj.replace(old_text, new_text)
+         else:
+             return obj
+
+     # Process the whole JSON object
+     processed_data = replace_text(data)
+
+     # Serialize the processed object back to a JSON string
+     return json.dumps(processed_data, ensure_ascii=False)
+
+ def main(input_file, output_file, old_text, new_text):
+     with open(input_file, 'r', encoding='utf-8') as infile, \
+          open(output_file, 'w', encoding='utf-8') as outfile:
+
+         # Count the total number of lines for the progress bar
+         total_lines = sum(1 for _ in infile)
+         infile.seek(0)  # Reset the file pointer to the beginning
+
+         # Use tqdm to show progress
+         for line in tqdm(infile, total=total_lines, desc="Processing"):
+             processed_line = process_line(line.strip(), old_text, new_text)
+             outfile.write(processed_line + '\n')
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Replace text in a JSONL file.")
+     parser.add_argument("input_file", help="Input JSONL file to process")
+     parser.add_argument("output_file", help="Output file for processed JSONL")
+     parser.add_argument("--old_text", default="尖米", help="Text to be replaced")
+     parser.add_argument("--new_text", default="简夕", help="Text to replace with")
+     args = parser.parse_args()
+
+     main(args.input_file, args.output_file, args.old_text, args.new_text)
+
internlm2_5_chat_7b_qlora_alpaca_e3_copy.py ADDED
@@ -0,0 +1,204 @@
+ SYSTEM = 'xtuner.utils.SYSTEM_TEMPLATE.alpaca'
+ accumulative_counts = 1
+ alpaca_en = dict(
+     dataset=dict(
+         data_files=dict(
+             train='/root/finetune/data/assistant_Tuner_change.jsonl'),
+         path='json',
+         type='datasets.load_dataset'),
+     dataset_map_fn=None,
+     max_length=2048,
+     pack_to_max_length=True,
+     remove_unused_columns=True,
+     shuffle_before_pack=True,
+     template_map_fn=dict(
+         template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat',
+         type='xtuner.dataset.map_fns.template_map_fn_factory'),
+     tokenizer=dict(
+         padding_side='right',
+         pretrained_model_name_or_path=
+         '/root/finetune/models/internlm2_5-7b-chat',
+         trust_remote_code=True,
+         type='transformers.AutoTokenizer.from_pretrained'),
+     type='xtuner.dataset.process_hf_dataset',
+     use_varlen_attn=False)
+ alpaca_en_path = '/root/finetune/data/assistant_Tuner_change.jsonl'
+ batch_size = 1
+ betas = (
+     0.9,
+     0.999,
+ )
+ custom_hooks = [
+     dict(
+         tokenizer=dict(
+             padding_side='right',
+             pretrained_model_name_or_path=
+             '/root/finetune/models/internlm2_5-7b-chat',
+             trust_remote_code=True,
+             type='transformers.AutoTokenizer.from_pretrained'),
+         type='xtuner.engine.hooks.DatasetInfoHook'),
+     dict(
+         evaluation_inputs=[
+             '请介绍一下你自己',
+             'Please introduce yourself',
+         ],
+         every_n_iters=500,
+         prompt_template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat',
+         system='xtuner.utils.SYSTEM_TEMPLATE.alpaca',
+         tokenizer=dict(
+             padding_side='right',
+             pretrained_model_name_or_path=
+             '/root/finetune/models/internlm2_5-7b-chat',
+             trust_remote_code=True,
+             type='transformers.AutoTokenizer.from_pretrained'),
+         type='xtuner.engine.hooks.EvaluateChatHook'),
+ ]
+ dataloader_num_workers = 0
+ default_hooks = dict(
+     checkpoint=dict(
+         by_epoch=False,
+         interval=500,
+         max_keep_ckpts=2,
+         type='mmengine.hooks.CheckpointHook'),
+     logger=dict(
+         interval=10,
+         log_metric_by_epoch=False,
+         type='mmengine.hooks.LoggerHook'),
+     param_scheduler=dict(type='mmengine.hooks.ParamSchedulerHook'),
+     sampler_seed=dict(type='mmengine.hooks.DistSamplerSeedHook'),
+     timer=dict(type='mmengine.hooks.IterTimerHook'))
+ env_cfg = dict(
+     cudnn_benchmark=False,
+     dist_cfg=dict(backend='nccl'),
+     mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))
+ evaluation_freq = 500
+ evaluation_inputs = [
+     '请介绍一下你自己',
+     'Please introduce yourself',
+ ]
+ launcher = 'none'
+ load_from = None
+ log_level = 'INFO'
+ log_processor = dict(by_epoch=False)
+ lr = 0.0002
+ max_epochs = 3
+ max_length = 2048
+ max_norm = 1
+ model = dict(
+     llm=dict(
+         pretrained_model_name_or_path=
+         '/root/finetune/models/internlm2_5-7b-chat',
+         quantization_config=dict(
+             bnb_4bit_compute_dtype='torch.float16',
+             bnb_4bit_quant_type='nf4',
+             bnb_4bit_use_double_quant=True,
+             llm_int8_has_fp16_weight=False,
+             llm_int8_threshold=6.0,
+             load_in_4bit=True,
+             load_in_8bit=False,
+             type='transformers.BitsAndBytesConfig'),
+         torch_dtype='torch.float16',
+         trust_remote_code=True,
+         type='transformers.AutoModelForCausalLM.from_pretrained'),
+     lora=dict(
+         bias='none',
+         lora_alpha=16,
+         lora_dropout=0.1,
+         r=64,
+         task_type='CAUSAL_LM',
+         type='peft.LoraConfig'),
+     type='xtuner.model.SupervisedFinetune',
+     use_varlen_attn=False)
+ optim_type = 'torch.optim.AdamW'
+ optim_wrapper = dict(
+     optimizer=dict(
+         betas=(
+             0.9,
+             0.999,
+         ),
+         lr=0.0002,
+         type='torch.optim.AdamW',
+         weight_decay=0),
+     type='DeepSpeedOptimWrapper')
+ pack_to_max_length = True
+ param_scheduler = [
+     dict(
+         begin=0,
+         by_epoch=True,
+         convert_to_iter_based=True,
+         end=0.09,
+         start_factor=1e-05,
+         type='mmengine.optim.LinearLR'),
+     dict(
+         begin=0.09,
+         by_epoch=True,
+         convert_to_iter_based=True,
+         end=3,
+         eta_min=0.0,
+         type='mmengine.optim.CosineAnnealingLR'),
+ ]
+ pretrained_model_name_or_path = '/root/finetune/models/internlm2_5-7b-chat'
+ prompt_template = 'xtuner.utils.PROMPT_TEMPLATE.internlm2_chat'
+ randomness = dict(deterministic=False, seed=None)
+ resume = False
+ runner_type = 'FlexibleRunner'
+ sampler = 'mmengine.dataset.DefaultSampler'
+ save_steps = 500
+ save_total_limit = 2
+ sequence_parallel_size = 1
+ strategy = dict(
+     config=dict(
+         bf16=dict(enabled=True),
+         fp16=dict(enabled=False, initial_scale_power=16),
+         gradient_accumulation_steps='auto',
+         gradient_clipping='auto',
+         train_micro_batch_size_per_gpu='auto',
+         zero_allow_untested_optimizer=True,
+         zero_force_ds_cpu_optimizer=False,
+         zero_optimization=dict(overlap_comm=True, stage=2)),
+     exclude_frozen_parameters=True,
+     gradient_accumulation_steps=1,
+     gradient_clipping=1,
+     sequence_parallel_size=1,
+     train_micro_batch_size_per_gpu=1,
+     type='xtuner.engine.DeepSpeedStrategy')
+ tokenizer = dict(
+     padding_side='right',
+     pretrained_model_name_or_path='/root/finetune/models/internlm2_5-7b-chat',
+     trust_remote_code=True,
+     type='transformers.AutoTokenizer.from_pretrained')
+ train_cfg = dict(max_epochs=3, type='xtuner.engine.runner.TrainLoop')
+ train_dataloader = dict(
+     batch_size=1,
+     collate_fn=dict(
+         type='xtuner.dataset.collate_fns.default_collate_fn',
+         use_varlen_attn=False),
+     dataset=dict(
+         dataset=dict(
+             data_files=dict(
+                 train='/root/finetune/data/assistant_Tuner_change.jsonl'),
+             path='json',
+             type='datasets.load_dataset'),
+         dataset_map_fn=None,
+         max_length=2048,
+         pack_to_max_length=True,
+         remove_unused_columns=True,
+         shuffle_before_pack=True,
+         template_map_fn=dict(
+             template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat',
+             type='xtuner.dataset.map_fns.template_map_fn_factory'),
+         tokenizer=dict(
+             padding_side='right',
+             pretrained_model_name_or_path=
+             '/root/finetune/models/internlm2_5-7b-chat',
+             trust_remote_code=True,
+             type='transformers.AutoTokenizer.from_pretrained'),
+         type='xtuner.dataset.process_hf_dataset',
+         use_varlen_attn=False),
+     num_workers=0,
+     sampler=dict(shuffle=True, type='mmengine.dataset.DefaultSampler'))
+ use_varlen_attn = False
+ visualizer = None
+ warmup_ratio = 0.03
+ weight_decay = 0
+ work_dir = './work_dirs/assistTuner'
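The file above is a dumped XTuner / mmengine config, with every component referenced by its import string. A quick way to sanity-check it, assuming mmengine is installed, is to load it with mmengine's Config class; the fields below are the ones defined in the file.

    from mmengine.config import Config

    cfg = Config.fromfile('internlm2_5_chat_7b_qlora_alpaca_e3_copy.py')
    print(cfg.pretrained_model_name_or_path)            # /root/finetune/models/internlm2_5-7b-chat
    print(cfg.max_epochs, cfg.lr, cfg.max_length)        # 3 0.0002 2048
    print(cfg.model.lora.r, cfg.model.lora.lora_alpha)   # 64 16

The config itself is consumed by XTuner's training entry point; the exact launch command used for this run is not recorded in the commit.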