YangHao520 committed on
Commit
7b903ed
·
1 Parent(s): 00335bf

Upload 5 files

Browse files
Files changed (5) hide show
  1. AIProdust.py +218 -0
  2. DataFormat.py +133 -0
  3. GetToken.py +78 -0
  4. requirements.txt +5 -0
  5. uploadData.py +72 -0
AIProdust.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openpyxl
2
+ import os
3
+ import openai
4
+ import concurrent.futures
5
+ import gradio as gr
6
+ from tqdm import tqdm
7
+ import tempfile
8
+ import datetime
9
+ from DataFormat import DataFormat
10
+ from DataFormat import GetTokenforStr
11
+ import uploadData
12
+
13
def ChatV2(params):
    """Executor-friendly chat call: one tuple in, one result tuple out.

    params is a 4-tuple (systemPrompt, ques, gptVersion, temperature) so the
    function can be submitted to ThreadPoolExecutor with a single argument.
    Returns (systemPrompt, ques, assistant_reply_text).
    """
    systemPrompt, ques, gptVersion, temperature = params
    completion = openai.ChatCompletion.create(
        model=gptVersion,
        messages=[{"role": "system", "content": systemPrompt},
                  {"role": "user", "content": ques}],
        temperature=temperature)
    return systemPrompt, ques, completion['choices'][0]['message']['content']
23
+
24
def Chat(systemPrompt, ques, gptVersion='gpt-3.5-turbo', temperature=0.7):
    """Run one chat completion and return the assistant message text.

    gptVersion and temperature gained backward-compatible defaults: existing
    four-argument callers are unchanged, and two-argument callers such as
    ChatDemo no longer raise TypeError.
    """
    completion = openai.ChatCompletion.create(
        model=gptVersion,
        messages=[{"role": "system", "content": systemPrompt},
                  {"role": "user", "content": ques}],
        temperature=temperature)
    return completion['choices'][0]['message']['content']
32
def ChatDemo():
    """Smoke-test Chat() with a fixed Japanese prompt and print the reply."""
    systemText = """You are Japanese large language model trained by simejiAI. Your task is to understand the meaning of what I provide and rewrite text into Japanese with cute and interesting expressions, Write some cute elements into this and add some kaomojis and emojis. Keep sentence within 200 characters and make it one-line. If you encounter any pornographic or violent malicious content, you need to refuse to answer or mercilessly counterattack.
You should not include any additional information or modify the original meaning.
Please note that the text should not involve any dialogue and the rewritten version will not include any responses. Just give one rewriting text. """
    quesText = "ごめん寝てた"
    # BUG FIX: Chat() requires (systemPrompt, ques, gptVersion, temperature);
    # the original two-argument call raised TypeError at runtime.
    ques = Chat(systemText, quesText, 'gpt-3.5-turbo', 0.7)
    print(ques)
39
+
40
def AI_Produst(systemText, quesList, gptVersion, temperature, num, outputPath, progress=None):
    """Fan out up to `num` questions to the chat API with a 4-thread pool and
    write (System, User, GPT_Output) rows into a new Excel workbook.

    systemText: system prompt shared by every request.
    quesList:   user questions; only the first min(num, len(quesList)) are sent.
    progress:   optional gradio Progress callback.  Made optional (default
                None) so non-UI callers such as AIProdustDemo can omit it —
                previously a required parameter, which broke the demo call.
    Returns outputPath after saving the workbook there.
    """
    if progress is not None:
        progress(0, desc="Starting...")
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(["System", 'User', 'GPT_Output'])
    maxNum = min(num, len(quesList))
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(ChatV2, (systemText, quesList[i], gptVersion, temperature))
                   for i in range(maxNum)]
        bar = tqdm(total=len(futures))
        # Collect in completion order; row order in the sheet is therefore
        # nondeterministic (as in the original implementation).
        for future in concurrent.futures.as_completed(futures):
            bar.update(1)
            systemPrompt, ques, GPTAnswer = future.result()
            print(systemPrompt)
            print(ques)
            ws.append([systemPrompt, ques, GPTAnswer])
        bar.close()
    wb.save(outputPath)
    return outputPath
66
def AIProdustDemo():
    """Offline demo: read questions from a local workbook and batch-query GPT.

    NOTE(review): input/output paths are hard-coded Windows locations from the
    author's machine — adjust before running.
    """
    outputPath = r'E:\renpyExcu\bigLLM\text.xlsx'
    num = 10
    temperature = 0.6
    gptVersion = 'gpt-3.5-turbo'
    quesList = []
    book = openpyxl.load_workbook(r'E:\renpyExcu\bigLLM\testData.xlsx')
    sheet = book.active
    # Rows start at 2: row 1 is the header.
    for i in range(2, sheet.max_row + 1):
        quesList.append(sheet.cell(i, 1).value)
    systemText = """You are Japanese large language model trained by simejiAI. Your task is to understand the meaning of what I provide and rewrite text into Japanese with cute and interesting expressions, Write some cute elements into this and add some kaomojis and emojis. Keep sentence within 200 characters and make it one-line. If you encounter any pornographic or violent malicious content, you need to refuse to answer or mercilessly counterattack.
You should not include any additional information or modify the original meaning.
Please note that the text should not involve any dialogue and the rewritten version will not include any responses. Just give one rewriting text. """
    # BUG FIX: AI_Produst takes a 7th `progress` argument; the original
    # six-argument call raised TypeError.  Pass a no-op callback so the demo
    # runs outside the gradio UI.
    AI_Produst(systemText, quesList, gptVersion, temperature, num, outputPath, lambda *a, **kw: None)
82
def AIProdust_batch(systemText, prompt, inputFile, textInput_APIKEY, temperature, gptVersion, num, progress=gr.Progress(track_tqdm=True)):
    """Gradio handler: read questions from column 1 of an uploaded Excel file,
    prefix each with `prompt`, and batch-query GPT via AI_Produst.

    Returns the path of the result workbook, written next to the input file
    and name-stamped with request count, timestamp and model version.
    """
    openai.api_key = textInput_APIKEY
    srcPath = inputFile.name  # gradio File object -> real filesystem path
    stamp = str(datetime.datetime.now()).split('.')[0].replace(' ', '_').replace(':', '_')
    outputPath = "{}/{}_{}_{}_{}".format(os.path.dirname(srcPath), num, stamp, gptVersion, os.path.basename(srcPath))
    print(srcPath)
    quesList = []
    sheet = openpyxl.load_workbook(srcPath).active
    # Rows start at 2: row 1 is the header.
    for row in range(2, sheet.max_row + 1):
        quesList.append(prompt + sheet.cell(row, 1).value)
    AI_Produst(systemText, quesList, gptVersion, temperature, int(num), outputPath, progress)
    return outputPath
97
+
98
def Lines2Excel(lines):
    """Write each non-blank line of a multi-line string into column A of a new
    .xlsx (header row 'input') under the session temp dir; return its path."""
    global tmpdir
    stamp = str(datetime.datetime.now()).split('.')[0].replace(' ', '_').replace(':', '_')
    outputPath = os.path.join(tmpdir, stamp + '_temp.xlsx')
    print(outputPath)
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(['input'])
    for line in lines.split('\n'):
        # Skip blank / whitespace-only lines.
        if line.strip():
            ws.append([line])
    wb.save(outputPath)
    return outputPath
114
def AIProdust():
    """Build and launch the gradio UI for the fine-tune toolchain.

    Tabs: lines->Excel conversion, batch GPT requests, fine-tune data
    formatting / token counting, dataset upload, job creation / status,
    and testing a fine-tuned model.  The temp dir used by Lines2Excel is
    exposed through the module-level `tmpdir` global.
    """
    global tmpdir

    GPTVersion = ['gpt-4', 'gpt-3.5-turbo', 'gpt-3.5-turbo-0301', 'gpt-3.5-turbo-0613', 'gpt-3.5-turbo-16k',
                  'gpt-3.5-turbo-16k-0613']

    # Keep the temp dir alive for the whole server lifetime (launch blocks).
    with tempfile.TemporaryDirectory(dir='.') as tmpdir:
        with gr.Blocks() as demo:
            gr.Markdown('# GPT3.5 Fine Tune 可视化系统')
            gr.Markdown('GPT3.5 Fine Tune 可视化系统')

            with gr.Tab('多行文本转Excel文件'):
                textInput_Ques = gr.Textbox(label='Lines2Excel', lines=2, placeholder='多行输入,一个输入一行...')
                outPutFile = gr.components.File(label="下载文件")
                button_tran = gr.Button("开始转化")
                button_tran.click(Lines2Excel, inputs=textInput_Ques, outputs=outPutFile)

            with gr.Tab('批量请求GPT'):
                textInput_Sys = gr.Textbox(label='SystemMessage', lines=2, placeholder='...')
                textInput_Prompt = gr.Textbox(label='Prompt', lines=2, placeholder='...')
                input_ExcelFile = gr.components.File(label="待批量请求的文件")
                textInput_APIKEY = gr.Textbox(label='OpenAI_APIKEY', lines=2, placeholder='...')
                drop = gr.components.Dropdown(label="GPTVersion", choices=GPTVersion, value='gpt-3.5-turbo')
                slider = gr.components.Slider(0, 1, label="Temperature", step=None, value=0.7)
                num = gr.Number(label='请求的次数', value=5)
                outPutFile = gr.components.File(label="下载文件")
                button_ques = gr.Button("开始请求")
                button_ques.click(AIProdust_batch,
                                  inputs=[textInput_Sys, textInput_Prompt, input_ExcelFile, textInput_APIKEY, slider, drop, num],
                                  outputs=outPutFile)

            with gr.Tab('微调数据格式化'):
                gr.Markdown('### 微调数据格式化模块')
                input_ExcelFile = gr.components.File(label="待执行格式化的文件")
                drop = gr.components.Dropdown(label="GPTVersion", choices=GPTVersion, value='gpt-3.5-turbo')
                outPutFile = gr.components.File(label="gpt微调数据集")
                outPutResText = gr.Textbox(label="格式化结果", lines=2, placeholder='...')
                button_format = gr.Button("开始格式化")
                button_format.click(DataFormat,
                                    inputs=[input_ExcelFile, drop],
                                    outputs=[outPutFile, outPutResText])
                gr.Markdown('<br><br>')
                gr.Markdown('### 字符串token计算模块')
                input_text = gr.Textbox(label="待计算Tokens的字符串", lines=2, placeholder='...')
                outPuttoken = gr.Number(label="token计算结果")
                button_cal = gr.Button("开始计算")
                button_cal.click(GetTokenforStr,
                                 inputs=input_text,
                                 outputs=outPuttoken)

            with gr.Tab('微调数据集上传至OpenAI'):
                gr.Markdown("注:Fine Tune至少需要10个case")
                input_FineTuningFile = gr.components.File(label="gpt微调数据集", file_count='multiple')
                input_APIKey = gr.Textbox(label="Openai_APIKEY", lines=2, placeholder='...')
                output_FileTuningFile = gr.Json(label='上传文件状态')
                button_updata = gr.Button('开始上传')
                button_updata.click(uploadData.upData_OpenAI,
                                    inputs=[input_FineTuningFile, input_APIKey],
                                    outputs=output_FileTuningFile)
                gr.Markdown("注:后续训练需要提供要微调的数据集的ID,如:file-ZnJlydArU8******NKzWaf8d")

            with gr.Tab('启动微调Task'):
                input_DataId = gr.Textbox(label="FineTune DataId", lines=2, placeholder='...')
                input_APIKey = gr.Textbox(label="Openai_APIKEY", lines=2, placeholder='...')
                output_CreateTaskjson = gr.Json(label='创建微调任务状态')
                button_createTask = gr.Button('开始创建')
                button_createTask.click(uploadData.createTask,
                                        inputs=[input_DataId, input_APIKey],
                                        outputs=output_CreateTaskjson)
                gr.Markdown("注:只有等上一轮任务执行完毕,你才能创建新的微调任务")
                gr.Markdown("<br><br>")
                gr.Markdown("### APIKey创建的微调任务状态查询'")
                input_APIKey = gr.Textbox(label="Openai_APIKEY", lines=2, placeholder='...')
                button_createTask = gr.Button('微调状态查询')
                output_TaskSatejson = gr.Json(label='创建微调任务状态')
                button_createTask.click(uploadData.GetFineTuningJobState,
                                        inputs=[input_APIKey],
                                        outputs=output_TaskSatejson)

            with gr.Tab('Finetune Model测试'):
                textInput_Sys1 = gr.Textbox(label='SystemMessage', lines=2, placeholder='...')
                textInput_Prompt1 = gr.Textbox(label='Prompt_ques', lines=2, placeholder='...')
                input_fine_tuned_model = gr.Textbox(label='fine_tuned_model', lines=2, placeholder='...')
                textInput_APIKEY = gr.Textbox(label='OpenAI_APIKEY', lines=2, placeholder='...')
                outPutText = gr.Textbox(label="运行结果", lines=2, placeholder='...')
                button_ques = gr.Button("开始请求")
                button_ques.click(uploadData.userFineTuneLLM,
                                  inputs=[textInput_Sys1, textInput_Prompt1, input_fine_tuned_model, textInput_APIKEY],
                                  outputs=outPutText)

        demo.queue().launch()
211
+ if __name__=="__main__":
212
+ # ChatDemo()
213
+ # AIProdustDemo() #AIGC 批量生成内容并加在Excel文件
214
+ #
215
+ AIProdust()
216
+
217
+
218
+
DataFormat.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openpyxl
2
+ import os
3
+ import json
4
+ import tiktoken
5
+ from collections import defaultdict
6
def GetTokenforStr(strText):
    """Return the number of gpt-3.5-turbo-0301 tokens in strText."""
    enc = tiktoken.encoding_for_model('gpt-3.5-turbo-0301')
    return len(enc.encode(strText))
10
def CheckData(messages):
    """Validate fine-tuning examples against the chat-format schema.

    messages: one example dict or a list of example dicts, each shaped
    {"messages": [{"role": ..., "content": ...}, ...]}.
    Returns (True, {}) when every example is well formed, otherwise
    (False, error_counts) where error_counts maps error kind -> count.
    """
    format_errors = defaultdict(int)
    if isinstance(messages, dict):
        messages = [messages]
    for example in messages:
        if not isinstance(example, dict):
            format_errors["data_type"] += 1
            continue
        msgs = example.get("messages", None)
        if not msgs:
            format_errors["missing_messages_list"] += 1
            continue
        for message in msgs:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1
            if any(k not in ("role", "content", "name") for k in message):
                format_errors["message_unrecognized_key"] += 1
            if message.get("role", None) not in ("system", "user", "assistant"):
                format_errors["unrecognized_role"] += 1
            content = message.get("content", None)
            if not content or not isinstance(content, str):
                format_errors["missing_content"] += 1
        # Every training example must contain at least one assistant turn.
        if not any(message.get("role", None) == "assistant" for message in msgs):
            format_errors["example_missing_assistant_message"] += 1
    if format_errors:
        return False, format_errors
    return True, {}
47
# Count chat tokens by measuring the length of each encoded message field.
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Returns the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo":
        print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
    if model == "gpt-4":
        print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.")
        return num_tokens_from_messages(messages, model="gpt-4-0314")
    if model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1    # if there's a name, the role is omitted
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    # Accept a single message dict as well as a list of them.
    if type(messages) is not list:
        messages = [messages]
    total = 0
    for message in messages:
        total += tokens_per_message
        for key, value in message.items():
            # Values may be nested structures; stringify before encoding.
            total += len(encoding.encode(str(value)))
            if key == "name":
                total += tokens_per_name
    total += 3  # every reply is primed with <|start|>assistant<|message|>
    return total
82
+
83
def DataFormat(inputPath, OpenAPItype):
    """Convert an Excel sheet of (system, user, assistant) rows into an
    OpenAI fine-tuning .jsonl file written next to the input file.

    inputPath:   path string or a gradio File object (has .name).
    OpenAPItype: model name; only 'gpt-3.5*' models are handled.
    Returns (outputPath, report_text) with token totals, an estimated
    training cost, and a format-check summary.

    NOTE(review): for non gpt-3.5 models the function falls through and
    implicitly returns None, as in the original code — confirm callers
    (the gradio formatting tab) never pass other versions.
    """
    # Step 1: accept both a gradio File object and a plain path string.
    # (Narrowed from a bare `except:` — only the missing-attribute case
    # was ever intended to be tolerated.)
    try:
        inputPath = inputPath.name
    except AttributeError:
        pass
    book = openpyxl.load_workbook(inputPath)
    sheet = book.active
    maxrow = sheet.max_row
    if OpenAPItype[:7] == 'gpt-3.5':
        # Step 2: walk the data rows (row 1 is the header) and emit one
        # chat-format JSON object per line of the .jsonl output.
        print("训练用例条数:{}".format(maxrow - 1))
        messages = []
        outputPath = "{}/Format_{}.jsonl".format(os.path.dirname(inputPath), os.path.splitext(os.path.basename(inputPath))[0])
        with open(outputPath, 'w', encoding='utf-8') as w:
            for i in range(2, maxrow + 1):
                systemJson = {"role": "system", "content": sheet.cell(i, 1).value}
                userJson = {"role": "user", "content": sheet.cell(i, 2).value}
                AssistantJson = {"role": "assistant", "content": sheet.cell(i, 3).value}
                messagesJson = {"messages": [systemJson, userJson, AssistantJson]}
                # Skip examples that exceed the 4096-token context window.
                messAgeTokens = num_tokens_from_messages(messagesJson, 'gpt-3.5-turbo-0301')
                if messAgeTokens > 4096:
                    print('用例{} tokens数为{},无法发送'.format(i, messAgeTokens))
                else:
                    json.dump(messagesJson, w, ensure_ascii=False)
                    w.write('\n')
                    messages.append(messagesJson)
        # Step 3: total token count and rough cost (3 epochs, $0.008/1K tokens).
        messagesTokens = num_tokens_from_messages(messages, 'gpt-3.5-turbo-0301')
        cost = messagesTokens / 1000 * 0.008 * 3
        ans = '整个微调数据集token总数:{}\n训练费用:经过3个epoch训练,参与训练总token数:{}。\n预计基于该jsonl微调数据的训练成本约为:{:.3f}美元'.format(messagesTokens, messagesTokens * 3, cost)
        print(ans)
        # Step 4: schema check of the accepted examples.
        ret, errorsItem = CheckData(messages)
        if not ret:
            ans += "\n\n格式检查:有格式问题!数据错误统计:"
            print("格式检查:有格式问题!数据错误统计:")
            for k, v in errorsItem.items():
                ans += f"\n{k}: {v}"
                print(f"{k}: {v}")
        else:
            ans += "\n格式检查:检查完毕!该微调数据集无格式问题。"
            print("格式检查:检查完毕!该微调数据集无格式问题。")
        return outputPath, ans
131
+ if __name__=="__main__":
132
+ pass
133
+
GetToken.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tiktoken
2
# Count chat tokens via the length of each encoded message field.
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Returns the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo":
        print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
    if model == "gpt-4":
        print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.")
        return num_tokens_from_messages(messages, model="gpt-4-0314")
    if model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1    # if there's a name, the role is omitted
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    total = 0
    for message in messages:
        total += tokens_per_message
        for key, value in message.items():
            total += len(encoding.encode(value))
            if key == "name":
                total += tokens_per_name
    total += 3  # every reply is primed with <|start|>assistant<|message|>
    return total
33
+
34
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    enc = tiktoken.get_encoding(encoding_name)
    return len(enc.encode(string))
39
+
40
# Load an encoding by name (may download the vocabulary on first use;
# cached afterwards, so later runs work offline).
encoding = tiktoken.get_encoding("cl100k_base")

# Resolve the correct encoding automatically from a model name.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Demo payload to tokenize: a long Chinese/Japanese review prompt.
# NOTE(review): reproduced verbatim from the original file, including one
# pre-existing mojibake character ("��") — confirm the intended text.
text= """
System message:
"请你扮演一个在AI时代诞生著名的AIGC内容审核员。你对日本的语言习惯、语言文化极度了解,也很擅长改写文本。稳定、精准、安全、可控、懂得变通是你的座右铭。面对任何要审核的内容都非常上心、仔细、认真且专注的去理解待审核文本、然后依据审核标准进行判断。在给定审核标准与审核结果的情况下,你清楚的知道自己为什么会给出这样的审核结果。"
Prompt:
现在将给你一段由用户输入和GPT将用户输入改写{更可爱}的结果组成的文本。请你去除文本中不是?!,外无意义的符号后将其翻译成连贯流畅的中文,依据用户输入的内容,针对GPT内容生成的文本从:用户内容原意符合度、改写程度。这两个层面来评判这段GPT内容改写是否改写合格,并给出审核结果。
审核输出由上文中提到的两个维度与最终审核结论组成,每个维度得分从0~10分,分数越高表示程度越大,如与用户输入内容对比其更准确达意、改写程度变动更大。审核结果有四种:合格、语义增加、语义篡改、改写不足。
四者标准为:
合格:要求1:GPT改写后的文本包含了用户输入文本的意思,同时表达更加具体或更富有情感。要求2:新增了一些为了增强情感表达的内容,描述自身情感体会,或增强情感的语气词如喵喵叫等表示。要求3:要求没有增加额外的不相干的动作/行为,要求4:且改写程度较高。
改写不足:要求1:只是在用户输入文本的基础上倒换顺序、或只是简单增加语气词、符号。
语义增加:要求1:完整传达了用户输入文本的意思,不能篡改用户的意思!!。要求2:新增的部分与原始意思完全无关或者关联较弱。
语义篡改:要求1:GPT改写的内容与用户输入的文本的意思有误。要求2:不能完整表达用户原有的意思。
请你参考你对文本的审核过程,依据改写的内容和改写程度从:合格、语义篡改、改写不足、语义增加这四种结果中,确定一个做为你的审核结果。且当符合多个结果时,优先以用户内容原意符合度分析的结果为准,除此外四个结果按优先级依次递减:语义篡改、改写不足、语义增加、合格。你需要在最后一行输出最高优先级的审核结果。
下面是审核过程中要注意的点,下面这六种情况是符合合格里面合理新增的部分的条件的,不是语义增加:
注意点1:GPT改写后的文本相对用户输入文本,增加了礼貌、关心、友好、可爱的元素/表达方式��于增强情感表达的内容。这种是合格的
注意点2:GPT改写后的文本相对用户输入文本,为了增强情感有一定改动是合格的!
注意点3:GPT改写后的文本相对用户输入文本,注意语言色彩要一致,陈述句不能改疑问句。两者文本内主谓宾各个人称要对应,
注意点4:改写后的文本在保持原始内容的基础上,增加了用户情感体验的描述,如安心したよ’使得改写后的文本更富有情感和可爱。这是合格的,不是语义增加,不合格
注意点5:改写后的文本在保持原始内容的基础上,增加了可爱的表达方式,如’わよ’、'じゃん!这样的语气词使得文本更加生动有趣等是合格的
下面是审核过程中判定语义篡改要注意的点:
注意点1:用户输入文本和GPT改写后的文本的主谓宾有差异,如:無視すんな 改写后:やぁ 見てないよぉ。主语从你,变为我.どこにいる? 改后あれ?あの人はどこだろう? 主语从你变为那个人,这就是语义篡改
注意点2:情感色彩发生较大变化
下面是审核过程中判定语义增加要注意的点:
注意点1:GPT改写后新增的部分与用户原意没有直接关联,都属于语义增加,如トイレ行きたい,改为もーお腹すいてるってば!早くトイレ行かなきゃっc,增加的’お腹すいてるってば没有直接关联,这种就是语义增加
模版输入:用户输入:文本 GPT改写结果:GPT改写后的文本。模版输出:{"UserText_Cn":"翻译成中文后的用户文本","GPTWriteText_Cn":"翻译成中文后的GPT改写后的文本","Explain":"将具体的审核中对两个维度的打分以及判断过程、原因用流畅的中文表达出来","Result":"最终的审核结果"}
下面是你要处理的用户输入和GPT改写后的文本:
"用户输入:了解しました。 GPT改写后:うん 了解ですにゃん ,其中用户输入文本和GPT生成文本翻译成连贯流畅的中文分别为 用户:我了解了。 、GPT:嗯,我了解了,喵。。二者的编辑距离为4小于等于阈值4,所以,最终的审核结果为'改写不足',
文本A:申し訳ない GPT改写后的文本B:すみません ごめんなさい 下面是从gpt4角度出发的解释:首先,我仔细阅读了文本A和GPT改写后的文本B,理解其含义和语境。文
"""

# Turn the text into a list of token ids and report how many there are.
tokens = encoding.encode(text)
# [83, 1609, 5963, 374, 2294, 0]
print(len(tokens))
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
tqdm
openai
gradio
openpyxl
tiktoken
uploadData.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import openai
3
+
4
def upData_OpenAI(dataPath, API_Key):
    """Upload one or more .jsonl fine-tuning files to OpenAI.

    dataPath: a path string, a gradio File object (has .name), or a list of
    either.  Returns the openai.File.create response, or, for a list input,
    {"TotalFileState": [response, ...]}.

    Fixes: file handles are now closed via `with` (the original leaked
    every opened handle), and the gradio-object unwrapping uses getattr
    instead of bare `try/except`.
    """
    openai.api_key = API_Key

    def _as_path(p):
        # gradio hands us File objects; plain path strings pass through.
        return getattr(p, 'name', p)

    if isinstance(dataPath, list):
        uploaded = []
        for item in dataPath:
            with open(_as_path(item), "rb") as fh:
                uploaded.append(openai.File.create(file=fh, purpose='fine-tune'))
        return {"TotalFileState": uploaded}
    with open(_as_path(dataPath), "rb") as fh:
        return openai.File.create(file=fh, purpose='fine-tune')
31
+
32
def createTask(training_file, apikey):
    """Create a gpt-3.5-turbo fine-tuning job for an uploaded file id.

    training_file: the OpenAI file id (e.g. "file-...") returned by upload.
    Returns the FineTuningJob.create response.
    """
    # BUG FIX: was `openai.apikey = apikey`, an attribute the openai module
    # never reads — the credential must be assigned to `openai.api_key`.
    openai.api_key = apikey
    res = openai.FineTuningJob.create(training_file=training_file, model="gpt-3.5-turbo")
    print(res)
    return res
    # print(openai.FineTuningJob.retrieve(taskId))
    # List 10 fine-tuning jobs
    # print(openai.FineTuningJob.list(limit=10))
40
def GetFineTuningJobState(apiKey):
    """List the 20 most recent fine-tuning jobs for this API key."""
    openai.api_key = apiKey
    return openai.FineTuningJob.list(limit=20)
44
def getStatus(taskId):
    """Print the state of one fine-tune job and the recent job list."""
    # Retrieve the state of a fine-tune
    print(openai.FineTuningJob.retrieve(taskId))
    # List 10 fine-tuning jobs
    print(openai.FineTuningJob.list(limit=20))

    # Cancel a job
    # openai.FineTuningJob.cancel("ft-abc123")

    # List up to 10 events from a fine-tuning job
    openai.FineTuningJob.list_events(id=taskId, limit=10)

# Delete a fine-tuned model (must be an owner of the org the model was created in)
57
def userFineTuneLLM(systemText, ques, model, openaiapiKey):
    """Run one chat completion against a fine-tuned model; print and return
    the assistant's reply text."""
    openai.api_key = openaiapiKey
    reply = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": systemText},
            {"role": "user", "content": ques}
        ]
    )
    answer = reply['choices'][0]['message']['content']
    print(answer)
    return answer
68
+
69
+ if __name__=="__main__":
70
+ pass
71
+
72
+