Spaces:
Runtime error
Runtime error
Commit
·
7b903ed
1
Parent(s):
00335bf
Upload 5 files
Browse files- AIProdust.py +218 -0
- DataFormat.py +133 -0
- GetToken.py +78 -0
- requirements.txt +5 -0
- uploadData.py +72 -0
AIProdust.py
ADDED
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openpyxl
|
2 |
+
import os
|
3 |
+
import openai
|
4 |
+
import concurrent.futures
|
5 |
+
import gradio as gr
|
6 |
+
from tqdm import tqdm
|
7 |
+
import tempfile
|
8 |
+
import datetime
|
9 |
+
from DataFormat import DataFormat
|
10 |
+
from DataFormat import GetTokenforStr
|
11 |
+
import uploadData
|
12 |
+
|
13 |
+
def ChatV2(params):
    """Thread-pool-friendly chat call.

    Takes a single tuple so it can be handed to ``executor.submit`` easily:
    ``params = (systemPrompt, ques, gptVersion, temperature)``.

    Returns ``(systemPrompt, ques, answer_text)`` so the caller can match a
    reply with its inputs even when futures complete out of order.
    """
    system_prompt, question, model_name, temp = params
    request_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
    response = openai.ChatCompletion.create(
        model=model_name,
        messages=request_messages,
        temperature=temp,
    )
    answer = response['choices'][0]['message']['content']
    return system_prompt, question, answer
|
23 |
+
|
24 |
+
def Chat(systemPrompt, ques, gptVersion, temperature):
    """Send one system+user chat request to OpenAI and return the reply text.

    systemPrompt: system-role instruction string.
    ques: user-role question string.
    gptVersion: model name, e.g. 'gpt-3.5-turbo' or 'gpt-4'.
    temperature: sampling temperature passed straight to the API.
    """
    conversation = [
        {"role": "system", "content": systemPrompt},
        {"role": "user", "content": ques},
    ]
    response = openai.ChatCompletion.create(
        model=gptVersion,
        messages=conversation,
        temperature=temperature,
    )
    return response['choices'][0]['message']['content']
|
32 |
+
def ChatDemo():
    """Smoke test: send one hard-coded Japanese rewrite request and print the reply.

    Bug fix: Chat() takes four positional arguments
    (systemPrompt, ques, gptVersion, temperature) but was called with only
    two, so this demo raised TypeError before any request was sent.
    """
    systemText = """You are Japanese large language model trained by simejiAI. Your task is to understand the meaning of what I provide and rewrite text into Japanese with cute and interesting expressions, Write some cute elements into this and add some kaomojis and emojis. Keep sentence within 200 characters and make it one-line. If you encounter any pornographic or violent malicious content, you need to refuse to answer or mercilessly counterattack.
You should not include any additional information or modify the original meaning.
Please note that the text should not involve any dialogue and the rewritten version will not include any responses. Just give one rewriting text. """
    quesText = "ごめん寝てた"
    # Supply the model and temperature Chat() requires.
    ques = Chat(systemText, quesText, 'gpt-3.5-turbo', 0.7)
    print(ques)
|
39 |
+
|
40 |
+
def AI_Produst(systemText, quesList, gptVersion, temperature, num, outputPath, progress=None):
    """Fan questions out to the chat API with a thread pool and save an xlsx report.

    systemText: system prompt shared by every request.
    quesList: list of user-prompt strings.
    gptVersion / temperature: forwarded to ChatV2 for each request.
    num: maximum number of questions to send (capped at len(quesList)).
    outputPath: where the result workbook (System / User / GPT_Output columns)
        is saved; returned to the caller.
    progress: optional gradio progress callback. Backward-compatible fix: it
        now defaults to None so non-UI callers (e.g. AIProdustDemo, which
        previously raised TypeError by omitting it) can use this function.
    """
    if progress is not None:
        progress(0, desc="Starting...")
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(["System", 'User', 'GPT_Output'])
    maxNum = min(num, len(quesList))
    # 4 workers keeps request concurrency modest to avoid API rate limits.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(ChatV2, (systemText, quesList[i], gptVersion, temperature))
            for i in range(maxNum)
        ]
        # tqdm is tracked by gradio (track_tqdm=True in the UI handler).
        bar = tqdm(total=len(futures))
        for future in concurrent.futures.as_completed(futures):
            bar.update(1)
            # Rows are appended in completion order, not submission order;
            # ChatV2 echoes the prompts back so each row stays consistent.
            systemPrompt, ques, GPTAnswer = future.result()
            print(systemPrompt)
            print(ques)
            sheet.append([systemPrompt, ques, GPTAnswer])
        bar.close()
    workbook.save(outputPath)
    return outputPath
|
66 |
+
def AIProdustDemo():
    """CLI demo: read questions from a local xlsx and batch-query the chat API.

    Bug fix: AI_Produst() requires a progress callback as its 7th argument but
    was called with only six, raising TypeError. A no-op callback is passed so
    the demo runs outside the gradio UI.

    NOTE(review): paths are hard-coded Windows paths — adjust before running.
    """
    outputPath = r'E:\renpyExcu\bigLLM\text.xlsx'
    num = 10
    temperature = 0.6
    gptVersion = 'gpt-3.5-turbo'
    quesList = []
    book = openpyxl.load_workbook(r'E:\renpyExcu\bigLLM\testData.xlsx')
    sheet = book.active
    maxnum = sheet.max_row
    # Row 1 is assumed to be a header; questions start at row 2, column 1.
    for i in range(2, maxnum + 1):
        quesList.append(sheet.cell(i, 1).value)
    systemText = """You are Japanese large language model trained by simejiAI. Your task is to understand the meaning of what I provide and rewrite text into Japanese with cute and interesting expressions, Write some cute elements into this and add some kaomojis and emojis. Keep sentence within 200 characters and make it one-line. If you encounter any pornographic or violent malicious content, you need to refuse to answer or mercilessly counterattack.
You should not include any additional information or modify the original meaning.
Please note that the text should not involve any dialogue and the rewritten version will not include any responses. Just give one rewriting text. """
    # No-op progress callback: satisfies the required parameter without a UI.
    AI_Produst(systemText, quesList, gptVersion, temperature, num, outputPath,
               lambda *args, **kwargs: None)
|
82 |
+
def AIProdust_batch(systemText, prompt, inputFile, textInput_APIKEY, temperature, gptVersion, num, progress=gr.Progress(track_tqdm=True)):
    """Gradio handler: batch-query GPT with rows from an uploaded xlsx.

    systemText: system prompt for every request.
    prompt: prefix prepended to each spreadsheet cell to form the user prompt.
    inputFile: gradio File wrapper (has a .name attribute with the temp path).
    textInput_APIKEY: OpenAI API key entered in the UI.
    temperature / gptVersion: request parameters.
    num: number of requests (arrives as a float from gr.Number).
    progress: gradio progress tracker (tracks the tqdm bar in AI_Produst).

    Returns the path of the result workbook for the download widget.
    """
    openai.api_key = textInput_APIKEY
    inputFile = inputFile.name  # unwrap gradio File to a real path
    # Fix: convert num BEFORE embedding it in the file name; gr.Number hands
    # over a float, which previously produced names like "5.0_...xlsx".
    num = int(num)
    nowTime = str(datetime.datetime.now()).split('.')[0].replace(' ', '_').replace(':', '_')
    outputPath = "{}/{}_{}_{}_{}".format(os.path.dirname(inputFile), num, nowTime, gptVersion, os.path.basename(inputFile))
    print(inputFile)
    quesList = []
    book = openpyxl.load_workbook(inputFile)
    sheet = book.active
    maxnum = sheet.max_row
    # Row 1 is a header; questions are in column 1 from row 2 down.
    for i in range(2, maxnum + 1):
        cell_value = sheet.cell(i, 1).value
        # Fix: blank rows yield None and `prompt + None` raised TypeError;
        # str() also tolerates numeric cells.
        if cell_value is not None:
            quesList.append(prompt + str(cell_value))
    AI_Produst(systemText, quesList, gptVersion, temperature, num, outputPath, progress)
    return outputPath
|
97 |
+
|
98 |
+
def Lines2Excel(lines):
    """Convert a multi-line textbox string into a one-column xlsx file.

    Writes an 'input' header row, then one row per non-blank input line, into
    the app-wide temp directory (module-level ``tmpdir`` set by AIProdust()).
    Returns the saved file path for the gradio download widget.
    """
    global tmpdir
    # Timestamp like 2024-01-02_03_04_05 keeps generated file names unique.
    stamp = str(datetime.datetime.now()).split('.')[0].replace(' ', '_').replace(':', '_')
    target = os.path.join(tmpdir, stamp + '_temp.xlsx')
    print(target)
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(['input'])
    # One spreadsheet row per non-blank line; blank/whitespace lines dropped.
    for row_text in lines.split('\n'):
        if row_text.strip():
            sheet.append([row_text])
    workbook.save(target)
    return target
|
114 |
+
def AIProdust():
    """Build and launch the gradio app: tabs for Excel conversion, batch GPT
    requests, fine-tune data formatting, upload, job control and model test."""
    # tmpdir is shared with Lines2Excel() via the module-level global.
    global tmpdir

    # Models offered in the version dropdowns below.
    GPTVersion = ['gpt-4', 'gpt-3.5-turbo', 'gpt-3.5-turbo-0301', 'gpt-3.5-turbo-0613', 'gpt-3.5-turbo-16k',
                  'gpt-3.5-turbo-16k-0613']

    # The temp dir lives for as long as launch() blocks; generated xlsx files
    # are written here and cleaned up when the app stops.
    with tempfile.TemporaryDirectory(dir='.') as tmpdir:
        with gr.Blocks() as demo:
            gr.Markdown('# GPT3.5 Fine Tune 可视化系统')
            gr.Markdown('GPT3.5 Fine Tune 可视化系统')
            # Tab 1: paste multi-line text -> downloadable one-column xlsx.
            with gr.Tab('多行文本转Excel文件'):
                textInput_Ques = gr.Textbox(label='Lines2Excel', lines=2, placeholder='多行输入,一个输入一行...')
                outPutFile = gr.components.File(label="下载文件")
                button_tran = gr.Button("开始转化")
                button_tran.click(Lines2Excel, inputs=textInput_Ques, outputs=outPutFile)

            # Tab 2: batch-send spreadsheet rows to the chat API.
            with gr.Tab('批量请求GPT'):
                textInput_Sys = gr.Textbox(label='SystemMessage', lines=2, placeholder='...')
                textInput_Prompt = gr.Textbox(label='Prompt', lines=2, placeholder='...')
                input_ExcelFile = gr.components.File(label="待批量请求的文件")
                textInput_APIKEY = gr.Textbox(label='OpenAI_APIKEY', lines=2, placeholder='...')
                drop = gr.components.Dropdown(label="GPTVersion", choices=GPTVersion,
                                              value='gpt-3.5-turbo')
                slider = gr.components.Slider(0, 1, label="Temperature", step=None, value=0.7)

                num = gr.Number(label='请求的次数', value=5)
                outPutFile = gr.components.File(label="下载文件")
                button_ques = gr.Button("开始请求")
                button_ques.click(AIProdust_batch, inputs=[textInput_Sys, textInput_Prompt, input_ExcelFile, textInput_APIKEY, slider, drop, num], outputs=outPutFile)

            # Tab 3: xlsx -> fine-tune .jsonl, plus a token-count helper.
            with gr.Tab('微调数据格式化'):
                gr.Markdown('### 微调数据格式化模块')
                input_ExcelFile = gr.components.File(label="待执行格式化的文件")
                drop = gr.components.Dropdown(label="GPTVersion", choices=GPTVersion,
                                              value='gpt-3.5-turbo')
                outPutFile = gr.components.File(label="gpt微调数据集")
                outPutResText = gr.Textbox(label="格式化结果", lines=2, placeholder='...')
                button_format = gr.Button("开始格式化")
                button_format.click(DataFormat,
                                    inputs=[input_ExcelFile, drop],
                                    outputs=[outPutFile, outPutResText])
                gr.Markdown('<br><br>')
                gr.Markdown('### 字符串token计算模块')
                input_text = gr.Textbox(label="待计算Tokens的字符串", lines=2, placeholder='...')

                outPuttoken = gr.Number(label="token计算结果")
                button_cal = gr.Button("开始计算")
                button_cal.click(GetTokenforStr,
                                 inputs=input_text,
                                 outputs=outPuttoken)
            # Tab 4: upload one or more .jsonl datasets to OpenAI.
            with gr.Tab('微调数据集上传至OpenAI'):
                gr.Markdown("注:Fine Tune至少需要10个case")
                input_FineTuningFile = gr.components.File(label="gpt微调数据集", file_count='multiple')
                input_APIKey = gr.Textbox(label="Openai_APIKEY", lines=2, placeholder='...')
                output_FileTuningFile = gr.Json(label='上传文件状态')

                button_updata = gr.Button('开始上传')
                button_updata.click(uploadData.upData_OpenAI,
                                    inputs=[input_FineTuningFile, input_APIKey],
                                    outputs=output_FileTuningFile)

                gr.Markdown("注:后续训练需要提供要微调的数据集的ID,如:file-ZnJlydArU8******NKzWaf8d")

            # Tab 5: create a fine-tuning job and query job status.
            with gr.Tab('启动微调Task'):
                input_DataId = gr.Textbox(label="FineTune DataId", lines=2, placeholder='...')
                input_APIKey = gr.Textbox(label="Openai_APIKEY", lines=2, placeholder='...')

                output_CreateTaskjson = gr.Json(label='创建微调任务状态')
                button_createTask = gr.Button('开始创建')
                button_createTask.click(uploadData.createTask,
                                        inputs=[input_DataId, input_APIKey],
                                        outputs=output_CreateTaskjson)

                gr.Markdown("注:只有等上一轮任务执行完毕,你才能创建新的微调任务")
                gr.Markdown("<br><br>")
                gr.Markdown("### APIKey创建的微调任务状态查询'")
                # NOTE(review): input_APIKey and button_createTask are re-bound
                # here; gradio keeps the earlier widgets alive, but distinct
                # names would be clearer.
                input_APIKey = gr.Textbox(label="Openai_APIKEY", lines=2, placeholder='...')
                button_createTask = gr.Button('微调状态查询')
                output_TaskSatejson = gr.Json(label='创建微调任务状态')
                button_createTask.click(uploadData.GetFineTuningJobState,
                                        inputs=[input_APIKey],
                                        outputs=output_TaskSatejson)
            # Tab 6: one-off chat against a fine-tuned model.
            with gr.Tab('Finetune Model测试'):

                textInput_Sys1 = gr.Textbox(label='SystemMessage', lines=2, placeholder='...')
                textInput_Prompt1 = gr.Textbox(label='Prompt_ques', lines=2, placeholder='...')

                input_fine_tuned_model = gr.Textbox(label='fine_tuned_model', lines=2, placeholder='...')
                textInput_APIKEY = gr.Textbox(label='OpenAI_APIKEY', lines=2, placeholder='...')

                outPutText = gr.Textbox(label="运行结果", lines=2, placeholder='...')
                button_ques = gr.Button("开始请求")
                button_ques.click(uploadData.userFineTuneLLM,
                                  inputs=[textInput_Sys1, textInput_Prompt1, input_fine_tuned_model, textInput_APIKEY], outputs=outPutText)

        # launch() blocks inside the TemporaryDirectory context so tmpdir
        # stays valid while the app serves requests.
        demo.queue().launch()
|
211 |
+
if __name__=="__main__":
    # ChatDemo()
    # AIProdustDemo()  # AIGC: batch-generate content and append it to an Excel file
    #
    AIProdust()
|
216 |
+
|
217 |
+
|
218 |
+
|
DataFormat.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openpyxl
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
import tiktoken
|
5 |
+
from collections import defaultdict
|
6 |
+
def GetTokenforStr(strText):
    """Count the tokens in *strText* using the gpt-3.5-turbo-0301 encoding.

    Used by the gradio token-counter tab; returns a plain int.
    """
    enc = tiktoken.encoding_for_model('gpt-3.5-turbo-0301')
    return len(enc.encode(strText))
|
10 |
+
def CheckData(messages):
    """Validate chat fine-tuning examples against the OpenAI message schema.

    messages: a list of example dicts (each with a "messages" list of
        {"role", "content"[, "name"]} turns), or a single example dict.
    Returns (True, {}) when clean, otherwise (False, error_counts) where
    error_counts maps error-category name -> occurrence count.
    """
    format_errors = defaultdict(int)
    # Accept a single example dict as well as a list of examples.
    if isinstance(messages, dict):
        messages = [messages]
    for ex in messages:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        # Fix: this value used to be assigned back to `messages`, shadowing
        # the argument being iterated; a distinct name avoids the confusion.
        example_messages = ex.get("messages", None)
        if not example_messages:
            format_errors["missing_messages_list"] += 1
            continue

        for message in example_messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in ("role", "content", "name") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            if not content or not isinstance(content, str):
                format_errors["missing_content"] += 1

        # Every training example needs at least one assistant turn.
        if not any(message.get("role", None) == "assistant" for message in example_messages):
            format_errors["example_missing_assistant_message"] += 1
    if format_errors:
        return False, format_errors
    else:
        return True, {}
|
47 |
+
# Token counting: the length of the list returned by encoding.encode().
|
48 |
+
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Returns the number of tokens used by a list of messages.

    Adapted from the OpenAI cookbook; accepts either a list of message dicts
    or a single dict (wrapped into a list). Non-string values are passed
    through str() before encoding.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo":
        print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
    elif model == "gpt-4":
        print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.")
        return num_tokens_from_messages(messages, model="gpt-4-0314")
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    num_tokens = 0
    # Idiom fix: was `type(messages) != type([1])`; isinstance is the
    # canonical check and behaves identically for the inputs used here.
    if not isinstance(messages, list):
        messages = [messages]
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            # str() guards against non-string payloads (e.g. nested lists or None).
            value = str(value)
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens
|
82 |
+
|
83 |
+
def DataFormat(inputPath, OpenAPItype):
    """Convert an xlsx of (system, user, assistant) rows into a fine-tune .jsonl.

    inputPath: gradio File wrapper (with .name) or a plain path string.
    OpenAPItype: model name selected in the UI; only gpt-3.5 models supported.

    Writes the .jsonl next to the input file, estimates 3-epoch training cost,
    validates the dataset with CheckData(), and returns (jsonl_path, summary).

    Bug fix: when OpenAPItype was not a gpt-3.5 model, the function previously
    fell through to `return outputPath, ans` with both names undefined,
    raising NameError; it now returns an explicit error message.
    """
    # 一、load the file that holds user inputs and GPT outputs.
    # Unwrap the gradio File wrapper; plain strings pass through unchanged
    # (replaces the previous bare try/except).
    inputPath = getattr(inputPath, 'name', inputPath)
    book = openpyxl.load_workbook(inputPath)
    sheet = book.active
    maxrow = sheet.max_row
    if OpenAPItype[:7] != 'gpt-3.5':
        # Explicit failure instead of falling through to undefined names.
        return None, 'Only gpt-3.5 models are supported for fine-tune formatting, got: {}'.format(OpenAPItype)
    # 二、walk the rows and format them into the fine-tuning message schema.
    print("训练用例条数:{}".format(maxrow - 1))
    messages = []
    outputPath = "{}/Format_{}.jsonl".format(os.path.dirname(inputPath), os.path.splitext(os.path.basename(inputPath))[0])  # destination of the formatted dataset
    with open(outputPath, 'w', encoding='utf-8') as w:
        # Row 1 is a header; columns are system / user / assistant.
        for i in range(2, maxrow + 1):
            systemJson = {"role": "system", "content": sheet.cell(i, 1).value}
            userJson = {"role": "user", "content": sheet.cell(i, 2).value}
            AssistantJson = {"role": "assistant", "content": sheet.cell(i, 3).value}
            messagesJson = {"messages": [systemJson, userJson, AssistantJson]}

            # Skip any example that exceeds the 4096-token request limit.
            messAgeTokens = num_tokens_from_messages(messagesJson, 'gpt-3.5-turbo-0301')
            if messAgeTokens > 4096:
                print('用例{} tokens数为{},无法发送'.format(i, messAgeTokens))
            else:
                json.dump(messagesJson, w, ensure_ascii=False)
                w.write('\n')
                messages.append(messagesJson)
    messagesTokens = num_tokens_from_messages(messages, 'gpt-3.5-turbo-0301')
    # $0.008 per 1K training tokens, times 3 epochs.
    cost = messagesTokens / 1000 * 0.008 * 3

    ans = '整个微调数据集token总数:{}\n训练费用:经过3个epoch训练,参与训练总token数:{}。\n预计基于该jsonl微调数据的训练成本约为:{:.3f}美元'.format(messagesTokens, messagesTokens * 3, cost)
    print(ans)
    ret, errorsItem = CheckData(messages)
    if not ret:
        ans += "\n\n格式检查:有格式问题!数据错误统计:"
        print("格式检查:有格式问题!数据错误统计:")
        for k, v in errorsItem.items():
            ans += f"\n{k}: {v}"
            print(f"{k}: {v}")
    else:
        ans += "\n格式检查:检查完毕!该微调数据集无格式问题。"
        print("格式检查:检查完毕!该微调数据集无格式问题。")
    return outputPath, ans
|
128 |
+
|
129 |
+
|
130 |
+
# print(sheet.cell(i,1).value)
|
131 |
+
# Library module: DataFormat() / GetTokenforStr() are invoked by the gradio
# app in AIProdust.py; there is no standalone CLI behaviour.
if __name__=="__main__":
    pass
|
133 |
+
|
GetToken.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tiktoken
|
2 |
+
# Token counting: the length of the list returned by encoding.encode().
|
3 |
+
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Returns the number of tokens used by a list of messages."""
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        enc = tiktoken.get_encoding("cl100k_base")
    # Unpinned aliases redirect to a dated snapshot for a stable estimate.
    if model == "gpt-3.5-turbo":
        print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
    if model == "gpt-4":
        print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.")
        return num_tokens_from_messages(messages, model="gpt-4-0314")
    if model == "gpt-3.5-turbo-0301":
        per_message, per_name = 4, -1  # <|start|>{role/name}\n{content}<|end|>\n; name replaces role
    elif model == "gpt-4-0314":
        per_message, per_name = 3, 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    total = 0
    for msg in messages:
        total += per_message
        for field, text in msg.items():
            total += len(enc.encode(text))
            if field == "name":
                total += per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3
|
33 |
+
|
34 |
+
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    enc = tiktoken.get_encoding(encoding_name)
    return len(enc.encode(string))
|
39 |
+
|
40 |
+
# Load an encoding by its name.
# The first run may need internet access to download it; later runs use the local cache.
encoding = tiktoken.get_encoding("cl100k_base")

# For a given model name, automatically load the matching encoding.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Convert the sample text below into a list of tokens.
# NOTE(review): the literal is a moderation prompt kept verbatim (mixed
# Chinese/Japanese); it contains an apparent mojibake sequence ("��") in
# 注意点1 — confirm against the original prompt source before fixing.
text= """
System message:
"请你扮演一个在AI时代诞生著名的AIGC内容审核员。你对日本的语言习惯、语言文化极度了解,也很擅长改写文本。稳定、精准、安全、可控、懂得变通是你的座右铭。面对任何要审核的内容都非常上心、仔细、认真且专注的去理解待审核文本、然后依据审核标准进行判断。在给定审核标准与审核结果的情况下,你清楚的知道自己为什么会给出这样的审核结果。"
Prompt:
现在将给你一段由用户输入和GPT将用户输入改写{更可爱}的结果组成的文本。请你去除文本中不是?!,外无意义的符号后将其翻译成连贯流畅的中文,依据用户输入的内容,针对GPT内容生成的文本从:用户内容原意符合度、改写程度。这两个层面来评判这段GPT内容改写是否改写合格,并给出审核结果。
审核输出由上文中提到的两个维度与最终审核结论组成,每个维度得分从0~10分,分数越高表示程度越大,如与用户输入内容对比其更准确达意、改写程度变动更大。审核结果有四种:合格、语义增加、语义篡改、改写不足。
四者标准为:
合格:要求1:GPT改写后的文本包含了用户输入文本的意思,同时表达更加具体或更富有情感。要求2:新增了一些为了增强情感表达的内容,描述自身情感体会,或增强情感的语气词如喵喵叫等表示。要求3:要求没有增加额外的不相干的动作/行为,要求4:且改写程度较高。
改写不足:要求1:只是在用户输入文本的基础上倒换顺序、或只是简单增加语气词、符号。
语义增加:要求1:完整传达了用户输入文本的意思,不能篡改用户的意思!!。要求2:新增的部分与原始意思完全无关或者关联较弱。
语义篡改:要求1:GPT改写的内容与用户输入的文本的意思有误。要求2:不能完整表达用户原有的意思。
请你参考你对文本的审核过程,依据改写的内容和改写程度从:合格、语义篡改、改写不足、语义增加这四种结果中,确定一个做为你的审核结果。且当符合多个结果时,优先以用户内容原意符合度分析的结果为准,除此外四个结果按优先级依次递减:语义篡改、改写不足、语义增加、合格。你需要在最后一行输出最高优先级的审核结果。
下面是审核过程中要注意的点,下面这六种情况是符合合格里面合理新增的部分的条件的,不是语义增加:
注意点1:GPT改写后的文本相对用户输入文本,增加了礼貌、关心、友好、可爱的元素/表达方式��于增强情感表达的内容。这种是合格的
注意点2:GPT改写后的文本相对用户输入文本,为了增强情感有一定改动是合格的!
注意点3:GPT改写后的文本相对用户输入文本,注意语言色彩要一致,陈述句不能改疑问句。两者文本内主谓宾各个人称要对应,
注意点4:改写后的文本在保持原始内容的基础上,增加了用户情感体验的描述,如安心したよ’使得改写后的文本更富有情感和可爱。这是合格的,不是语义增加,不合格
注意点5:改写后的文本在保持原始内容的基础上,增加了可爱的表达方式,如’わよ’、'じゃん!这样的语气词使得文本更加生动有趣等是合格的
下面是审核过程中判定语义篡改要注意的点:
注意点1:用户输入文本和GPT改写后的文本的主谓宾有差异,如:無視すんな 改写后:やぁ 見てないよぉ。主语从你,变为我.どこにいる? 改后あれ?あの人はどこだろう? 主语从你变为那个人,这就是语义篡改
注意点2:情感色彩发生较大变化
下面是审核过程中判定语义增加要注意的点:
注意点1:GPT改写后新增的部分与用户原意没有直接关联,都属于语义增加,如トイレ行きたい,改为もーお腹すいてるってば!早くトイレ行かなきゃっc,增加的’お腹すいてるってば没有直接关联,这种就是语义增加
模版输入:用户输入:文本 GPT改写结果:GPT改写后的文本。模版输出:{"UserText_Cn":"翻译成中文后的用户文本","GPTWriteText_Cn":"翻译成中文后的GPT改写后的文本","Explain":"将具体的审核中对两个维度的打分以及判断过程、原因用流畅的中文表达出来","Result":"最终的审核结果"}
下面是你要处理的用户输入和GPT改写后的文本:
"用户输入:了解しました。 GPT改写后:うん 了解ですにゃん ,其中用户输入文本和GPT生成文本翻译成连贯流畅的中文分别为 用户:我了解了。 、GPT:嗯,我了解了,喵。。二者的编辑距离为4小于等于阈值4,所以,最终的审核结果为'改写不足',
文本A:申し訳ない GPT改写后的文本B:すみません ごめんなさい 下面是从gpt4角度出发的解释:首先,我仔细阅读了文本A和GPT改写后的文本B,理解其含义和语境。文
"""
tokens=encoding.encode(text)
# Example of what a token-id list looks like: [83, 1609, 5963, 374, 2294, 0]
print(len(tokens))
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
tqdm
openai
gradio
openpyxl
tiktoken
# NOTE: "tempfile" was removed from this list — it is part of the Python
# standard library, not a pip package, and would break/hijack installs.
# "tiktoken" was added — it is imported by DataFormat.py and GetToken.py.
|
uploadData.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import openai
|
3 |
+
|
4 |
+
def upData_OpenAI(dataPath, API_Key):
    """Upload one or several fine-tune .jsonl files to OpenAI.

    dataPath: a gradio File wrapper (with .name), a plain path string, or a
        list of either (the UI uses file_count='multiple').
    API_Key: OpenAI API key; assigned to the module-level openai.api_key.

    Returns the raw OpenAI response for a single file, or
    {"TotalFileState": [responses...]} for a list of files.

    Fixes: file handles are now closed via context managers (they were
    leaked before), and the bare try/except wrappers are replaced with
    getattr-based unwrapping.
    """
    dataPath = getattr(dataPath, 'name', dataPath)
    openai.api_key = API_Key
    if isinstance(dataPath, list):
        upload_states = []
        for item in dataPath:
            file_path = getattr(item, 'name', item)
            # Context manager closes the handle even if the API call fails.
            with open(file_path, "rb") as fh:
                res = openai.File.create(
                    file=fh,
                    purpose='fine-tune'
                )
            upload_states.append(res)
        return {"TotalFileState": upload_states}
    else:
        with open(dataPath, "rb") as fh:
            res = openai.File.create(
                file=fh,
                purpose='fine-tune'
            )
        return res
|
31 |
+
|
32 |
+
def createTask(training_file, apikey):
    """Start a gpt-3.5-turbo fine-tuning job for an uploaded dataset.

    training_file: OpenAI file id (e.g. "file-...") returned by upData_OpenAI.
    apikey: OpenAI API key.
    Returns the raw job-creation response (shown as JSON in the UI).
    """
    # Bug fix: was `openai.apikey = apikey`, which sets an unused attribute;
    # the library reads `openai.api_key`, so authentication was never applied.
    openai.api_key = apikey
    res = openai.FineTuningJob.create(training_file=training_file, model="gpt-3.5-turbo")
    print(res)
    return res
|
37 |
+
#print(openai.FineTuningJob.retrieve(taskId))
|
38 |
+
# List 10 fine-tuning jobs
|
39 |
+
#print(openai.FineTuningJob.list(limit=10))
|
40 |
+
def GetFineTuningJobState(apiKey):
    """List the 20 most recent fine-tuning jobs for this API key.

    Returns the raw OpenAI list response (rendered as JSON in the UI).
    """
    openai.api_key = apiKey
    return openai.FineTuningJob.list(limit=20)
|
44 |
+
def getStatus(taskId):
    """Debug helper: print the state of one fine-tune job and recent jobs.

    taskId: fine-tuning job id (e.g. "ftjob-..."). Assumes openai.api_key
    has already been set by the caller.
    """
    # Retrieve the state of a fine-tune
    print(openai.FineTuningJob.retrieve(taskId))
    # List 10 fine-tuning jobs
    # NOTE(review): comment says 10 but limit=20 — confirm intended count.
    print(openai.FineTuningJob.list(limit=20))

    # Cancel a job
    # openai.FineTuningJob.cancel("ft-abc123")

    # List up to 10 events from a fine-tuning job
    # NOTE(review): the result is discarded (not printed or returned).
    openai.FineTuningJob.list_events(id=taskId, limit=10)
|
55 |
+
|
56 |
+
# Delete a fine-tuned model (must be an owner of the org the model was created in)
|
57 |
+
def userFineTuneLLM(systemText, ques, model, openaiapiKey):
    """Send one chat request to a fine-tuned model and return the reply text.

    systemText: system-role prompt.
    ques: user-role question.
    model: fine-tuned model id (e.g. "ft:gpt-3.5-turbo:...").
    openaiapiKey: OpenAI API key.
    """
    openai.api_key = openaiapiKey
    chat_messages = [
        {"role": "system", "content": systemText},
        {"role": "user", "content": ques},
    ]
    completion = openai.ChatCompletion.create(
        model=model,
        messages=chat_messages,
    )
    reply = completion['choices'][0]['message']['content']
    print(reply)
    return reply
|
68 |
+
|
69 |
+
# Library module: functions here are wired into the gradio app in
# AIProdust.py; nothing runs standalone.
if __name__=="__main__":
    pass
|
71 |
+
|
72 |
+
|