Commit
·
3db8af6
1
Parent(s):
b63d65d
fix tokenization
Browse files- tokenization_functionary.py +32 -28
- tokenizer_config.json +1 -1
tokenization_functionary.py
CHANGED
@@ -38,10 +38,8 @@ class Tool(BaseModel):
|
|
38 |
|
39 |
def convert_data_type(param_type: str) -> str:
|
40 |
"""convert data_type to typescript data type
|
41 |
-
|
42 |
Args:
|
43 |
param_type (str): param_type
|
44 |
-
|
45 |
Returns:
|
46 |
str: param type in typescript
|
47 |
"""
|
@@ -52,10 +50,8 @@ def convert_data_type(param_type: str) -> str:
|
|
52 |
|
53 |
def get_param_type(param: Dict) -> str:
|
54 |
"""get param_type of parameter
|
55 |
-
|
56 |
Args:
|
57 |
param (Dict): param dict in properties
|
58 |
-
|
59 |
Returns:
|
60 |
str: _description_
|
61 |
"""
|
@@ -80,10 +76,8 @@ def get_param_type(param: Dict) -> str:
|
|
80 |
|
81 |
def get_format_param(param: Dict) -> Optional[str]:
|
82 |
"""Get "format" from param. There are cases where format is not directly in param but in oneOf
|
83 |
-
|
84 |
Args:
|
85 |
param (Dict): _description_
|
86 |
-
|
87 |
Returns:
|
88 |
Optional[str]: _description_
|
89 |
"""
|
@@ -101,10 +95,8 @@ def get_format_param(param: Dict) -> Optional[str]:
|
|
101 |
|
102 |
def get_param_info(param: Dict) -> Optional[str]:
|
103 |
"""get additional information about parameter such as: format, default value, min, max, ...
|
104 |
-
|
105 |
Args:
|
106 |
param (Dict): _description_
|
107 |
-
|
108 |
Returns:
|
109 |
Optional[str]: _description_
|
110 |
"""
|
@@ -150,7 +142,6 @@ def append_new_param_info(
|
|
150 |
depth: int,
|
151 |
):
|
152 |
"""Append a new parameter with comment to the info_list
|
153 |
-
|
154 |
Args:
|
155 |
info_lines (List[str]): current info_list
|
156 |
param_declaration (str): param: type
|
@@ -176,11 +167,9 @@ def append_new_param_info(
|
|
176 |
|
177 |
def get_examples_info(param_name: str, examples: List) -> List:
|
178 |
"""get information about examples provided
|
179 |
-
|
180 |
Args:
|
181 |
param_name (str): _description_
|
182 |
examples (List): _description_
|
183 |
-
|
184 |
Returns:
|
185 |
List: _description_
|
186 |
"""
|
@@ -197,10 +186,8 @@ def get_examples_info(param_name: str, examples: List) -> List:
|
|
197 |
|
198 |
def get_enum_option_str(enum_options: List) -> str:
|
199 |
"""get enum option separated by: "|"
|
200 |
-
|
201 |
Args:
|
202 |
enum_options (List): list of options
|
203 |
-
|
204 |
Returns:
|
205 |
_type_: concatenation of options separated by "|"
|
206 |
"""
|
@@ -212,12 +199,10 @@ def get_array_typescript(
|
|
212 |
param_name: Optional[str], param_dic: dict, depth: int = 0
|
213 |
) -> str:
|
214 |
"""recursive implementation for generating type script of array
|
215 |
-
|
216 |
Args:
|
217 |
param_name (Optional[str]): name of param, optional
|
218 |
param_dic (dict): param_dic
|
219 |
depth (int, optional): nested level. Defaults to 0.
|
220 |
-
|
221 |
Returns:
|
222 |
_type_: typescript of array
|
223 |
"""
|
@@ -270,12 +255,10 @@ def get_array_typescript(
|
|
270 |
def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
|
271 |
"""Recursion, returning the information about parameters including data type, description and other information
|
272 |
These kinds of information will be put into the prompt
|
273 |
-
|
274 |
Args:
|
275 |
properties (_type_): properties in parameters
|
276 |
required_params (_type_): List of required parameters
|
277 |
depth (int, optional): the depth of params (nested level). Defaults to 0.
|
278 |
-
|
279 |
Returns:
|
280 |
_type_: list of lines containing information about all parameters
|
281 |
"""
|
@@ -461,20 +444,41 @@ class FunctionaryTokenizer(PreTrainedTokenizerFast):
|
|
461 |
"point any code depending on them will stop working. We recommend setting a valid chat template before "
|
462 |
"then to ensure that this model continues working without issues."
|
463 |
)
|
464 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
465 |
# Prepare tools/functions into schema
|
466 |
functions_pydantic_to_render = []
|
467 |
has_code_interpreter = False
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
478 |
|
479 |
# Compilation function uses a cache to avoid recompiling the same template
|
480 |
compiled_template = self._compile_jinja_template(chat_template)
|
|
|
38 |
|
39 |
def convert_data_type(param_type: str) -> str:
|
40 |
"""convert data_type to typescript data type
|
|
|
41 |
Args:
|
42 |
param_type (str): param_type
|
|
|
43 |
Returns:
|
44 |
str: param type in typescript
|
45 |
"""
|
|
|
50 |
|
51 |
def get_param_type(param: Dict) -> str:
|
52 |
"""get param_type of parameter
|
|
|
53 |
Args:
|
54 |
param (Dict): param dict in properties
|
|
|
55 |
Returns:
|
56 |
str: _description_
|
57 |
"""
|
|
|
76 |
|
77 |
def get_format_param(param: Dict) -> Optional[str]:
|
78 |
"""Get "format" from param. There are cases where format is not directly in param but in oneOf
|
|
|
79 |
Args:
|
80 |
param (Dict): _description_
|
|
|
81 |
Returns:
|
82 |
Optional[str]: _description_
|
83 |
"""
|
|
|
95 |
|
96 |
def get_param_info(param: Dict) -> Optional[str]:
|
97 |
"""get additional information about parameter such as: format, default value, min, max, ...
|
|
|
98 |
Args:
|
99 |
param (Dict): _description_
|
|
|
100 |
Returns:
|
101 |
Optional[str]: _description_
|
102 |
"""
|
|
|
142 |
depth: int,
|
143 |
):
|
144 |
"""Append a new parameter with comment to the info_list
|
|
|
145 |
Args:
|
146 |
info_lines (List[str]): current info_list
|
147 |
param_declaration (str): param: type
|
|
|
167 |
|
168 |
def get_examples_info(param_name: str, examples: List) -> List:
|
169 |
"""get information about examples provided
|
|
|
170 |
Args:
|
171 |
param_name (str): _description_
|
172 |
examples (List): _description_
|
|
|
173 |
Returns:
|
174 |
List: _description_
|
175 |
"""
|
|
|
186 |
|
187 |
def get_enum_option_str(enum_options: List) -> str:
|
188 |
"""get enum option separated by: "|"
|
|
|
189 |
Args:
|
190 |
enum_options (List): list of options
|
|
|
191 |
Returns:
|
192 |
_type_: concatenation of options separated by "|"
|
193 |
"""
|
|
|
199 |
param_name: Optional[str], param_dic: dict, depth: int = 0
|
200 |
) -> str:
|
201 |
"""recursive implementation for generating type script of array
|
|
|
202 |
Args:
|
203 |
param_name (Optional[str]): name of param, optional
|
204 |
param_dic (dict): param_dic
|
205 |
depth (int, optional): nested level. Defaults to 0.
|
|
|
206 |
Returns:
|
207 |
_type_: typescript of array
|
208 |
"""
|
|
|
255 |
def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
|
256 |
"""Recursion, returning the information about parameters including data type, description and other information
|
257 |
These kinds of information will be put into the prompt
|
|
|
258 |
Args:
|
259 |
properties (_type_): properties in parameters
|
260 |
required_params (_type_): List of required parameters
|
261 |
depth (int, optional): the depth of params (nested level). Defaults to 0.
|
|
|
262 |
Returns:
|
263 |
_type_: list of lines containing information about all parameters
|
264 |
"""
|
|
|
444 |
"point any code depending on them will stop working. We recommend setting a valid chat template before "
|
445 |
"then to ensure that this model continues working without issues."
|
446 |
)
|
447 |
+
|
448 |
+
PYTHON_RUN_SYS_MSG = "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at '/mnt/data' can be used to save and persist user files."
|
449 |
+
SYSTEM_CONTENT = """You are capable of executing available function(s) if required.
|
450 |
+
Only execute function(s) when absolutely necessary.
|
451 |
+
Ask for the required input to:recipient==all
|
452 |
+
Use JSON for function arguments.
|
453 |
+
Respond in this format:
|
454 |
+
>>>${recipient}
|
455 |
+
${content}
|
456 |
+
Available functions:
|
457 |
+
"""
|
458 |
+
|
459 |
# Prepare tools/functions into schema
|
460 |
functions_pydantic_to_render = []
|
461 |
has_code_interpreter = False
|
462 |
+
if tools is not None:
|
463 |
+
for item in tools:
|
464 |
+
if (
|
465 |
+
"function" in item and item["function"] is not None
|
466 |
+
): # new data format: tools: [{"type": xx, "function": xxx}]
|
467 |
+
functions_pydantic_to_render.append(item["function"])
|
468 |
+
elif "type" in item and item["type"] == "code_interpreter":
|
469 |
+
has_code_interpreter = True
|
470 |
+
else:
|
471 |
+
functions_pydantic_to_render.append(item) # old format
|
472 |
+
|
473 |
+
conversation.insert(
|
474 |
+
0,
|
475 |
+
{
|
476 |
+
"role": "system",
|
477 |
+
"content": SYSTEM_CONTENT + generate_schema_from_functions(functions_pydantic_to_render),
|
478 |
+
},
|
479 |
+
)
|
480 |
+
if has_code_interpreter:
|
481 |
+
conversation.insert(1, {"role": "system", "content": PYTHON_RUN_SYS_MSG})
|
482 |
|
483 |
# Compilation function uses a cache to avoid recompiling the same template
|
484 |
compiled_template = self._compile_jinja_template(chat_template)
|
tokenizer_config.json
CHANGED
@@ -2050,7 +2050,7 @@
|
|
2050 |
}
|
2051 |
},
|
2052 |
"bos_token": "<|begin_of_text|>",
|
2053 |
-
"chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ '>>>all\n' + message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '>>>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>
|
2054 |
"clean_up_tokenization_spaces": true,
|
2055 |
"eos_token": "<|eot_id|>",
|
2056 |
"legacy": true,
|
|
|
2050 |
}
|
2051 |
},
|
2052 |
"bos_token": "<|begin_of_text|>",
|
2053 |
+
"chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ '>>>all\n' + message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '>>>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
|
2054 |
"clean_up_tokenization_spaces": true,
|
2055 |
"eos_token": "<|eot_id|>",
|
2056 |
"legacy": true,
|