import os
from typing import List, Optional, Union

from litellm import completion

from lagent.llms import (GPTAPI, INTERNLM2_META, BaseAPIModel,
                         HFTransformerCasualLM, LMDeployClient, LMDeployServer)
from lagent.schema import ModelStatusCode
from lagent.utils.util import filter_suffix
internlm_server = dict(type=LMDeployServer,
path='internlm/internlm2_5-7b-chat',
model_name='internlm2',
meta_template=INTERNLM2_META,
top_p=0.8,
top_k=1,
temperature=0,
max_new_tokens=8192,
repetition_penalty=1.02,
stop_words=['<|im_end|>'])
internlm_client = dict(type=LMDeployClient,
model_name='internlm2_5-7b-chat',
url='http://127.0.0.1:23333',
meta_template=INTERNLM2_META,
top_p=0.8,
top_k=1,
temperature=0,
max_new_tokens=8192,
repetition_penalty=1.02,
stop_words=['<|im_end|>'])
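# The client config above assumes an LMDeploy api_server is already serving
# internlm2_5-7b-chat on 127.0.0.1:23333. A typical (assumed) launch command:
#   lmdeploy serve api_server internlm/internlm2_5-7b-chat --server-port 23333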
internlm_hf = dict(type=HFTransformerCasualLM,
path='internlm/internlm2_5-7b-chat',
meta_template=INTERNLM2_META,
top_p=0.8,
top_k=None,
temperature=1e-6,
max_new_tokens=8192,
repetition_penalty=1.02,
stop_words=['<|im_end|>'])
# openai_api_base must be the full chat completions endpoint, e.g. https://api.openai.com/v1/chat/completions
gpt4 = dict(type=GPTAPI,
model_type='gpt-4-turbo',
key=os.environ.get('OPENAI_API_KEY', 'YOUR OPENAI API KEY'),
openai_api_base=os.environ.get(
'OPENAI_API_BASE', 'https://api.openai.com/v1/chat/completions'),
)
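# A minimal setup sketch (assumed shell commands): the gpt4 config reads both
# variables via os.environ.get at import time.
#   export OPENAI_API_KEY=<your key>
#   export OPENAI_API_BASE=https://api.openai.com/v1/chat/completions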
url = 'https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation'
qwen = dict(type=GPTAPI,
model_type='qwen-max-longcontext',
key=os.environ.get('QWEN_API_KEY', 'YOUR QWEN API KEY'),
openai_api_base=url,
meta_template=[
dict(role='system', api_role='system'),
dict(role='user', api_role='user'),
dict(role='assistant', api_role='assistant'),
dict(role='environment', api_role='system')
],
top_p=0.8,
top_k=1,
temperature=0,
max_new_tokens=4096,
repetition_penalty=1.02,
stop_words=['<|im_end|>'])
internlm_silicon = dict(type=GPTAPI,
model_type='internlm/internlm2_5-7b-chat',
key=os.environ.get(
'SILICON_API_KEY', 'YOUR SILICON API KEY'),
openai_api_base='https://api.siliconflow.cn/v1/chat/completions',
meta_template=[
dict(role='system', api_role='system'),
dict(role='user', api_role='user'),
dict(role='assistant', api_role='assistant'),
dict(role='environment', api_role='system')
],
top_p=0.8,
top_k=1,
temperature=0,
max_new_tokens=8192,
repetition_penalty=1.02,
stop_words=['<|im_end|>'])
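# Each dict above is a lazy model config: 'type' names the LLM class and the
# remaining keys are its constructor kwargs. A minimal sketch of how such a
# config is typically materialised (assuming the usual lagent pattern):
#   cfg = dict(gpt4)
#   llm = cfg.pop('type')(**cfg)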
class litellmCompletion(BaseAPIModel):
    """LLM wrapper that routes chat completions through the litellm SDK.

    Args:
        path (str): kept for compatibility with ``BaseAPIModel``; it is not
            used by litellm itself.
        model_name (str): model identifier understood by litellm, e.g.
            'command-r' or 'deepseek/deepseek-chat'. Provider credentials are
            read from the environment by litellm.
    """

    def __init__(self,
                 path='',
                 model_name='command-r',
                 **kwargs):
        self.model_name = model_name
        super().__init__(path, **kwargs)
    def generate(self,
                 inputs: Union[str, List[str]],
                 do_preprocess: bool = None,
                 skip_special_tokens: bool = False,
                 **kwargs):
        """Return the chat completions in non-stream mode.

        Args:
            inputs (Union[str, List[str]]): input texts to be completed.
            do_preprocess (bool): kept for interface compatibility; this
                wrapper does not apply a chat template itself.
            skip_special_tokens (bool): whether or not to remove special tokens
                in the decoding. Defaults to False.

        Returns:
            (a list of/batched) text/chat completion
        """
        batched = True
        if isinstance(inputs, str):
            inputs = [inputs]
            batched = False
        # Wrap each prompt as a single-turn user message. Note that all
        # prompts are sent in one litellm call; true per-prompt batching would
        # require separate calls.
        messages = [{'role': 'user', 'content': prompt} for prompt in inputs]
        gen_params = self.update_gen_params(**kwargs)
        response = completion(model=self.model_name, messages=messages)
        response = [resp.message.content for resp in response.choices]
        # Remove trailing stop words from the completions.
        response = filter_suffix(response, gen_params.get('stop_words'))
        if batched:
            return response
        return response[0]
    def stream_chat(self,
                    inputs: List[dict],
                    stream: bool = True,
                    ignore_eos: bool = False,
                    skip_special_tokens: Optional[bool] = False,
                    timeout: int = 30,
                    **kwargs):
        """Start a new round of conversation and return the chat completions
        in stream mode.

        Args:
            inputs (List[dict]): user's inputs in this round of conversation
            stream (bool): return in a streaming format if enabled
            ignore_eos (bool): indicator for ignoring eos
            skip_special_tokens (bool): whether or not to remove special tokens
                in the decoding. Defaults to False.
            timeout (int): max time to wait for response

        Yields:
            tuple(Status, str, int): status, text/chat completion,
                generated token number
        """
        gen_params = self.update_gen_params(**kwargs)
        # litellm expects `max_tokens` rather than lagent's `max_new_tokens`.
        max_new_tokens = gen_params.pop('max_new_tokens')
        gen_params.update(max_tokens=max_new_tokens)
        # Stop words are handled locally with filter_suffix, so they are not
        # forwarded to litellm. Remaining sampling params are passed through;
        # keys the target provider does not accept may need to be dropped.
        stop_words = gen_params.pop('stop_words', None)
        if stop_words is None:
            stop_words = []
        resp = ''
        finished = False
        # Convert lagent-style messages into the OpenAI chat format.
        messages = self.template_parser._prompt2api(inputs)
        for text in completion(
                self.model_name,
                messages,
                stream=stream,
                **gen_params):
            if not text.choices[0].delta.content:
                continue
            resp += text.choices[0].delta.content
            if not resp:
                continue
            # Stop streaming once a stop word appears and strip it from the
            # tail of the accumulated response.
            for sw in stop_words:
                if sw in resp:
                    resp = filter_suffix(resp, stop_words)
                    finished = True
                    break
            yield ModelStatusCode.STREAM_ING, resp, None
            if finished:
                break
        yield ModelStatusCode.END, resp, None
litellm_completion = dict(type=litellmCompletion,
# model_name="deepseek/deepseek-chat",
meta_template=[
dict(role='system', api_role='system'),
dict(role='user', api_role='user'),
dict(role='assistant', api_role='assistant'),
dict(role='environment', api_role='system')
]
)
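# A minimal usage sketch (illustrative only): it assumes a LiteLLM-supported
# model name and the matching provider API key in the environment, e.g.
# DEEPSEEK_API_KEY for 'deepseek/deepseek-chat'.
if __name__ == '__main__':
    cfg = dict(litellm_completion)
    cfg['model_name'] = 'deepseek/deepseek-chat'  # assumed example model
    llm = cfg.pop('type')(**cfg)
    # Non-stream completion of a single prompt.
    print(llm.generate('Hello, who are you?'))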