# SPDX-License-Identifier: Apache-2.0
import os
import subprocess

# Print environment info (CUDA and Python versions) to the startup log
os.system('nvcc --version')
os.system('python --version')

# Local OpenAI-compatible endpoint that the LLaMA-Factory API will serve
model_url = 'http://localhost:8000/v1'
model = "sijieaaa/CodeModel-V1-3B-2024-02-07"
# Comma-separated token IDs at which generation should stop (empty = none)
stop_token_ids = ''
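# For example, stop_token_ids = '151643,151645' would stop generation at
# Qwen's <|endoftext|> and <|im_end|> tokens (illustrative IDs; verify them
# against the model's tokenizer before use).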
# Set up the LLM API server
# -- llama-factory
def start_api():
    # Launch the API server as a non-blocking subprocess
    process = subprocess.Popen(
        ["lmf", "api", "--model_name", "sijieaaa/CodeModel-V1-3B-2024-02-07", "--template", "qwen"],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )
    # Watch the startup log for a line indicating the server is ready
    while True:
        line = process.stdout.readline()
        if not line:
            break  # stdout closed: the process exited before the server came up
        print(line.strip())  # echo the API startup log
        if "Running on local URL:" in line or "API started" in line or "http://localhost:" in line:
            print("✅ API started successfully!")
            return process  # hand back the running API process
    print("❌ API failed to start!")
    process.terminate()
    return None
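# Alternative readiness check (a sketch, not wired into the flow below):
# instead of scraping stdout, poll the OpenAI-compatible /v1/models route
# until it answers. Assumes the LLaMA-Factory API serves that route at
# model_url; adjust the timeout to the model's load time.
def wait_for_api(url=model_url, timeout=300):
    import time
    import urllib.request
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(f"{url}/models", timeout=2) as resp:
                if resp.status == 200:
                    return True  # server is accepting requests
        except OSError:
            time.sleep(1)  # server not up yet; retry shortly
    return False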
api_process = start_api()

# Continue only once the API is up
if api_process:
    print("🎯 Proceeding with the next steps...")
else:
    raise Exception("API failed to start!")
# Set up the OpenAI API client; the local server is not configured with an
# API key, so a placeholder value is enough
from openai import OpenAI

openai_api_key = '0'
openai_api_base = model_url
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
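# Optional smoke test (commented out): one minimal, non-streaming request to
# confirm the client/server wiring before launching the UI. Illustrative only.
# resp = client.chat.completions.create(
#     model=model,
#     messages=[{"role": "user", "content": "Say hi."}],
#     max_tokens=8,
# )
# print(resp.choices[0].message.content)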
def predict(message, history):
    # Convert the chat history to OpenAI message format
    history_openai_format = [{
        "role": "system",
        "content": "You are a great AI assistant."
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({
            "role": "assistant",
            "content": assistant
        })
    history_openai_format.append({"role": "user", "content": message})

    # Create a chat completion request and send it to the API server
    stream = client.chat.completions.create(
        model=model,                     # model name to use
        messages=history_openai_format,  # chat history
        temperature=0.95,                # sampling temperature
        stream=True,                     # stream the response
        top_p=0.7,
        extra_body={
            'repetition_penalty': 1,
            'stop_token_ids': [
                int(id.strip()) for id in stop_token_ids.split(',')
                if id.strip()
            ] if stop_token_ids else []
        })

    # Accumulate and yield generated text from the response stream
    partial_message = ""
    for chunk in stream:
        partial_message += (chunk.choices[0].delta.content or "")
        yield partial_message
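# Gradio's ChatInterface calls predict(message, history) with history as a
# list of (user, assistant) tuples and re-renders each yielded string, so the
# reply appears to stream token by token. Driving it by hand looks like:
#   for text in predict("Write hello world in Python", []):
#       print(text)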
import gradio as gr

# Create and launch a chat interface with Gradio
gr.ChatInterface(predict).queue().launch(
    # server_name=host,
    # server_port=port,
    share=True
)
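# Note: share=True asks Gradio for a public tunnel URL; on a Hugging Face
# Space the app is already exposed, so the flag mainly matters for local runs.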