# SPDX-License-Identifier: Apache-2.0

import os
import subprocess

# os.system('nvcc --version')
os.system('python --version')

model_url = 'http://localhost:8000/v1'
model = "sijieaaa/CodeModel-V1-3B-2024-02-07"
stop_token_ids = ''

# Setup LLM API
# -- llama-factory
os.system('git clone https://github.com/hiyouga/LLaMA-Factory.git && cd LLaMA-Factory && pip install -e . && cd ..')


def start_api():
    # Launch the API as a non-blocking subprocess
    process = subprocess.Popen(
        ["lmf", "api",
         "--model_name_or_path", model,
         "--template", "qwen"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr so uvicorn's startup log is captured too
        text=True
    )
    # Watch the startup log for keywords that signal the server is ready
    while True:
        line = process.stdout.readline()
        if not line:
            break  # stream closed; stop waiting
        print(line.strip())  # echo the startup log
        if "Running on local URL:" in line or "API started" in line or "http://localhost:" in line:
            print("✅ API started successfully!")
            return process  # hand back the process handle

    print("❌ API failed to start!")
    process.terminate()
    return None


api_process = start_api()
if api_process:
    print("🎯 Proceeding with the follow-up steps...")
else:
    raise Exception("API failed to start!")

# Setup OpenAI API client
os.system('pip install openai')
from openai import OpenAI

openai_api_key = '0'
openai_api_base = model_url
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)


def predict(message, history):
    # Convert chat history to OpenAI format
    history_openai_format = [{
        "role": "system",
        "content": "You are a great AI assistant."
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({
            "role": "assistant",
            "content": assistant
        })
    history_openai_format.append({"role": "user", "content": message})

    # Create a chat completion request and send it to the API server
    stream = client.chat.completions.create(
        model=model,                     # Model name to use
        messages=history_openai_format,  # Chat history
        temperature=0.95,                # Temperature for text generation
        stream=True,                     # Stream response
        top_p=0.7,
        extra_body={
            'repetition_penalty': 1,
            'stop_token_ids': [
                int(id.strip()) for id in stop_token_ids.split(',')
                if id.strip()
            ] if stop_token_ids else []
        })

    # Read and return generated text from response stream
    partial_message = ""
    for chunk in stream:
        partial_message += (chunk.choices[0].delta.content or "")
        yield partial_message


import gradio as gr

# Create and launch a chat interface with Gradio
gr.ChatInterface(predict).queue().launch(
    # server_name=host,
    # server_port=port,
    share=True
)
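
# --- Optional smoke test (a sketch, not part of the original flow) ----------
# The lines below show how predict() can be exercised directly, without the
# Gradio UI, by draining the generator and printing the final message. They
# are commented out because launch(share=True) above blocks until the UI
# exits; the prompt string is illustrative only.
#
# last = ""
# for last in predict("Write a hello-world in Python.", history=[]):
#     pass          # predict() yields the growing partial message
# print(last)       # full reply once the stream is exhausted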