# SPDX-License-Identifier: Apache-2.0

import os
# os.system('nvcc --version')
os.system('python --version')




model_url = 'http://localhost:8000/v1'
model = "sijieaaa/CodeModel-V1-3B-2024-02-07"
stop_token_ids = ''
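# stop_token_ids is a comma-separated string of extra stop-token IDs,
# e.g. '151643,151645' for Qwen's <|endoftext|>/<|im_end|> (an assumption,
# check this checkpoint's tokenizer); the empty string disables the override.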



import subprocess

# Setup LLM API
# -- llama-factory
os.system('git clone https://github.com/hiyouga/LLaMA-Factory.git && cd LLaMA-Factory && pip install -e . && cd ..')
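
# Installing LLaMA-Factory registers the `llamafactory-cli` console script
# (aliased as `lmf` in its setup metadata, an assumption worth verifying),
# which start_api() below invokes to serve the model behind an
# OpenAI-compatible endpoint.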
def start_api():
    # Launch the LLaMA-Factory API server without blocking. Merge stderr
    # into stdout so uvicorn's startup logs reach the reader loop below.
    process = subprocess.Popen(
        ["lmf", "api", "--model_name_or_path", model, "--template", "qwen"],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    )
    # Watch the startup logs for a line signalling that the server is up
    while True:
        line = process.stdout.readline()
        if not line:
            break  # EOF: the process closed its output and likely exited
        print(line.strip())  # echo the startup log
        if "Running on local URL:" in line or "API started" in line or "http://localhost:" in line:
            print("✅ API started successfully!")
            return process  # hand back the live API process
    print("❌ API failed to start!")
    process.terminate()
    return None
api_process = start_api()
if api_process:
    print("🎯 执行后续操作……")
else:
    raise Exception("API 启动失败!")





# Setup OpenAI API client
os.system('pip install openai')
from openai import OpenAI
openai_api_key = '0'  # placeholder; the client requires a non-empty key even for local servers
openai_api_base = model_url
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
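
# Quick smoke test: ask the endpoint which models it serves, so a wrong
# base_url or a dead server fails loudly here rather than mid-chat.
try:
    served = [m.id for m in client.models.list().data]
    print(f"Models served at {openai_api_base}: {served}")
except Exception as e:
    print(f"⚠️ Could not reach the API at {openai_api_base}: {e}")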


def predict(message, history):
    # Convert chat history to OpenAI format
    history_openai_format = [{
        "role": "system",
        "content": "You are a great ai assistant."
    }]
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({
            "role": "assistant",
            "content": assistant
        })
    history_openai_format.append({"role": "user", "content": message})

    # Create a chat completion request and send it to the API server
    stream = client.chat.completions.create(
        model=model,  # Model name to use
        messages=history_openai_format,  # Chat history
        temperature=0.95,  # Temperature for text generation
        stream=True,  # Stream response
        top_p=0.7,
        extra_body={
            'repetition_penalty': 1,  # 1 disables the penalty
            'stop_token_ids': [
                int(tid.strip()) for tid in stop_token_ids.split(',')
                if tid.strip()
            ] if stop_token_ids else []
        })

    # Read and return generated text from response stream
    partial_message = ""
    for chunk in stream:
        partial_message += (chunk.choices[0].delta.content or "")
        yield partial_message
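
# Minimal sanity check of predict() before wiring it into Gradio (assumes
# the API started above is serving): stream one reply and print the final
# accumulated text. `history` uses the (user, assistant) tuple format the
# function expects.
reply = ""
for reply in predict("Say hello in one sentence.", history=[]):
    pass
print(f"Sample reply: {reply}")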



os.system('pip install gradio')
import gradio as gr
# Create and launch a chat interface with Gradio
gr.ChatInterface(predict).queue().launch(
    # server_name=host,
    # server_port=port,
    share=True,  # create a public Gradio share link
)