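"""OpenAI-compatible API server built with FastAPI on top of webscout's LLM interface.

Exposes /v1/chat/completions, /v1/completions, and /v1/models endpoints that
mimic the OpenAI REST API and forward requests to the models in the catalog below.
"""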
from typing import Any, Dict, List

from fastapi import FastAPI
from pydantic import BaseModel
from webscout.LLM import LLM

app = FastAPI()

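# Pydantic schemas mirroring the OpenAI API objects; `Model` describes one
# entry in the model catalog below.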
class Model(BaseModel):
    id: str
    object: str
    created: int
    owned_by: str

class Message(BaseModel):
    role: str
    content: str

class CompletionRequest(BaseModel):
    model: str
    messages: List[Message]

class CompletionResponse(BaseModel):
    id: str
    object: str
    created: int
    model: str
    choices: List[Dict[str, Any]]
    usage: Dict[str, int]

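# Static model catalog served by /v1/models; the `created` timestamps are placeholders.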
models = [
    {"id": "meta-llama/Meta-Llama-3-70B-Instruct", "object": "model", "created": 1686935002, "owned_by": "meta"},
    {"id": "google/gemma-2-27b-it", "object": "model", "created": 1686935002, "owned_by": "meta"},
    {"id": "google/gemma-2-9b-it", "object": "model", "created": 1686935002, "owned_by": "ConsiousAI"},
    {"id": "cognitivecomputations/dolphin-2.9.1-llama-3-70b", "object": "model", "created": 1686935002, "owned_by": "cognitivecomputations"},
    {"id": "nvidia/Nemotron-4-340B-Instruct", "object": "model", "created": 1686935002, "owned_by": "nvidia"},
    {"id": "Qwen/Qwen2-72B-Instruct", "object": "model", "created": 1686935002, "owned_by": "qwen"},
    {"id": "microsoft/Phi-3-medium-4k-instruct", "object": "model", "created": 1686935002, "owned_by": "microsoft"},
    {"id": "google/gemma-2-9b-it", "object": "model", "created": 1686935002, "owned_by": "ConsiousAI"},
    {"id": "openchat/openchat-3.6-8b", "object": "model", "created": 1686935002, "owned_by": "unknown"},
    {"id": "mistralai/Mistral-7B-Instruct-v0.3", "object": "model", "created": 1686935002, "owned_by": "mistral"},
    {"id": "meta-llama/Meta-Llama-3-8B-Instruct", "object": "model", "created": 1686935002, "owned_by": "meta"},
    {"id": "mistralai/Mixtral-8x22B-Instruct-v0.1", "object": "model", "created": 1686935002, "owned_by": "mistral"},
    {"id": "mistralai/Mixtral-8x7B-Instruct-v0.1", "object": "model", "created": 1686935002, "owned_by": "mistral"},
    {"id": "Qwen/Qwen2-7B-Instruct", "object": "model", "created": 1686935002, "owned_by": "Qwen"},
    {"id": "meta-llama/Meta-Llama-3.1-405B-Instruct", "object": "model", "created": 1686935002, "owned_by": "meta"}

]

@app.post("/v1/chat/completions/")
def handle_completions(completion_request: CompletionRequest):
    system_prompt = next((message.content for message in completion_request.messages if message.role == 'system'), None)
    user_query = next((message.content for message in completion_request.messages if message.role == 'user'), None)

    response_text = generative(query=user_query, system_prompt=system_prompt, model=completion_request.model)

    response = CompletionResponse(
        id="chatcmpl-1",
        object="chat.completion",
        created=1234567890,
        model=completion_request.model,
        choices=[{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
        usage={"prompt_tokens": sum(len(message.content.split()) for message in completion_request.messages), "total_tokens": sum(len(message.content.split()) for message in completion_request.messages) + len(response_text.split())}
    )
    return response

@app.get("/v1/models/")
def get_models():
    return {"object": "list", "data": models}

@app.post("/v1/completions/")
def create_completion(prompt: str, model: str, best_of: int = 1, echo: bool = False, frequency_penalty: float = 0.0):
    response_text = generative(prompt, "you are an helpful assistant", model)

    response = {
        "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7",
        "object": "text_completion",
        "created": 1589478378,
        "model": model,
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [{"text": response_text, "index": 0, "logprobs": None, "finish_reason": "length"}]
    }
    return response

def generative(query, system_prompt, model):
    """Send a single-turn query to the selected model via webscout's LLM wrapper."""
    # Fall back to a generic system prompt when the request does not supply one.
    llm = LLM(model=model, system_message=system_prompt or "You are a helpful assistant.")
    messages = [{"role": "user", "content": query}]
    return llm.chat(messages)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
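
# Example request, as a minimal sketch: assumes the server is running locally on
# port 8000; any OpenAI-compatible HTTP client should work the same way.
#
#   curl http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "Hello!"}]}'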