import json
from typing import Any, Generator, List
import fastapi
import markdown
import uvicorn
from ctransformers import AutoModelForCausalLM
from fastapi import HTTPException
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from sse_starlette.sse import EventSourceResponse
from pydantic import BaseModel, Field
from typing_extensions import Literal
from dialogue import DialogueTemplate
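
# Load the quantized GGML model weights via ctransformers.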
llm = AutoModelForCausalLM.from_pretrained(
    "gsaivinay/airoboros-13B-gpt4-1.3-GGML",
    model_file="airoboros-13b-gpt4-1.3.ggmlv3.q4_1.bin",
    model_type="llama",
)

app = fastapi.FastAPI(title="Starchat Beta")
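
# Allow cross-origin requests so browser clients can consume the SSE streams.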
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
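
# Serve the project README, rendered to HTML, at the root path.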
@app.get("/")
async def index():
    with open("README.md", "r", encoding="utf-8") as readme_file:
        md_template_string = readme_file.read()
    html_content = markdown.markdown(md_template_string)
    return HTMLResponse(content=html_content, status_code=200)
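
# Basic SSE endpoint: tokenize the prompt query parameter and stream detokenized chunks back.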
@app.get("/stream")
async def stream(prompt: str = "<|user|> Write an express server with server sent events. <|assistant|>"):
    tokens = llm.tokenize(prompt)

    async def server_sent_events(chat_chunks, llm):
        yield prompt
        for chat_chunk in llm.generate(chat_chunks):
            yield llm.detokenize(chat_chunk)
        yield ""

    return EventSourceResponse(server_sent_events(tokens, llm))
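
# Minimal request models, loosely following the OpenAI chat completions schema.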
class ChatCompletionRequestMessage(BaseModel):
    role: Literal["system", "user", "assistant"] = Field(
        default="user", description="The role of the message."
    )
    content: str = Field(default="", description="The content of the message.")


class ChatCompletionRequest(BaseModel):
    messages: List[ChatCompletionRequestMessage] = Field(
        default=[], description="A list of messages to generate completions for."
    )

system_message = "Below is a conversation between a human user and a helpful AI coding assistant."
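
# OpenAI-style endpoint: build the prompt from the dialogue template and stream each chunk as a JSON "choices" payload.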
@app.post("/v1/chat/completions")
async def chat(request: ChatCompletionRequest):
    kwargs = request.dict()
    dialogue_template = DialogueTemplate(
        system=system_message, messages=kwargs['messages']
    )
    prompt = dialogue_template.get_inference_prompt()
    tokens = llm.tokenize(prompt)
    try:
        chat_chunks = llm.generate(tokens)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    async def format_response(chat_chunks: Generator) -> Any:
        for chat_chunk in chat_chunks:
            response = {
                'choices': [
                    {
                        'message': {
                            'role': 'system',
                            'content': llm.detokenize(chat_chunk)
                        },
                        'finish_reason': 'stop' if llm.detokenize(chat_chunk) == "[DONE]" else 'unknown'
                    }
                ]
            }
            yield f"data: {json.dumps(response)}\n\n"
        yield "event: done\ndata: {}\n\n"

    return EventSourceResponse(format_response(chat_chunks), media_type="text/event-stream")
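
# Older endpoint variant: streams plain detokenized tokens, terminated by a "[DONE]" message.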
@app.post("/v0/chat/completions")
async def chatV0(request: ChatCompletionRequest, response_mode=None):
    kwargs = request.dict()
    dialogue_template = DialogueTemplate(
        system=system_message, messages=kwargs['messages']
    )
    prompt = dialogue_template.get_inference_prompt()
    tokens = llm.tokenize(prompt)

    async def server_sent_events(chat_chunks, llm):
        for token in llm.generate(chat_chunks):
            yield dict(data=llm.detokenize(token))
        yield dict(data="[DONE]")

    return EventSourceResponse(server_sent_events(tokens, llm))
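
# Run the API with uvicorn when the module is executed directly.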
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)