from fastapi import FastAPI
import os
from typing import Union

from pydantic import BaseModel
from langchain.prompts import PromptTemplate
# HuggingFacePipeline and CustomLLM are only used by the commented-out
# alternatives further down; kept so they can be re-enabled without edits here.
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEndpoint

from custom_llm import CustomLLM


class ConversationPost(BaseModel):
    tenant: Union[str, None] = None   # accepted but not used by any endpoint yet
    module: Union[str, None] = None   # accepted but not used by any endpoint yet
    question: str


class InferencePost(BaseModel):
    question: str
    with_template: Union[str, None] = None  # 'llama', 'qwen', or None (raw input)


# Read the Hugging Face token from the environment and expose it under the
# variable name the langchain_huggingface integration reads.
API_TOKEN = os.environ["HF_API_KEY"]
os.environ["HUGGINGFACEHUB_API_TOKEN"] = API_TOKEN

app = FastAPI()

# ChatML template for Qwen. The system prompt (Indonesian) reads: "You are an
# AI assistant developed by Jonthan Jordan. Answer strictly in Bahasa Indonesia."
prompt_qwen = PromptTemplate.from_template("""<|im_start|>system
Kamu adalah Asisten AI yang dikembangkan oleh Jonthan Jordan. Answer strictly in Bahasa Indonesia<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
""")

# Llama 3 chat template carrying the same system prompt.
prompt_llama = PromptTemplate.from_template("""<|start_header_id|>system<|end_header_id|>

Kamu adalah Asisten AI yang dikembangkan oleh Jonthan Jordan. Answer strictly in Bahasa Indonesia<|eot_id|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
""")
# Alternative: run the model locally via a transformers pipeline instead of
# the hosted Inference API.
# llm = prompt_qwen | HuggingFacePipeline.from_model_id(
#     model_id="Qwen/Qwen2-1.5B-Instruct",
#     task="text-generation",
#     pipeline_kwargs={
#         "max_new_tokens": 150,
#         "return_full_text": False,
#     },
# )

# Hosted endpoints, greedy decoding. stop_sequences keeps generation from
# running past each model's end-of-turn token (the commented-out CustomLLM
# variant below did the same via .bind(stop=...)).
llama = HuggingFaceEndpoint(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    task="text-generation",
    max_new_tokens=150,
    do_sample=False,
    stop_sequences=["<|eot_id|>"],
)

qwen = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen1.5-4B-Chat",
    task="text-generation",
    max_new_tokens=150,
    do_sample=False,
    stop_sequences=["<|im_end|>"],
)

# LCEL chains: render the prompt, then call the hosted endpoint.
llm = prompt_qwen | qwen     # Qwen chain
llm2 = prompt_llama | llama  # Llama 3 chain

# Alternative: custom endpoint wrapper with an explicit stop token.
# llm = prompt_qwen | CustomLLM(repo_id="Qwen/Qwen-VL-Chat", model_type='text-generation', api_token=API_TOKEN, max_new_tokens=150).bind(stop=['<|im_end|>'])


@app.get("/")
def greet_json():
    return {"Hello": "World!"}


@app.post("/conversation")
async def conversation(data: ConversationPost):
    # Qwen, with the chat template applied.
    return {"output": llm.invoke({"question": data.question})}


@app.post("/conversation2")
async def conversation2(data: ConversationPost):
    # Llama 3, with the chat template applied.
    return {"output": llm2.invoke({"question": data.question})}


@app.post("/inference")
async def inference(data: InferencePost):
    # Pick a template by name; with no template the raw question is sent
    # straight to the Llama endpoint.
    if data.with_template == 'llama':
        out = llm2.invoke({"question": data.question})
    elif data.with_template == 'qwen':
        out = llm.invoke({"question": data.question})
    else:
        out = llama.invoke(data.question)
    return {"output": out}