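"""FastAPI service exposing three small chat models (Qwen2-1.5B, Gemma-2-2B and
Phi-3-mini, all quantized GGUF builds served through llama-cpp-python). Each
endpoint primes a "Sia, developed by Sushma" persona via a few-shot chat
template before answering the caller's prompt."""
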
from fastapi import FastAPI
import os
from llama_cpp import Llama
from transformers import AutoTokenizer
import requests

# Secrets supplied via environment variables (e.g. host/Space secrets)
access_token = os.getenv("access_token")
privateurl = os.getenv("privateurl")

# Tokenizers are used only to render each model's chat template (tokenize=False
# below); the Qwen2 instruct template is the same across the 0.5B and 1.5B sizes.
tokenizer1 = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer2 = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", token=access_token)
tokenizer3 = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Quantized GGUF weights are downloaded from the Hugging Face Hub on first use
llm1 = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-1.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)

llm2 = Llama.from_pretrained(
    repo_id="NexaAIDev/gemma-2-2b-it-GGUF",
    filename="*q4_K_S.gguf",
    verbose=False
)

llm3 = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="*q4.gguf",
    verbose=False
)

app = FastAPI()

@app.get("/")
async def read_root():
    return {"Hello": "World!"}

def modelResp1(cookie, target, token, prompt):
    # Generate a short reply with the Qwen2 model, then forward it to the
    # private webhook (privateurl) using the caller's cookie and token
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will response in polity and brief."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer1.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm1(
      text,
      max_tokens=64,  # Generate up to 64 tokens
      echo=False,  # Whether to echo the prompt
    )
    response = output['choices'][0]['text']
    # Copy the module-level templates per request rather than mutating shared
    # state, which would leak one caller's cookie into the next request
    req_headers = {**headers, 'Cookie': f"{cookie}"}
    req_payload = {**payload, 'token': f"{token}", 'target': f"{target}", 'content': response}
    requests.post(privateurl, headers=req_headers, data=req_payload)

def modelResp2(prompt):
    # gemma-2's chat template does not accept a system role, so the persona is
    # primed through the opening user/assistant turns instead
    messages = [
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer2.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm2(
      text,
      max_tokens=64,  # Generate up to 64 tokens
      echo=False,  # Whether to echo the prompt
    )
    response = output['choices'][0]['text']

    return response
    
def modelResp3(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will response in polity and brief."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer3.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm3(
      text,
      max_tokens=64,  # Generate up to 64 tokens
      echo=False,  # Whether to echo the prompt
    )
    response = output['choices'][0]['text']

    return response
    
@app.post("/modelapi1")
async def modelApi1(data: dict):
    target = data.get("target_id")
    cookie = data.get("Cookie")
    token = data.get("token")
    prompt = data.get("prompt")
    modelResp1(cookie, target, token, prompt)
    return {"Hello": "World!"}
    
@app.post("/modelapi2")
async def modelApi2(data: dict):
    prompt = data.get("prompt")
    #response = modelResp2(prompt)
    return {"Hello": "World!"}
    
@app.post("/modelapi3")
async def modelApi3(data: dict):
    prompt = data.get("prompt")
    response = modelResp3(prompt)
    return response
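
# Example request (hypothetical host/port; the app itself does not bind a port here):
#   curl -X POST http://localhost:7860/modelapi3 \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Hello, who are you?"}'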


# Module-level request templates; defined after the handlers but evaluated at
# import time, before the server accepts any traffic
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': '',
    'Sec-Ch-Ua': '"Opera";v="95", "Chromium";v="109", "Not;A=Brand";v="24"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0',
    'X-Requested-With': 'XMLHttpRequest'
}

payload = {
    'target': '',
    'content': '',
    'token': ''
}
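
# A minimal local entry point; a sketch that assumes uvicorn is installed and
# that port 7860 (the Hugging Face Spaces default) is the one the host expects.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)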