Update backend to use the Lepton API rather than a locally created HF model
app.py CHANGED
@@ -3,16 +3,25 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import gradio as gr
 import spaces
+import openai
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
+LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
 # if torch.cuda.is_available():
 #     device = "cuda:0"
 # else:
 #     device = "cpu"
 
-tokenizer = AutoTokenizer.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct")
-model = AutoModelForCausalLM.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct", torch_dtype=torch.float16, device_map="auto")
-model.gradient_checkpointing_enable()
+# Set up client to call inference
+client=openai.OpenAI(
+    base_url="https://yb15a7dy-lynx-70b.tin.lepton.run/api/v1/",
+    api_key=api_token
+)
+
+# Create own model
+# tokenizer = AutoTokenizer.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct")
+# model = AutoModelForCausalLM.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct", torch_dtype=torch.float16, device_map="auto")
+# model.gradient_checkpointing_enable()
 
 # def load_model_and_tokenizer(model_choice):
 #     if model_choice == "Patronus Lynx 8B":
@@ -76,27 +85,32 @@ HEADER = """
 **Getting Started**: Provide a question and document or context given to your model in addition to the answer given by the model and then click submit. The output panel will indicate whether the reponse is a hallucination (Fail) or if it is faithful to the given document or context (Pass) through the score Pass or Fail and provide reasoning behind the score.
 """
 
-@spaces.GPU()
+# @spaces.GPU()
 # def model_call(question, document, answer, tokenizer, model):
 def model_call(question, document, answer):
     # device = next(model.parameters()).device
     NEW_FORMAT = PROMPT.format(question=question, document=document, answer=answer)
     print("ENTIRE NEW_FORMAT", NEW_FORMAT)
-    inputs = tokenizer(NEW_FORMAT, return_tensors="pt")
-    print("INPUTS", inputs)
-    input_ids = inputs.input_ids
-    attention_mask = inputs.attention_mask
-    generate_kwargs = dict(
-        input_ids=input_ids,
-        do_sample=True,
-        attention_mask=attention_mask,
-        pad_token_id=tokenizer.eos_token_id,
+    response = client.completions.create(
+        model="gpt-3.5-turbo-instruct",
+        prompt=NEW_FORMAT
     )
-    print("GENERATE_KWARGS", generate_kwargs)
-    with torch.no_grad():
-        outputs = model.generate(**generate_kwargs)
-    print("OUTPUTS", outputs)
-    generated_text = tokenizer.decode(outputs[0])
+    generated_text = response['choices'][0]['text']
+    # inputs = tokenizer(NEW_FORMAT, return_tensors="pt")
+    # print("INPUTS", inputs)
+    # input_ids = inputs.input_ids
+    # attention_mask = inputs.attention_mask
+    # generate_kwargs = dict(
+    #     input_ids=input_ids,
+    #     do_sample=True,
+    #     attention_mask=attention_mask,
+    #     pad_token_id=tokenizer.eos_token_id,
+    # )
+    # print("GENERATE_KWARGS", generate_kwargs)
+    # with torch.no_grad():
+    #     outputs = model.generate(**generate_kwargs)
+    # print("OUTPUTS", outputs)
+    # generated_text = tokenizer.decode(outputs[0])
     print(generated_text)
     return generated_text
 
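Two details in the added code are worth double-checking. The client is constructed with `api_key=api_token`, but the only token defined in the diff is `LEPTON_API_TOKEN`, so `api_token` appears to be an undefined name. Also, with the `openai>=1.0` client returned by `openai.OpenAI(...)`, the completion result is a typed object, so the text is read with attribute access (`response.choices[0].text`) rather than the dict indexing used in the diff. The sketch below is a minimal, hedged version of the Lepton-backed call under those assumptions; the `PROMPT` string is a stand-in for the evaluation template defined elsewhere in app.py, and the `model` name is kept exactly as it appears in the diff.

```python
import os
import openai

# Assumption: the Lepton deployment token is the one read from the environment here.
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)

# Placeholder for the hallucination-evaluation template defined elsewhere in app.py.
PROMPT = (
    "Question: {question}\n"
    "Document: {document}\n"
    "Answer: {answer}\n"
    "Is the answer faithful to the document? Respond PASS or FAIL with reasoning."
)

# OpenAI-compatible client pointed at the Lepton deployment from the diff.
client = openai.OpenAI(
    base_url="https://yb15a7dy-lynx-70b.tin.lepton.run/api/v1/",
    api_key=LEPTON_API_TOKEN,  # assumed fix for the undefined `api_token` name
)

def model_call(question, document, answer):
    new_format = PROMPT.format(question=question, document=document, answer=answer)
    response = client.completions.create(
        model="gpt-3.5-turbo-instruct",  # model name kept as written in the diff
        prompt=new_format,
    )
    # openai>=1.0 returns typed objects, so use attribute access instead of
    # response['choices'][0]['text'] as written in the diff.
    return response.choices[0].text
```

If the endpoint rejects that placeholder model name, `client.models.list()` can be used to see which names the deployment accepts, assuming it exposes the standard OpenAI-compatible `/models` route.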