Update backend to use the Lepton API rather than a locally created HF model
app.py CHANGED
@@ -3,16 +3,25 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import gradio as gr
 import spaces
+import openai
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
+LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
 # if torch.cuda.is_available():
 #     device = "cuda:0"
 # else:
 #     device = "cpu"
 
-tokenizer = AutoTokenizer.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct")
-model = AutoModelForCausalLM.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct", torch_dtype=torch.float16, device_map="auto")
-model.gradient_checkpointing_enable()
+# Set up client to call inference
+client=openai.OpenAI(
+    base_url="https://yb15a7dy-lynx-70b.tin.lepton.run/api/v1/",
+    api_key=api_token
+)
+
+# Create own model
+# tokenizer = AutoTokenizer.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct")
+# model = AutoModelForCausalLM.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct", torch_dtype=torch.float16, device_map="auto")
+# model.gradient_checkpointing_enable()
 
 # def load_model_and_tokenizer(model_choice):
 #     if model_choice == "Patronus Lynx 8B":
@@ -76,27 +85,32 @@ HEADER = """
 **Getting Started**: Provide a question and document or context given to your model in addition to the answer given by the model and then click submit. The output panel will indicate whether the reponse is a hallucination (Fail) or if it is faithful to the given document or context (Pass) through the score Pass or Fail and provide reasoning behind the score.
 """
 
-@spaces.GPU()
+# @spaces.GPU()
 # def model_call(question, document, answer, tokenizer, model):
 def model_call(question, document, answer):
     # device = next(model.parameters()).device
     NEW_FORMAT = PROMPT.format(question=question, document=document, answer=answer)
     print("ENTIRE NEW_FORMAT", NEW_FORMAT)
-    inputs = tokenizer(NEW_FORMAT, return_tensors="pt")
-    print("INPUTS", inputs)
-    input_ids = inputs.input_ids
-    attention_mask = inputs.attention_mask
-    generate_kwargs = dict(
-        input_ids=input_ids,
-        do_sample=True,
-        attention_mask=attention_mask,
-        pad_token_id=tokenizer.eos_token_id,
+    response = client.completions.create(
+        model="gpt-3.5-turbo-instruct",
+        prompt=NEW_FORMAT
     )
-    print("GENERATE_KWARGS", generate_kwargs)
-    with torch.no_grad():
-        outputs = model.generate(**generate_kwargs)
-    print("OUTPUTS", outputs)
-    generated_text = tokenizer.decode(outputs[0])
+    generated_text = response['choices'][0]['text']
+    # inputs = tokenizer(NEW_FORMAT, return_tensors="pt")
+    # print("INPUTS", inputs)
+    # input_ids = inputs.input_ids
+    # attention_mask = inputs.attention_mask
+    # generate_kwargs = dict(
+    #     input_ids=input_ids,
+    #     do_sample=True,
+    #     attention_mask=attention_mask,
+    #     pad_token_id=tokenizer.eos_token_id,
+    # )
+    # print("GENERATE_KWARGS", generate_kwargs)
+    # with torch.no_grad():
+    #     outputs = model.generate(**generate_kwargs)
+    # print("OUTPUTS", outputs)
+    # generated_text = tokenizer.decode(outputs[0])
     print(generated_text)
     return generated_text
 
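Two details in the added code are worth double-checking. The client is constructed with `api_key=api_token`, but the only token defined in the diff is `LEPTON_API_TOKEN`, so `api_token` appears to be an undefined name. Also, with the `openai>=1.0` client returned by `openai.OpenAI(...)`, the completion result is a typed object, so the text is read with attribute access (`response.choices[0].text`) rather than the dict indexing used in the diff. The sketch below is a minimal, hedged version of the Lepton-backed call under those assumptions; the `PROMPT` string is a stand-in for the evaluation template defined elsewhere in app.py, and the `model` name is kept exactly as it appears in the diff.

```python
import os
import openai

# Assumption: the Lepton deployment token is the one read from the environment here.
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)

# Placeholder for the hallucination-evaluation template defined elsewhere in app.py.
PROMPT = (
    "Question: {question}\n"
    "Document: {document}\n"
    "Answer: {answer}\n"
    "Is the answer faithful to the document? Respond PASS or FAIL with reasoning."
)

# OpenAI-compatible client pointed at the Lepton deployment from the diff.
client = openai.OpenAI(
    base_url="https://yb15a7dy-lynx-70b.tin.lepton.run/api/v1/",
    api_key=LEPTON_API_TOKEN,  # assumed fix for the undefined `api_token` name
)

def model_call(question, document, answer):
    new_format = PROMPT.format(question=question, document=document, answer=answer)
    response = client.completions.create(
        model="gpt-3.5-turbo-instruct",  # model name kept as written in the diff
        prompt=new_format,
    )
    # openai>=1.0 returns typed objects, so use attribute access instead of
    # response['choices'][0]['text'] as written in the diff.
    return response.choices[0].text
```

If the endpoint rejects that placeholder model name, `client.models.list()` can be used to see which names the deployment accepts, assuming it exposes the standard OpenAI-compatible `/models` route.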