allenpark committed
Commit 5b6755d · verified · 1 Parent(s): b96a541

Update backend to use Lepton API rather than created HF model

Files changed (1)
  app.py  +32 -18
app.py CHANGED
@@ -3,16 +3,25 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import gradio as gr
 import spaces
+import openai
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
+LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)
 # if torch.cuda.is_available():
 #     device = "cuda:0"
 # else:
 #     device = "cpu"
 
-tokenizer = AutoTokenizer.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct")
-model = AutoModelForCausalLM.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct", torch_dtype=torch.float16, device_map="auto")
-model.gradient_checkpointing_enable()
+# Set up client to call inference
+client=openai.OpenAI(
+    base_url="https://yb15a7dy-lynx-70b.tin.lepton.run/api/v1/",
+    api_key=api_token
+)
+
+# Create own model
+# tokenizer = AutoTokenizer.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct")
+# model = AutoModelForCausalLM.from_pretrained("PatronusAI/Llama-3-Patronus-Lynx-8B-Instruct", torch_dtype=torch.float16, device_map="auto")
+# model.gradient_checkpointing_enable()
 
 # def load_model_and_tokenizer(model_choice):
 #     if model_choice == "Patronus Lynx 8B":
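Note on the client setup above: the committed code passes `api_key=api_token`, but the visible diff only defines `LEPTON_API_TOKEN`; `api_token` does not appear to be defined anywhere in the changed lines. A minimal sketch of the intended setup, assuming the key is meant to come from the `LEPTON_API_TOKEN` environment variable:

```python
import os
import openai

# Assumption: the key is read from LEPTON_API_TOKEN, as defined earlier in the diff.
# The committed code references an undefined `api_token` name instead.
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)

# Point the OpenAI-compatible client at the Lepton deployment from the diff.
client = openai.OpenAI(
    base_url="https://yb15a7dy-lynx-70b.tin.lepton.run/api/v1/",
    api_key=LEPTON_API_TOKEN,
)
```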
@@ -76,27 +85,32 @@ HEADER = """
 **Getting Started**: Provide a question and document or context given to your model in addition to the answer given by the model and then click submit. The output panel will indicate whether the reponse is a hallucination (Fail) or if it is faithful to the given document or context (Pass) through the score Pass or Fail and provide reasoning behind the score.
 """
 
-@spaces.GPU()
+# @spaces.GPU()
 # def model_call(question, document, answer, tokenizer, model):
 def model_call(question, document, answer):
     # device = next(model.parameters()).device
     NEW_FORMAT = PROMPT.format(question=question, document=document, answer=answer)
     print("ENTIRE NEW_FORMAT", NEW_FORMAT)
-    inputs = tokenizer(NEW_FORMAT, return_tensors="pt")
-    print("INPUTS", inputs)
-    input_ids = inputs.input_ids
-    attention_mask = inputs.attention_mask
-    generate_kwargs = dict(
-        input_ids=input_ids,
-        do_sample=True,
-        attention_mask=attention_mask,
-        pad_token_id=tokenizer.eos_token_id,
+    response = client.completions.create(
+        model="gpt-3.5-turbo-instruct",
+        prompt=NEW_FORMAT
     )
-    print("GENERATE_KWARGS", generate_kwargs)
-    with torch.no_grad():
-        outputs = model.generate(**generate_kwargs)
-    print("OUTPUTS", outputs)
-    generated_text = tokenizer.decode(outputs[0])
+    generated_text = response['choices'][0]['text']
+    # inputs = tokenizer(NEW_FORMAT, return_tensors="pt")
+    # print("INPUTS", inputs)
+    # input_ids = inputs.input_ids
+    # attention_mask = inputs.attention_mask
+    # generate_kwargs = dict(
+    #     input_ids=input_ids,
+    #     do_sample=True,
+    #     attention_mask=attention_mask,
+    #     pad_token_id=tokenizer.eos_token_id,
+    # )
+    # print("GENERATE_KWARGS", generate_kwargs)
+    # with torch.no_grad():
+    #     outputs = model.generate(**generate_kwargs)
+    # print("OUTPUTS", outputs)
+    # generated_text = tokenizer.decode(outputs[0])
     print(generated_text)
     return generated_text
 
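Note on the new `model_call`: the completion is read with `response['choices'][0]['text']`, but with the `openai.OpenAI` client constructed above (openai>=1.0), responses are returned as objects rather than dicts, so dict-style indexing would likely fail and attribute access is the expected pattern. A minimal sketch under that assumption, reusing the `client` and `PROMPT` already defined in app.py:

```python
def model_call(question, document, answer):
    # Fill the evaluation prompt with the user-supplied fields.
    new_format = PROMPT.format(question=question, document=document, answer=answer)
    # Call the Lepton deployment through the OpenAI-compatible completions API.
    # Model name kept as committed; the base_url suggests the deployment serves Lynx 70B.
    response = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=new_format,
    )
    # openai>=1.0 returns a Completion object, so read fields via attributes.
    return response.choices[0].text
```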