toilaluan committed
Commit 3ef1793 · 1 parent: ea219dd

Files changed (2):
  1. app.py +8 -4
  2. utils.py +0 -1
app.py CHANGED
```diff
@@ -1,6 +1,6 @@
 import gradio as gr
 from gradio_pdf import PDF
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
 from pathlib import Path
 from markitdown import MarkItDown
 from utils import generate_answer, get_condense_kv_cache
@@ -10,18 +10,23 @@ import torch
 
 MID = MarkItDown()
 MODEL_ID = "unsloth/Mistral-7B-Instruct-v0.2"
-MODEL = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to("cuda")
+MODEL = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
 MAX_CHARS_TO_COMPRESS = 15000
 
 @torch.no_grad()
-@spaces.GPU
+
 def get_model_kv_cache(context_ids):
     context_ids = context_ids.to("cuda")
     past_key_values = MODEL(context_ids, num_logits_to_keep=1).past_key_values
+    kv_cache = DynamicCache.from_legacy_cache(
+        past_key_values
+    )
     return past_key_values
 
+@spaces.GPU
 def inference(question: str, doc_path: str, use_turbo=True) -> str:
+    MODEL.to("cuda")
     question = "\n\nBased on above informations, answer this question: " + question
     doc_md = MID.convert(doc_path)
     doc_text = doc_md.text_content[:20000]
@@ -51,7 +56,6 @@ demo = gr.Interface(
     inference,
     [gr.Textbox(label="Question"), PDF(label="Document"), gr.Checkbox(label="Turbo Bittensor", info="Use Subnet 47 API for Prefilling")],
     gr.Textbox(),
-    examples=[["What is the total gross worth?", "phi-4.pdf"]]
 )
 
 if __name__ == "__main__":
```
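Taken together, the app.py changes follow the standard Hugging Face ZeroGPU pattern: load the model on CPU at import time, and attach the GPU only inside a single `@spaces.GPU`-decorated entry point (`inference`), which moves the weights over with `MODEL.to("cuda")` once the GPU is available. A minimal sketch of that pattern, assuming a ZeroGPU Space with the `spaces` package installed (`run` is a hypothetical stand-in for this app's `inference`):

```python
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "unsloth/Mistral-7B-Instruct-v0.2"
# Load on CPU at import time: on ZeroGPU hardware there is no CUDA
# device outside of @spaces.GPU-decorated functions.
MODEL = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)

@spaces.GPU  # a GPU is attached only for the duration of this call
def run(prompt: str) -> str:
    MODEL.to("cuda")  # move the weights onto the just-attached GPU
    ids = TOKENIZER(prompt, return_tensors="pt").input_ids.to("cuda")
    out = MODEL.generate(ids, max_new_tokens=64)
    return TOKENIZER.decode(out[0], skip_special_tokens=True)
```

This is also why the decorator moved off `get_model_kv_cache`: that helper is only ever called from within `inference`, so it now runs inside the GPU window its caller already holds.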
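One detail worth flagging in the new `get_model_kv_cache`: the commit builds `kv_cache` via `DynamicCache.from_legacy_cache(...)` but still returns the untouched `past_key_values`, so the converted cache is unused as written. If the intent was to hand callers a `DynamicCache` (the cache object recent transformers versions expect in place of the legacy tuple-of-tuples format), the helper would presumably look like this (a sketch, not part of the commit; `MODEL` is the module-level model from app.py):

```python
import torch
from transformers import DynamicCache

@torch.no_grad()
def get_model_kv_cache(context_ids):
    context_ids = context_ids.to("cuda")
    # Prefill: run the context through the model once, keeping only the
    # KV cache (num_logits_to_keep=1 avoids materializing logits for
    # every position).
    past_key_values = MODEL(context_ids, num_logits_to_keep=1).past_key_values
    # Return the converted cache rather than the legacy tuple.
    return DynamicCache.from_legacy_cache(past_key_values)
```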
utils.py CHANGED
```diff
@@ -10,7 +10,6 @@ import spaces
 
 os.makedirs("tmp", exist_ok=True)
 
-@spaces.GPU
 def generate_answer(
     model, tokenizer, question_ids, cache, context_length, max_new_tokens
 ):
```
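The utils.py change is the counterpart of the app.py one: `generate_answer` loses its own `@spaces.GPU` because it is now only called from the already-decorated `inference`, and nesting the decorator would request a GPU allocation for work that is already running inside one. A minimal sketch of the resulting call structure (signatures taken from the diff, bodies elided):

```python
import spaces

def generate_answer(model, tokenizer, question_ids, cache,
                    context_length, max_new_tokens):
    # Plain function: assumes the caller has already attached the GPU.
    ...

@spaces.GPU
def inference(question: str, doc_path: str, use_turbo=True) -> str:
    # All GPU work, including the generate_answer call, happens inside
    # this single decorated entry point.
    ...
```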