Spaces:

Condense-AI
/

Fast-PDF-Chat

Sleeping

toilaluan committed on Dec 19, 2024

Commit

3ef1793

1 Parent(s): ea219dd

update

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
 from gradio_pdf import PDF
-from transformers import AutoModelForCausalLM, AutoTokenizer
 from pathlib import Path
 from markitdown import MarkItDown
 from utils import generate_answer, get_condense_kv_cache
@@ -10,18 +10,23 @@ import torch
 MID = MarkItDown()
 MODEL_ID = "unsloth/Mistral-7B-Instruct-v0.2"
-MODEL = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to("cuda")
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
 MAX_CHARS_TO_COMPRESS = 15000
 @torch.no_grad()
-@spaces.GPU
 def get_model_kv_cache(context_ids):
     context_ids = context_ids.to("cuda")
     past_key_values = MODEL(context_ids, num_logits_to_keep=1).past_key_values
     return past_key_values
 def inference(question: str, doc_path: str, use_turbo=True) -> str:
     question = "\n\nBased on above informations, answer this question: " + question
     doc_md = MID.convert(doc_path)
     doc_text = doc_md.text_content[:20000]
@@ -51,7 +56,6 @@ demo = gr.Interface(
     inference,
     [gr.Textbox(label="Question"), PDF(label="Document"), gr.Checkbox(label="Turbo Bittensor", info="Use Subnet 47 API for Prefilling")],
     gr.Textbox(),
-    examples=[["What is the total gross worth?", "phi-4.pdf"]]
 )
 if __name__ == "__main__":

 import gradio as gr
 from gradio_pdf import PDF
+from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
 from pathlib import Path
 from markitdown import MarkItDown
 from utils import generate_answer, get_condense_kv_cache
 MID = MarkItDown()
 MODEL_ID = "unsloth/Mistral-7B-Instruct-v0.2"
+MODEL = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
 MAX_CHARS_TO_COMPRESS = 15000
 @torch.no_grad()
 def get_model_kv_cache(context_ids):
     context_ids = context_ids.to("cuda")
     past_key_values = MODEL(context_ids, num_logits_to_keep=1).past_key_values
+    kv_cache = DynamicCache.from_legacy_cache(
+        past_key_values
+    )
     return past_key_values
+@spaces.GPU
 def inference(question: str, doc_path: str, use_turbo=True) -> str:
+    MODEL.to("cuda")
     question = "\n\nBased on above informations, answer this question: " + question
     doc_md = MID.convert(doc_path)
     doc_text = doc_md.text_content[:20000]
     inference,
     [gr.Textbox(label="Question"), PDF(label="Document"), gr.Checkbox(label="Turbo Bittensor", info="Use Subnet 47 API for Prefilling")],
     gr.Textbox(),
 )
 if __name__ == "__main__":

utils.py CHANGED Viewed

@@ -10,7 +10,6 @@ import spaces
 os.makedirs("tmp", exist_ok=True)
-@spaces.GPU
 def generate_answer(
     model, tokenizer, question_ids, cache, context_length, max_new_tokens
 ):

 os.makedirs("tmp", exist_ok=True)
 def generate_answer(
     model, tokenizer, question_ids, cache, context_length, max_new_tokens
 ):