toilaluan committed on
Commit ea219dd · 1 Parent(s): 1040659
Files changed (3)
  1. app.py +55 -4
  2. requirements.txt +4 -0
  3. utils.py +127 -0
app.py CHANGED
@@ -1,7 +1,58 @@
  import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
  import gradio as gr
+ from gradio_pdf import PDF
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from pathlib import Path
+ from markitdown import MarkItDown
+ from utils import generate_answer, get_condense_kv_cache
+ import spaces
+ import torch


+ MID = MarkItDown()
+ MODEL_ID = "unsloth/Mistral-7B-Instruct-v0.2"
+ MODEL = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to("cuda")
+ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
+ MAX_CHARS_TO_COMPRESS = 15000
+
+ @torch.no_grad()
+ @spaces.GPU
+ def get_model_kv_cache(context_ids):
+     # Prefill the compressed part of the document locally to build a KV cache.
+     context_ids = context_ids.to("cuda")
+     past_key_values = MODEL(context_ids, num_logits_to_keep=1).past_key_values
+     return past_key_values
+
+ def inference(question: str, doc_path: str, use_turbo=True) -> str:
+     # Convert the document to text, compress its first MAX_CHARS_TO_COMPRESS
+     # characters into a KV cache, and answer the question from the remainder.
+     question = "\n\nBased on the above information, answer this question: " + question
+     doc_md = MID.convert(doc_path)
+     doc_text = doc_md.text_content[:20000]
+     to_compress_doc = "<s> [INST] " + doc_text[:MAX_CHARS_TO_COMPRESS]
+     remaining_doc_and_question_prompt = doc_text[MAX_CHARS_TO_COMPRESS:] + question + " [/INST] "
+     prompt_ids = TOKENIZER.encode(remaining_doc_and_question_prompt, add_special_tokens=False, return_tensors="pt")
+     context_ids = TOKENIZER.encode(to_compress_doc, add_special_tokens=False, return_tensors="pt")
+     context_length = context_ids.shape[1]
+     if use_turbo:
+         # Offload prefilling to the Condense (Bittensor Subnet 47) API.
+         print("turbo-mode-on")
+         kv_cache = get_condense_kv_cache(to_compress_doc)
+         kv_cache = kv_cache.to("cuda")
+     else:
+         # Prefill locally with the full model.
+         print("turbo-mode-off")
+         kv_cache = get_model_kv_cache(context_ids)
+
+     print("kv-length", kv_cache.get_seq_length())
+
+     answer = generate_answer(MODEL, TOKENIZER, prompt_ids, kv_cache, context_length, 128)
+     print(answer)
+     return answer
+
+
+ demo = gr.Interface(
+     inference,
+     [gr.Textbox(label="Question"), PDF(label="Document"), gr.Checkbox(label="Turbo Bittensor", info="Use Subnet 47 API for Prefilling")],
+     gr.Textbox(),
+     examples=[["What is the total gross worth?", "phi-4.pdf"]],
+ )
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
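
For local testing, the new inference() function can also be called directly, bypassing the Gradio UI. The sketch below is illustrative only and is not part of this commit: sample.pdf is a hypothetical document path, a CUDA GPU is assumed, and turbo mode additionally requires a valid CONDENSE_API_KEY in the environment.

# Minimal sketch of calling inference() outside the Gradio interface.
from app import inference

answer = inference(
    question="What is the total gross worth?",
    doc_path="sample.pdf",    # hypothetical example document
    use_turbo=False,          # set True to prefill via the Subnet 47 API
)
print(answer)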
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ markitdown
+ transformers
+ gradio_pdf
+ hf_transfer
utils.py ADDED
@@ -0,0 +1,127 @@
+ import torch
+ import os
+ import requests
+ import hf_transfer
+ import httpx
+ import numpy as np
+ import io
+ from tqdm import tqdm
+ from transformers import DynamicCache
+ import spaces
+
+ os.makedirs("tmp", exist_ok=True)
+
+ @spaces.GPU
+ def generate_answer(
+     model, tokenizer, question_ids, cache, context_length, max_new_tokens
+ ):
+     """
+     Generate an answer to a question using greedy decoding.
+
+     Parameters:
+         model: Model instance
+         tokenizer: Tokenizer instance
+         question_ids (torch.Tensor): Tokenized question.
+         cache (DynamicCache): Key-value cache.
+         context_length (int): Length of the context.
+         max_new_tokens (int): Max number of tokens to generate.
+
+     Returns:
+         str: Generated answer.
+     """
+     question_ids = question_ids.to("cuda")
+     # Remember the per-layer cache length so the context cache can be restored
+     # after generation.
+     cache_seq_lengths = [
+         cache.get_seq_length(layer_idx) for layer_idx in range(len(cache))
+     ]
+
+     # Positions continue from the end of the (possibly compressed) context.
+     position_ids = torch.arange(
+         context_length, context_length + question_ids.shape[1], device=model.device
+     ).unsqueeze(0)
+
+     outputs = model(
+         input_ids=question_ids.to(model.device),
+         past_key_values=cache,
+         position_ids=position_ids,
+         num_logits_to_keep=1,
+     )
+
+     position_ids = position_ids[:, -1:] + 1
+     generated_ids = [outputs.logits[0, -1].argmax()]
+
+     # eos_token_id may be a single id or a list of ids depending on the model config.
+     eos_token_ids = model.generation_config.eos_token_id
+     if not isinstance(eos_token_ids, (list, tuple)):
+         eos_token_ids = [eos_token_ids]
+
+     # Greedy decoding, one token at a time, reusing the cache.
+     for step in range(max_new_tokens - 1):
+         outputs = model(
+             input_ids=generated_ids[-1].unsqueeze(0).unsqueeze(0),
+             past_key_values=cache,
+             position_ids=position_ids + step,
+         )
+         new_id = outputs.logits[0, -1].argmax()
+         generated_ids.append(new_id)
+         if new_id.item() in eos_token_ids:
+             break
+
+     answer = tokenizer.decode(torch.stack(generated_ids), skip_special_tokens=True)
+
+     # Trim the cache back to the original context length so it can be reused.
+     cache.key_cache = [
+         key[:, :, :c] for key, c in zip(cache.key_cache, cache_seq_lengths)
+     ]
+     cache.value_cache = [
+         value[:, :, :c] for value, c in zip(cache.value_cache, cache_seq_lengths)
+     ]
+
+     return answer
+
+ def get_condense_kv_cache(context: str):
+     # Request a compressed KV cache for the context from the Condense API
+     # (Bittensor Subnet 47) and rebuild it as a DynamicCache on the GPU.
+     url = "https://ncs-client.condenses.ai/api/organic"
+     payload = {
+         "tier": "research",
+         "target_model": "mistralai/Mistral-7B-Instruct-v0.2",
+         "context": context,
+         "top_incentive": 0.1,
+     }
+     headers = {
+         "accept": "application/json",
+         "content-type": "application/json",
+         "user-api-key": os.getenv("CONDENSE_API_KEY"),
+     }
+     response = requests.post(url, json=payload, headers=headers).json()
+     print(response)
+     numpy_kv_cache, error = load_npy_from_url(response["compressed_kv_url"])
+     kv_cache = DynamicCache.from_legacy_cache(
+         torch.from_numpy(numpy_kv_cache).to("cuda").to(torch.bfloat16)
+     )
+     return kv_cache
+
+ def load_npy_from_url(url, max_size_mb=1024):
+     """
+     Load a `.npy` file from a URL using hf_transfer.
+
+     Parameters:
+         url (str): URL of the `.npy` file.
+         max_size_mb (int): Max file size in megabytes.
+
+     Returns:
+         tuple: (Loaded NumPy array, Error message).
+     """
+     try:
+         # Check the reported file size before downloading.
+         with httpx.Client() as client:
+             response = client.head(url)
+             if response.status_code != 200:
+                 return None, f"Failed to fetch file info: HTTP {response.status_code}"
+
+             content_length = int(response.headers.get("content-length", 0))
+             if content_length > max_size_mb * 1024 * 1024:
+                 return None, f"File too large: {content_length / (1024 * 1024):.1f}MB exceeds {max_size_mb}MB limit"
+
+         filename = os.path.join("tmp", url.split("/")[-1])
+         with tqdm(total=content_length, unit="B", unit_scale=True, desc="Downloading") as pbar:
+             hf_transfer.download(
+                 url=url, filename=filename, chunk_size=1024 * 1024, callback=pbar.update
+             )
+
+         with open(filename, "rb") as f:
+             buffer = io.BytesIO(f.read())
+             data = np.load(buffer)
+
+         os.remove(filename)
+         return data, ""
+     except Exception as e:
+         return None, str(e)
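
For reference, a minimal sketch of how these helpers compose outside the Space, mirroring what app.py does. It assumes the same Mistral checkpoint, a CUDA GPU, and a CONDENSE_API_KEY in the environment; the context and question strings are illustrative placeholders, not part of the commit.

# Sketch: compressed prefill via the Condense API, then greedy decoding.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import generate_answer, get_condense_kv_cache

model_id = "unsloth/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)

context = "<s> [INST] " + "Some long document text..."  # illustrative context
question = "\n\nBased on the above information, answer this question: What is it about? [/INST] "

context_ids = tokenizer.encode(context, add_special_tokens=False, return_tensors="pt")
question_ids = tokenizer.encode(question, add_special_tokens=False, return_tensors="pt")

# Compressed prefill via the Condense API (requires CONDENSE_API_KEY)...
cache = get_condense_kv_cache(context)
# ...or, alternatively, a plain local prefill:
# cache = model(context_ids.to("cuda"), num_logits_to_keep=1).past_key_values

answer = generate_answer(model, tokenizer, question_ids, cache, context_ids.shape[1], 64)
print(answer)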