toilaluan committed on
Commit ea219dd · 1 Parent(s): 1040659
Files changed (3)
  1. app.py +55 -4
  2. requirements.txt +4 -0
  3. utils.py +127 -0
app.py CHANGED
@@ -1,7 +1,58 @@
  import gradio as gr

- def greet(name):
-     return "Hello " + name + "!!"

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
  import gradio as gr
+ from gradio_pdf import PDF
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from pathlib import Path
+ from markitdown import MarkItDown
+ from utils import generate_answer, get_condense_kv_cache
+ import spaces
+ import torch


+ MID = MarkItDown()
+ MODEL_ID = "unsloth/Mistral-7B-Instruct-v0.2"
+ MODEL = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to("cuda")
+ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
+ MAX_CHARS_TO_COMPRESS = 15000
+
+ @torch.no_grad()
+ @spaces.GPU
+ def get_model_kv_cache(context_ids):
+     # Prefill the compressed part of the document locally to build a KV cache.
+     context_ids = context_ids.to("cuda")
+     past_key_values = MODEL(context_ids, num_logits_to_keep=1).past_key_values
+     return past_key_values
+
+ def inference(question: str, doc_path: str, use_turbo=True) -> str:
+     # Convert the document to text, compress its first MAX_CHARS_TO_COMPRESS
+     # characters into a KV cache, and answer the question from the remainder.
+     question = "\n\nBased on the above information, answer this question: " + question
+     doc_md = MID.convert(doc_path)
+     doc_text = doc_md.text_content[:20000]
+     to_compress_doc = "<s> [INST] " + doc_text[:MAX_CHARS_TO_COMPRESS]
+     remaining_doc_and_question_prompt = doc_text[MAX_CHARS_TO_COMPRESS:] + question + " [/INST] "
+     prompt_ids = TOKENIZER.encode(remaining_doc_and_question_prompt, add_special_tokens=False, return_tensors="pt")
+     context_ids = TOKENIZER.encode(to_compress_doc, add_special_tokens=False, return_tensors="pt")
+     context_length = context_ids.shape[1]
+     if use_turbo:
+         # Offload prefilling to the Condense (Bittensor Subnet 47) API.
+         print("turbo-mode-on")
+         kv_cache = get_condense_kv_cache(to_compress_doc)
+         kv_cache = kv_cache.to("cuda")
+     else:
+         # Prefill locally with the full model.
+         print("turbo-mode-off")
+         kv_cache = get_model_kv_cache(context_ids)
+
+     print("kv-length", kv_cache.get_seq_length())
+
+     answer = generate_answer(MODEL, TOKENIZER, prompt_ids, kv_cache, context_length, 128)
+     print(answer)
+     return answer
+
+
+ demo = gr.Interface(
+     inference,
+     [gr.Textbox(label="Question"), PDF(label="Document"), gr.Checkbox(label="Turbo Bittensor", info="Use Subnet 47 API for Prefilling")],
+     gr.Textbox(),
+     examples=[["What is the total gross worth?", "phi-4.pdf"]],
+ )
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
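
For local testing, the new inference() function can also be called directly, bypassing the Gradio UI. The sketch below is illustrative only and is not part of this commit: sample.pdf is a hypothetical document path, a CUDA GPU is assumed, and turbo mode additionally requires a valid CONDENSE_API_KEY in the environment.

# Minimal sketch of calling inference() outside the Gradio interface.
from app import inference

answer = inference(
    question="What is the total gross worth?",
    doc_path="sample.pdf",    # hypothetical example document
    use_turbo=False,          # set True to prefill via the Subnet 47 API
)
print(answer)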
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ markitdown
+ transformers
+ gradio_pdf
+ hf_transfer
utils.py ADDED
@@ -0,0 +1,127 @@
+ import torch
+ import os
+ import requests
+ import hf_transfer
+ import httpx
+ import numpy as np
+ import io
+ from tqdm import tqdm
+ from transformers import DynamicCache
+ import spaces
+
+ os.makedirs("tmp", exist_ok=True)
+
+ @spaces.GPU
+ def generate_answer(
+     model, tokenizer, question_ids, cache, context_length, max_new_tokens
+ ):
+     """
+     Generate an answer to a question using greedy decoding.
+
+     Parameters:
+         model: Model instance
+         tokenizer: Tokenizer instance
+         question_ids (torch.Tensor): Tokenized question.
+         cache (DynamicCache): Key-value cache.
+         context_length (int): Length of the context.
+         max_new_tokens (int): Max number of tokens to generate.
+
+     Returns:
+         str: Generated answer.
+     """
+     question_ids = question_ids.to("cuda")
+     # Remember the per-layer cache length so the context cache can be restored
+     # after generation.
+     cache_seq_lengths = [
+         cache.get_seq_length(layer_idx) for layer_idx in range(len(cache))
+     ]
+
+     # Positions continue from the end of the (possibly compressed) context.
+     position_ids = torch.arange(
+         context_length, context_length + question_ids.shape[1], device=model.device
+     ).unsqueeze(0)
+
+     outputs = model(
+         input_ids=question_ids.to(model.device),
+         past_key_values=cache,
+         position_ids=position_ids,
+         num_logits_to_keep=1,
+     )
+
+     position_ids = position_ids[:, -1:] + 1
+     generated_ids = [outputs.logits[0, -1].argmax()]
+
+     # eos_token_id may be a single id or a list of ids depending on the model config.
+     eos_token_ids = model.generation_config.eos_token_id
+     if not isinstance(eos_token_ids, (list, tuple)):
+         eos_token_ids = [eos_token_ids]
+
+     # Greedy decoding, one token at a time, reusing the cache.
+     for step in range(max_new_tokens - 1):
+         outputs = model(
+             input_ids=generated_ids[-1].unsqueeze(0).unsqueeze(0),
+             past_key_values=cache,
+             position_ids=position_ids + step,
+         )
+         new_id = outputs.logits[0, -1].argmax()
+         generated_ids.append(new_id)
+         if new_id.item() in eos_token_ids:
+             break
+
+     answer = tokenizer.decode(torch.stack(generated_ids), skip_special_tokens=True)
+
+     # Trim the cache back to the original context length so it can be reused.
+     cache.key_cache = [
+         key[:, :, :c] for key, c in zip(cache.key_cache, cache_seq_lengths)
+     ]
+     cache.value_cache = [
+         value[:, :, :c] for value, c in zip(cache.value_cache, cache_seq_lengths)
+     ]
+
+     return answer
+
+ def get_condense_kv_cache(context: str):
+     # Request a compressed KV cache for the context from the Condense API
+     # (Bittensor Subnet 47) and rebuild it as a DynamicCache on the GPU.
+     url = "https://ncs-client.condenses.ai/api/organic"
+     payload = {
+         "tier": "research",
+         "target_model": "mistralai/Mistral-7B-Instruct-v0.2",
+         "context": context,
+         "top_incentive": 0.1,
+     }
+     headers = {
+         "accept": "application/json",
+         "content-type": "application/json",
+         "user-api-key": os.getenv("CONDENSE_API_KEY"),
+     }
+     response = requests.post(url, json=payload, headers=headers).json()
+     print(response)
+     numpy_kv_cache, error = load_npy_from_url(response["compressed_kv_url"])
+     kv_cache = DynamicCache.from_legacy_cache(
+         torch.from_numpy(numpy_kv_cache).to("cuda").to(torch.bfloat16)
+     )
+     return kv_cache
+
+ def load_npy_from_url(url, max_size_mb=1024):
+     """
+     Load a `.npy` file from a URL using hf_transfer.
+
+     Parameters:
+         url (str): URL of the `.npy` file.
+         max_size_mb (int): Max file size in megabytes.
+
+     Returns:
+         tuple: (Loaded NumPy array, Error message).
+     """
+     try:
+         # Check the reported file size before downloading.
+         with httpx.Client() as client:
+             response = client.head(url)
+             if response.status_code != 200:
+                 return None, f"Failed to fetch file info: HTTP {response.status_code}"
+
+             content_length = int(response.headers.get("content-length", 0))
+             if content_length > max_size_mb * 1024 * 1024:
+                 return None, f"File too large: {content_length / (1024 * 1024):.1f}MB exceeds {max_size_mb}MB limit"
+
+         filename = os.path.join("tmp", url.split("/")[-1])
+         with tqdm(total=content_length, unit="B", unit_scale=True, desc="Downloading") as pbar:
+             hf_transfer.download(
+                 url=url, filename=filename, chunk_size=1024 * 1024, callback=pbar.update
+             )
+
+         with open(filename, "rb") as f:
+             buffer = io.BytesIO(f.read())
+             data = np.load(buffer)
+
+         os.remove(filename)
+         return data, ""
+     except Exception as e:
+         return None, str(e)
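
For reference, a minimal sketch of how these helpers compose outside the Space, mirroring what app.py does. It assumes the same Mistral checkpoint, a CUDA GPU, and a CONDENSE_API_KEY in the environment; the context and question strings are illustrative placeholders, not part of the commit.

# Sketch: compressed prefill via the Condense API, then greedy decoding.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import generate_answer, get_condense_kv_cache

model_id = "unsloth/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)

context = "<s> [INST] " + "Some long document text..."  # illustrative context
question = "\n\nBased on the above information, answer this question: What is it about? [/INST] "

context_ids = tokenizer.encode(context, add_special_tokens=False, return_tensors="pt")
question_ids = tokenizer.encode(question, add_special_tokens=False, return_tensors="pt")

# Compressed prefill via the Condense API (requires CONDENSE_API_KEY)...
cache = get_condense_kv_cache(context)
# ...or, alternatively, a plain local prefill:
# cache = model(context_ids.to("cuda"), num_logits_to_keep=1).past_key_values

answer = generate_answer(model, tokenizer, question_ids, cache, context_ids.shape[1], 64)
print(answer)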