update
- app.py +55 -4
- requirements.txt +4 -0
- utils.py +127 -0
app.py
CHANGED
@@ -1,7 +1,58 @@
 import gradio as gr
+from gradio_pdf import PDF
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from pathlib import Path
+from markitdown import MarkItDown
+from utils import generate_answer, get_condense_kv_cache
+import spaces
+import torch

-def greet(name):
-    return "Hello " + name + "!!"

-
-
+MID = MarkItDown()
+MODEL_ID = "unsloth/Mistral-7B-Instruct-v0.2"
+MODEL = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16).to("cuda")
+TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
+MAX_CHARS_TO_COMPRESS = 15000
+
+@torch.no_grad()
+@spaces.GPU
+def get_model_kv_cache(context_ids):
+    context_ids = context_ids.to("cuda")
+    past_key_values = MODEL(context_ids, num_logits_to_keep=1).past_key_values
+    return past_key_values
+
+def inference(question: str, doc_path: str, use_turbo=True) -> str:
+    question = "\n\nBased on above informations, answer this question: " + question
+    doc_md = MID.convert(doc_path)
+    doc_text = doc_md.text_content[:20000]
+    to_compress_doc = "<s> [INST] " + doc_text[:MAX_CHARS_TO_COMPRESS]
+    remaining_doc_and_question_prompt = doc_text[MAX_CHARS_TO_COMPRESS:] + question + " [/INST] "
+    prompt_ids = TOKENIZER.encode(remaining_doc_and_question_prompt, add_special_tokens=False, return_tensors="pt")
+    context_ids = TOKENIZER.encode(to_compress_doc, add_special_tokens=False, return_tensors="pt")
+    context_length = context_ids.shape[1]
+    if use_turbo:
+        print("turbo-mode-on")
+        kv_cache = get_condense_kv_cache(to_compress_doc)
+        kv_cache = kv_cache.to("cuda")
+    else:
+        print("turbo-mode-off")
+        kv_cache = get_model_kv_cache(context_ids)
+
+    print("kv-length", kv_cache.get_seq_length())
+
+    answer = generate_answer(MODEL, TOKENIZER, prompt_ids, kv_cache, context_length, 128)
+    print(answer)
+    return answer
+
+
+
+
+demo = gr.Interface(
+    inference,
+    [gr.Textbox(label="Question"), PDF(label="Document"), gr.Checkbox(label="Turbo Bittensor", info="Use Subnet 47 API for Prefilling")],
+    gr.Textbox(),
+    examples=[["What is the total gross worth?", "phi-4.pdf"]]
+)
+
+if __name__ == "__main__":
+    demo.launch(share=True)
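What the new app.py does, in short: it converts the uploaded PDF to text with MarkItDown, prefills the first MAX_CHARS_TO_COMPRESS characters into a KV cache (via the Condense / Subnet 47 API when the Turbo checkbox is on, or locally through get_model_kv_cache otherwise), and then generates over only the remaining text plus the question, which attends to that cache. Below is a minimal sketch of the same prefix-caching idea using plain transformers and its prompt-reuse pattern; the checkpoint name matches the one the Space loads, but doc_text is a hypothetical placeholder for the MarkItDown output and the snippet is an illustration, not the Space's actual code path.

# Minimal sketch of the prefix-caching idea behind app.py, assuming a plain
# local transformers setup (no Gradio, no Condense API). "doc_text" is a
# hypothetical placeholder for the extracted document text.
import copy
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model_id = "unsloth/Mistral-7B-Instruct-v0.2"  # same checkpoint as the Space
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to("cuda")

doc_text = "..."  # placeholder document text
prefix = "<s> [INST] " + doc_text
suffix = "\n\nBased on above informations, answer this question: What is the total gross worth? [/INST] "

# Prefill the document prefix once and keep its KV cache.
prefix_inputs = tokenizer(prefix, add_special_tokens=False, return_tensors="pt").to("cuda")
with torch.no_grad():
    prompt_cache = model(**prefix_inputs, past_key_values=DynamicCache()).past_key_values

# Reuse the cache: generate() only has to compute the new suffix tokens.
full_inputs = tokenizer(prefix + suffix, add_special_tokens=False, return_tensors="pt").to("cuda")
output = model.generate(**full_inputs, past_key_values=copy.deepcopy(prompt_cache), max_new_tokens=128)
print(tokenizer.decode(output[0, full_inputs["input_ids"].shape[1]:], skip_special_tokens=True))

The payoff of the split is that the long document is prefilled (or fetched already compressed) once, while each question only pays for its own suffix tokens.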
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+markitdown
+transformers
+gradio_pdf
+hf_transfer
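Beyond these four packages, the committed code also relies on gradio, torch, spaces, requests, numpy, httpx, and tqdm. On a Hugging Face Space most of those ship with the preinstalled runtime, so the short list above can work there; for a local run, a fuller, unpinned dependency sketch (an assumption, not part of the commit) would be:

# Fuller dependency sketch for running outside a Space (names only, unpinned);
# on Spaces, gradio, torch, and spaces are typically preinstalled.
markitdown
transformers
gradio_pdf
hf_transfer
gradio
torch
spaces
requests
numpy
httpx
tqdm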
utils.py
ADDED
@@ -0,0 +1,127 @@
+import torch
+import os
+import requests
+import hf_transfer
+import numpy as np
+import io
+from transformers import DynamicCache
+import os
+import spaces
+
+os.makedirs("tmp", exist_ok=True)
+
+@spaces.GPU
+def generate_answer(
+    model, tokenizer, question_ids, cache, context_length, max_new_tokens
+):
+    """
+    Generate an answer to a question using greedy decoding.
+
+    Parameters:
+    model: Model instance
+    tokenizer: Tokenizer instance
+    question_ids (torch.Tensor): Tokenized question.
+    cache (DynamicCache): Key-value cache.
+    context_length (int): Length of the context.
+    max_new_tokens (int): Max number of tokens to generate.
+
+    Returns:
+    str: Generated answer.
+    """
+    question_ids = question_ids.to("cuda")
+    cache_seq_lengths = [
+        cache.get_seq_length(layer_idx) for layer_idx in range(len(cache))
+    ]
+
+    position_ids = torch.arange(
+        context_length, context_length + question_ids.shape[1], device=model.device
+    ).unsqueeze(0)
+
+    outputs = model(
+        input_ids=question_ids.to(model.device),
+        past_key_values=cache,
+        position_ids=position_ids,
+        num_logits_to_keep=1,
+    )
+
+    position_ids = position_ids[:, -1:] + 1
+    generated_ids = [outputs.logits[0, -1].argmax()]
+
+    for _ in range(max_new_tokens - 1):
+        outputs = model(
+            input_ids=generated_ids[-1].unsqueeze(0).unsqueeze(0),
+            past_key_values=cache,
+            position_ids=position_ids + _,
+        )
+        new_id = outputs.logits[0, -1].argmax()
+        generated_ids.append(new_id)
+        if new_id.item() in model.generation_config.eos_token_id:
+            break
+
+    answer = tokenizer.decode(torch.stack(generated_ids), skip_special_tokens=True)
+
+    cache.key_cache = [
+        key[:, :, :c] for key, c in zip(cache.key_cache, cache_seq_lengths)
+    ]
+    cache.value_cache = [
+        value[:, :, :c] for value, c in zip(cache.value_cache, cache_seq_lengths)
+    ]
+
+    return answer
+
+def get_condense_kv_cache(context: str):
+    url = "https://ncs-client.condenses.ai/api/organic"
+    payload = {
+        "tier": "research",
+        "target_model": "mistralai/Mistral-7B-Instruct-v0.2",
+        "context": context,
+        "top_incentive": 0.1
+    }
+    headers = {
+        "accept": "application/json",
+        "content-type": "application/json",
+        "user-api-key": os.getenv("CONDENSE_API_KEY"),
+    }
+    response = requests.post(url, json=payload, headers=headers).json()
+    print(response)
+    numpy_kv_cache, error = load_npy_from_url(response["compressed_kv_url"])
+    kv_cache = DynamicCache.from_legacy_cache(
+        torch.from_numpy(numpy_kv_cache).to("cuda").to(torch.bfloat16)
+    )
+    return kv_cache
+
+def load_npy_from_url(url, max_size_mb=1024):
+    """
+    Load a `.npy` file from a URL using hf_transfer.
+
+    Parameters:
+    url (str): URL of the `.npy` file.
+    max_size_mb (int): Max file size in megabytes.
+
+    Returns:
+    tuple: (Loaded NumPy array, Error message).
+    """
+    try:
+        with httpx.Client() as client:
+            response = client.head(url)
+            if response.status_code != 200:
+                return None, f"Failed to fetch file info: HTTP {response.status_code}"
+
+            content_length = int(response.headers.get("content-length", 0))
+            if content_length > max_size_mb * 1024 * 1024:
+                return None, f"File too large: {content_length / (1024 * 1024):.1f}MB exceeds {max_size_mb}MB limit"
+
+        filename = os.path.join("tmp", url.split("/")[-1])
+        with tqdm(total=content_length, unit="B", unit_scale=True, desc="Downloading") as pbar:
+            hf_transfer.download(
+                url=url, filename=filename, chunk_size=1024 * 1024, callback=pbar.update
+            )
+
+        with open(filename, "rb") as f:
+            buffer = io.BytesIO(f.read())
+            data = np.load(buffer)
+
+        os.remove(filename)
+        return data, ""
+    except Exception as e:
+        return None, str(e)
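One caveat if utils.py is run outside the Space: load_npy_from_url calls httpx.Client() and wraps the download in tqdm(...), but neither name is imported (and os is imported twice), so the turbo path would hit a NameError. A minimal import block to make that path runnable, assuming the standard httpx and tqdm packages, would be:

# Imports that load_npy_from_url uses but utils.py never declares
# (assumed to be the standard httpx and tqdm packages).
import httpx
from tqdm import tqdm

As a design note, generate_answer records cache_seq_lengths before decoding and trims cache.key_cache / cache.value_cache back to those lengths afterwards, so the document-only cache is left intact and can be reused for further questions without re-prefilling.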