suvadityamuk committed
Commit 01e09d8 · 1 Parent(s): 0c9c63c

chore: download onnx-data on spaces

Signed-off-by: Suvaditya Mukherjee <[email protected]>

Files changed (2)
  1. app.py +21 -21
  2. utils.py +1 -132
app.py CHANGED
@@ -11,7 +11,7 @@ import psutil
 import pymupdf
 import gradio as gr
 from qdrant_client import QdrantClient
-from utils import download_pdf_from_gdrive, merge_strings_with_prefix, onnx_inference
+from utils import download_pdf_from_gdrive, merge_strings_with_prefix
 from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
 
 def rag_query(query: str):
@@ -101,25 +101,25 @@ if __name__ == "__main__":
     RESUME_PATH = os.path.join(os.getcwd(), "Resume.pdf")
     RESUME_URL = "https://drive.google.com/file/d/1YMF9NNTG5gubwJ7ipI5JfxAJKhlD9h2v/"
 
-    ONNX_MODEL_PATH = "https://huggingface.co/onnx-community/Qwen2.5-1.5B-Instruct/resolve/main/onnx/model.onnx_data"
-    SAVE_PATH = "./model.onnx_data"
+    # ONNX_MODEL_PATH = "https://huggingface.co/onnx-community/Qwen2.5-1.5B-Instruct/resolve/main/onnx/model.onnx_data"
+    # SAVE_PATH = "./model.onnx_data"
 
-    print("Downloading ONNX model...")
-    response = requests.get(ONNX_MODEL_PATH, stream=True)
-    response.raise_for_status()
+    # print("Downloading ONNX model...")
+    # response = requests.get(ONNX_MODEL_PATH, stream=True)
+    # response.raise_for_status()
 
-    total_size = int(response.headers.get('content-length', 0))
+    # total_size = int(response.headers.get('content-length', 0))
 
-    with open(SAVE_PATH, 'wb') as file, tqdm(
-        desc=os.path.basename(SAVE_PATH),
-        total=total_size,
-        unit='iB',
-        unit_scale=True
-    ) as pbar:
-        for data in response.iter_content(chunk_size=8192):
-            size = file.write(data)
-            pbar.update(size)
-    print("Downloaded ONNX model!")
+    # with open(SAVE_PATH, 'wb') as file, tqdm(
+    #     desc=os.path.basename(SAVE_PATH),
+    #     total=total_size,
+    #     unit='iB',
+    #     unit_scale=True
+    # ) as pbar:
+    #     for data in response.iter_content(chunk_size=8192):
+    #         size = file.write(data)
+    #         pbar.update(size)
+    # print("Downloaded ONNX model!")
 
     # Download file
     download_pdf_from_gdrive(RESUME_URL, RESUME_PATH)
@@ -190,8 +190,8 @@ if __name__ == "__main__":
 
     # start_time = time.time()
     # Generate LLM answer
-    # generated_text = generate_answer(chat_history)
-    generated_text = onnx_inference(chat_history, rag_query, tokenizer)
+    generated_text = generate_answer(chat_history)
+    # generated_text = onnx_inference(chat_history, rag_query, tokenizer)
 
     # Detect if tool call is requested by LLM. If yes, then
     # execute tool and use else return None
@@ -204,8 +204,8 @@ if __name__ == "__main__":
         chat_history, tool_query, query_results
     )
     # Generate result from the
-    # generated_text = generate_answer(chat_history)
-    generated_text = onnx_inference(chat_history, rag_query, tokenizer)
+    generated_text = generate_answer(chat_history)
+    # generated_text = onnx_inference(chat_history, rag_query, tokenizer)
 
     # metrics = {
     #     "conversation": {
utils.py CHANGED
@@ -177,135 +177,4 @@ def scrape_website(start_url, delay=1):
 
     # Combine all content into a single string
     combined_content = "\n\n".join(all_content)
-    return combined_content
-
-def onnx_inference(chat_history: list, rag_query: str, tokenizer) -> str:
-    """
-    Performs ONNX inference with dynamic input handling, optimized for conciseness.
-    """
-    session = onnxruntime.InferenceSession("model.onnx")
-    model_inputs = session.get_inputs()
-    model_outputs = session.get_outputs()
-
-    # --- Corrected Chat History and Tool Call Format ---
-    # The tool call needs to be *part* of the chat history.
-    chat_history_with_tool = chat_history + [
-        {"role": "user", "content": rag_query, "tools": [{"type": "retrieval"}]},
-    ]
-
-
-    # Use HF tokenizer for input preparation
-    inputs = tokenizer.apply_chat_template(
-        chat_history_with_tool,
-        return_tensors="np",
-        add_generation_prompt=True
-    )
-    input_ids = inputs["input_ids"]  # Corrected: Access input_ids correctly
-    attention_mask = inputs["attention_mask"]
-
-    # Determine required inputs
-    has_position_ids = "position_ids" in (inp.name for inp in model_inputs)
-    has_past_key_values = any("past_key_values" in inp.name for inp in model_inputs)
-
-    # Prepare initial inputs, including past_key_values if needed
-    ort_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
-
-    if has_position_ids:
-        ort_inputs["position_ids"] = np.arange(input_ids.shape[1], dtype=np.int64).reshape(1, -1)
-
-    if has_past_key_values:
-        # Dummy run to get past_key_values shape
-        dummy_inputs = {
-            "input_ids": np.array([[0]]),
-            "attention_mask": np.array([[1]]),
-            "position_ids": np.array([[0]]) if has_position_ids else None  # Add if needed
-        }
-        dummy_inputs = {k: v for k, v in dummy_inputs.items() if v is not None}  # Remove None values
-        sample_outputs = session.run(None, dummy_inputs)
-        pkv_shape = list(sample_outputs[1].shape)
-        pkv_shape[2] = 0  # Set sequence length to 0 for initial state
-        num_pkv = len([inp for inp in model_inputs if "past_key_values" in inp.name])
-        past_key_values = tuple([np.zeros(pkv_shape, dtype=np.float32) for _ in range(num_pkv)])
-
-        # Add initial past_key_values to ort_inputs
-        for i in range(len(past_key_values) // 2):
-            ort_inputs[f"past_key_values.{i}.key"] = past_key_values[i * 2]
-            ort_inputs[f"past_key_values.{i}.value"] = past_key_values[i * 2 + 1]
-
-    generated_ids = []
-    input_length = input_ids.shape[1]
-
-    # Generation loop with dynamic input updates
-    for _ in range(512):  # Max new tokens
-        ort_outputs = session.run(None, ort_inputs)
-        next_token_logits = torch.tensor(ort_outputs[0][:, -1, :])
-        next_token = torch.multinomial(torch.softmax(next_token_logits / 1.0, dim=-1), num_samples=1).numpy()
-
-        generated_ids.append(next_token[0, 0])
-        if next_token[0, 0] == tokenizer.eos_token_id:
-            break
-
-        # Update inputs for next iteration
-        ort_inputs["input_ids"] = next_token
-        ort_inputs["attention_mask"] = np.ones_like(next_token)
-
-        if has_position_ids:
-            ort_inputs["position_ids"] = np.array([[input_length]], dtype=np.int64)
-            input_length += 1
-
-        if has_past_key_values:
-            for i in range(len(ort_outputs) - 1):  # Iterate over model outputs, excluding logits
-                ort_inputs[model_inputs[i + 2].name] = ort_outputs[i + 1]  # Use names for robustness
-
-
-    return tokenizer.decode(generated_ids, skip_special_tokens=True)
-
-# def onnx_inference(chat_history, rag_query, tokenizer):
-#     # Create ONNX Runtime session
-#     session = onnxruntime.InferenceSession("model.onnx")
-
-#     # Tokenize input text
-#     inputs = tokenizer.apply_chat_template(
-#         chat_history,
-#         tools=[rag_query],
-#         return_tensors="np",
-#         return_dict=True,
-#         add_generation_prompt=True,
-#         # padding=True
-#     )
-
-#     # Run inference
-#     ort_inputs = {
-#         "input_ids": inputs["input_ids"],
-#         "attention_mask": inputs["attention_mask"]
-#     }
-
-#     input_length = inputs["input_ids"].shape[1]
-#     max_new_tokens = 512
-
-#     # Run generation
-#     for _ in range(max_new_tokens):
-#         ort_outputs = session.run(None, ort_inputs)
-#         next_token_logits = ort_outputs[0][:, -1, :]
-
-#         # Apply sampling
-#         next_token_logits = torch.tensor(next_token_logits)
-#         probs = torch.nn.functional.softmax(next_token_logits / 1.0, dim=-1)
-#         next_token = torch.multinomial(probs, num_samples=1).numpy()
-
-#         # Append to input
-#         ort_inputs["input_ids"] = np.concatenate([ort_inputs["input_ids"], next_token], axis=1)
-#         ort_inputs["attention_mask"] = np.concatenate([
-#             ort_inputs["attention_mask"],
-#             np.ones_like(next_token)
-#         ], axis=1)
-
-#         # Check for EOS token
-#         if next_token[0, 0] == tokenizer.eos_token_id:
-#             break
-
-#     # Decode only the new tokens
-#     generated_ids = ort_inputs["input_ids"][0, input_length:]
-#     generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
-
-#     return generated_text
+    return combined_content
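
Note: the removed onnx_inference function opens onnxruntime.InferenceSession("model.onnx"), and a model exported with external data expects the companion model.onnx_data file to sit next to the .onnx graph, which is what the download step in app.py provides. A minimal sketch of that loading step, assuming both files are already in the working directory:

import onnxruntime

# Assumption: model.onnx and model.onnx_data both live in the current working directory.
# onnxruntime resolves the external-data file relative to the location of the .onnx graph.
session = onnxruntime.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
print([inp.name for inp in session.get_inputs()])
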