suvadityamuk committed
Commit 66e5432 · 1 Parent(s): c711155

chore: add onnx

Signed-off-by: Suvaditya Mukherjee <[email protected]>

Files changed (4)
  1. app.py +3 -4
  2. model.onnx +3 -0
  3. requirements.txt +2 -1
  4. utils.py +53 -0
app.py CHANGED
@@ -9,7 +9,7 @@ import psutil
 import pymupdf
 import gradio as gr
 from qdrant_client import QdrantClient
-from utils import download_pdf_from_gdrive, merge_strings_with_prefix, scrape_website
+from utils import download_pdf_from_gdrive, merge_strings_with_prefix, onnx_inference
 from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
 
 def rag_query(query: str):
@@ -120,8 +120,6 @@ if __name__ == "__main__":
         vectors_config=client.get_fastembed_vector_params(),
     )
 
-    print("fulltext", fulltext)
-
     _ = client.add(
         collection_name="resume",
         documents=fulltext,
@@ -170,7 +168,8 @@ if __name__ == "__main__":
 
         # start_time = time.time()
         # Generate LLM answer
-        generated_text = generate_answer(chat_history)
+        # generated_text = generate_answer(chat_history)
+        generated_text = onnx_inference(chat_history, rag_query, tokenizer)
 
         # Detect if tool call is requested by LLM. If yes, then
         # execute tool and use else return None
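The tool-call handling referenced in the trailing comment is not part of this hunk. For readers, a hedged sketch of one shape such a check can take with chat-template tool calling; the tag format and function name below are assumptions, not the app's actual code:

import json
import re

def maybe_tool_call(generated_text: str):
    # Hypothetical parser: assumes the model emits a JSON object like
    # {"name": "rag_query", "arguments": {"query": "..."}} inside <tool_call> tags.
    match = re.search(r"<tool_call>(.*?)</tool_call>", generated_text, re.DOTALL)
    if match is None:
        return None  # plain answer, no tool requested
    return json.loads(match.group(1))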
model.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f7da5d0bb5e5b6eba0ba0e9c006fcbc8b670134405ef5e02aaaf738361e2074
+size 1057843
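The model is stored as a Git LFS pointer, so the diff shows only the pointer, not how the graph was produced. As a sketch of one common export route for a Hugging Face causal LM, using optimum's ONNX wrapper (the checkpoint name is a placeholder; the commit does not show the actual export method):

# Hypothetical export sketch -- not the commit's actual export script.
from optimum.onnxruntime import ORTModelForCausalLM

model_id = "org/checkpoint-name"  # placeholder; the real model is not named in the diff
ort_model = ORTModelForCausalLM.from_pretrained(model_id, export=True)
ort_model.save_pretrained("onnx_dir/")  # writes model.onnx alongside the config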
requirements.txt CHANGED
@@ -14,4 +14,5 @@ psutil
 optimum-quanto
 pynvml
 beautifulsoup4
-requests
+requests
+onnxruntime
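One caveat on the new dependency: the plain onnxruntime wheel ships CPU support only, so the CUDAExecutionProvider requested in utils.py will silently fall back to CPU unless onnxruntime-gpu is installed instead. A quick runtime check:

import onnxruntime

# The stock "onnxruntime" package typically reports only CPUExecutionProvider;
# CUDAExecutionProvider appears when onnxruntime-gpu (plus CUDA) is installed.
print(onnxruntime.get_available_providers())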
utils.py CHANGED
@@ -1,5 +1,8 @@
 import gdown
 import os
+import numpy as np
+import torch
+import onnxruntime
 from urllib.parse import urlparse, parse_qs, urljoin
 import requests
 from bs4 import BeautifulSoup
@@ -175,3 +178,53 @@ def scrape_website(start_url, delay=1):
     # Combine all content into a single string
     combined_content = "\n\n".join(all_content)
     return combined_content
+
+def onnx_inference(chat_history, rag_query, tokenizer):
+    # Create an ONNX Runtime session; CUDA is tried first, with CPU as fallback
+    session = onnxruntime.InferenceSession("model.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
+
+    # Render the chat history (plus the rag_query tool schema) into token IDs
+    inputs = tokenizer.apply_chat_template(
+        chat_history,
+        tools=[rag_query],
+        return_tensors="np",
+        return_dict=True,
+        add_generation_prompt=True,
+        # padding=True
+    )
+
+    # Prepare the ONNX Runtime input feed
+    ort_inputs = {
+        "input_ids": inputs["input_ids"],
+        "attention_mask": inputs["attention_mask"]
+    }
+
+    input_length = inputs["input_ids"].shape[1]
+    max_new_tokens = 512
+
+    # Autoregressive generation loop (re-runs the full sequence each step; no KV cache)
+    for _ in range(max_new_tokens):
+        ort_outputs = session.run(None, ort_inputs)
+        next_token_logits = ort_outputs[0][:, -1, :]
+
+        # Sample the next token (temperature 1.0)
+        next_token_logits = torch.tensor(next_token_logits)
+        probs = torch.nn.functional.softmax(next_token_logits / 1.0, dim=-1)
+        next_token = torch.multinomial(probs, num_samples=1).numpy()
+
+        # Append the sampled token to the running input
+        ort_inputs["input_ids"] = np.concatenate([ort_inputs["input_ids"], next_token], axis=1)
+        ort_inputs["attention_mask"] = np.concatenate([
+            ort_inputs["attention_mask"],
+            np.ones_like(next_token)
+        ], axis=1)
+
+        # Stop at the end-of-sequence token
+        if next_token[0, 0] == tokenizer.eos_token_id:
+            break
+
+    # Decode only the newly generated tokens
+    generated_ids = ort_inputs["input_ids"][0, input_length:]
+    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
+
+    return generated_text
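For reference, a minimal usage sketch of the new helper, assuming a tokenizer that matches the exported model (the checkpoint name and messages below are placeholders) and the list-of-dicts message format that apply_chat_template expects:

from transformers import AutoTokenizer
from utils import onnx_inference

def rag_query(query: str):
    """Retrieve resume passages relevant to `query`."""  # schema only; the real body lives in app.py
    ...

tokenizer = AutoTokenizer.from_pretrained("org/checkpoint-name")  # placeholder checkpoint
chat_history = [{"role": "user", "content": "Summarize the candidate's experience."}]
print(onnx_inference(chat_history, rag_query, tokenizer))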