import os
import re
import json
import torch
import spaces
import pymupdf
import gradio as gr
from qdrant_client import QdrantClient
from utils import download_pdf_from_gdrive, merge_strings_with_prefix
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def rag_query(query: str):
    """
    Searches a vector database containing information about a man named
    Suvaditya by performing semantic search over his resume, which holds
    a wealth of information about him, and returns the closest matches
    for a given query.

    Args:
        query: The query against which the search will be run, in the
               form of a single string phrase of no more than 10 words.

    Returns:
        search_results: A list of results that come closest to the given
                        query semantically, as determined by cosine
                        similarity.
    """
    return client.query(
        collection_name="resume",
        query_text=query
    )
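
# Note: with qdrant-client's fastembed integration, client.query returns
# QueryResponse objects whose metadata carries the matched text chunk,
# e.g. (illustrative):
#   rag_query("current employer")[0].metadata["document"]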

def generate_answer(chat_history):
    # Render the chat history, with the rag_query tool schema attached,
    # into model inputs
    tool_prompt = tokenizer.apply_chat_template(
        chat_history,
        tools=[rag_query],
        return_tensors="pt",
        return_dict=True,
        add_generation_prompt=True,
    )
    tool_prompt = tool_prompt.to(model.device)
    out = model.generate(**tool_prompt, max_new_tokens=512)
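    # Keep only the newly generated tokens; `generate` echoes the prompt
    # tokens at the start of `out`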
    generated_text = out[0, tool_prompt['input_ids'].shape[1]:]
    generated_text = tokenizer.decode(generated_text)
    return generated_text

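# Qwen2.5's chat template emits tool calls as XML-wrapped JSON, e.g.
# (illustrative, not verbatim model output):
#   <tool_call>
#   {"name": "rag_query", "arguments": {"query": "bachelor's degree"}}
#   </tool_call>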
def parse_tool_request(tool_call, top_k=5):
    """
    Extracts the <tool_call> JSON payload from the generated text, runs
    the requested query, and returns (top_k results, query), or
    (None, None) if no tool call was present.
    """
    pattern = r"<tool_call>(.*?)</tool_call>"
    match_result = re.search(pattern, tool_call, re.DOTALL)
    if match_result:
        result = match_result.group(1).strip()
    else:
        return None, None

    query = json.loads(result)["arguments"]["query"]
    query_results = [
        query_piece.metadata["document"] for query_piece in rag_query(query)
    ]

    return query_results[:top_k], query

def update_chat_history(chat_history, tool_query, query_results):
    """
    Appends the assistant's tool call and the tool's results to the chat
    history so the model can ground its final answer in them.
    """
    assistant_tool_message = {
        "role": "assistant",
        "metadata": "🛠️ Using Qdrant Engine to search for the query 🛠️",
        "tool_calls": [{
            "type": "function",
            "function": {
                "name": "rag_query",
                "arguments": {"query": f"{tool_query}"}
            }
        }]
    }
    result_tool_message = {
        "role": "tool",
        "name": "rag_query",
        "content": "\n".join(query_results)
    }

    chat_history.append(assistant_tool_message)
    chat_history.append(result_tool_message)
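
    # The history now ends with: user question -> assistant tool call ->
    # tool results, the sequence the chat template expects when rendering
    # tool outputs for the second generation pass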

    return chat_history

if __name__ == "__main__":
    RESUME_PATH = os.path.join(os.getcwd(), "Resume.pdf")
    RESUME_URL = "https://drive.google.com/file/d/1YMF9NNTG5gubwJ7ipI5JfxAJKhlD9h2v/"

    # Download the resume PDF from Google Drive
    download_pdf_from_gdrive(RESUME_URL, RESUME_PATH)

    doc = pymupdf.open(RESUME_PATH)
    fulltext = doc[0].get_text().split("\n")
    fulltext = merge_strings_with_prefix(fulltext)

    # Embed the resume chunks into an in-memory Qdrant collection
    client = QdrantClient(":memory:")

    client.set_model("sentence-transformers/all-MiniLM-L6-v2")

    if not client.collection_exists(collection_name="resume"):
        client.create_collection(
            collection_name="resume",
            vectors_config=client.get_fastembed_vector_params(),
        )

    _ = client.add(
        collection_name="resume",
        documents=fulltext,
        ids=range(len(fulltext)),
        batch_size=100,
        parallel=0,
    )
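
    # Optional sanity check (assumes qdrant-client's fastembed query API):
    # hits = client.query(collection_name="resume", query_text="education")
    # print(hits[0].metadata["document"])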

    # Qwen2.5-Instruct supports tool calling through its chat template

    model_name = "Qwen/Qwen2.5-3B-Instruct"

    @spaces.GPU
    def rag_process(message, chat_history):
        # Append current user message to chat history
        current_message = {
            "role": "user",
            "content": message
        }
        chat_history.append(current_message)

        # Generate LLM answer
        generated_text = generate_answer(chat_history)

        # Detect whether the LLM requested a tool call; if so, execute
        # the tool, otherwise both values come back as None
        query_results, tool_query = parse_tool_request(generated_text)

        # If a tool call was requested
        if query_results is not None and tool_query is not None:
            print("Tool call detected, querying the vector database")
            # Update chat history with the result of the tool call
            chat_history = update_chat_history(
                chat_history, tool_query, query_results
            )
            # Generate the final answer, grounded in the tool results
            generated_text = generate_answer(chat_history)

        # Strip the trailing "<|im_end|>" token (10 characters)
        return generated_text[:-10]

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True)
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    demo = gr.ChatInterface(
        fn=rag_process,
        type="messages",
        title="Resume RAG, a personal space on ZeroGPU!",
        examples=["Where did Suvaditya complete his Bachelor's Degree?", "Where is Suvaditya currently working?"],
        description="Ask any question about Suvaditya's resume and get an answer!",
        theme="ocean"
    )
    demo.launch()