Spaces:

JaynilJaiswal
/

test

Runtime error

App Files Files Community

JaynilJaiswal committed on May 1

Commit

5073484

verified ·

1 Parent(s): 350b937

Create app.py

Browse files

Files changed (1) hide show

app.py +520 -0

app.py ADDED Viewed

	@@ -0,0 +1,520 @@

+# %%capture
+# # Run this cell in your local environment to install necessary packages
+# # Added chromadb, removed scikit-learn (numpy might still be needed by other libs)
+# !pip install gradio langchain langchain-community sentence-transformers ctransformers torch accelerate bitsandbytes chromadb transformers[sentencepiece]
+import gradio as gr
+from langchain_community.vectorstores import Chroma # ADDED
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.llms import CTransformers
+from langchain.schema import Document
+from langchain.prompts import PromptTemplate
+import json
+import os
+# REMOVED: import numpy as np
+import re
+# REMOVED: from sklearn.metrics.pairwise import cosine_similarity
+import chromadb # ADDED for client check
+from typing import List, Dict, Any, Optional
+# --- Load Structured Resume Data ---
+# The resume JSON is the only input for every document indexed further down,
+# so each failure path aborts start-up. Failures end the process with exit
+# status 1 (raise SystemExit(1)) instead of the site-provided exit() helper,
+# which exits with status 0 and is intended for interactive sessions only.
+resume_filename = "resume_corrected.json" # Using the revamped JSON
+resume_data = {}
+try:
+    with open(resume_filename, 'r', encoding='utf-8') as f:
+        resume_data = json.load(f)
+    print(f"Loaded structured resume data from {resume_filename}")
+    # The section lookups below all use dict.get(), so anything other than a
+    # top-level JSON object is treated as "no data".
+    if not isinstance(resume_data, dict):
+        print(f"Error: Content of {resume_filename} is not a dictionary.")
+        resume_data = {}
+except FileNotFoundError:
+    print(f"Error: Resume data file '{resume_filename}' not found.")
+    print("Ensure the revamped JSON file is present.")
+    raise SystemExit(1)
+except json.JSONDecodeError as e:
+    print(f"Error decoding JSON from {resume_filename}: {e}")
+    raise SystemExit(1)
+except Exception as e:
+    # Last-resort boundary (e.g. permission or encoding errors): report and stop.
+    print(f"An unexpected error occurred loading resume data: {e}")
+    raise SystemExit(1)
+# Covers both an empty JSON object and the non-dict case reset above.
+if not resume_data:
+    print("Error: No resume data loaded. Exiting.")
+    raise SystemExit(1)
+# --- Helper Function to Sanitize Metadata ---
+def sanitize_metadata(metadata_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Return a copy of ``metadata_dict`` holding only ChromaDB-compatible scalars.
+
+    Keys are converted to ``str``. Values are mapped as follows:
+
+    * ``str``, ``int``, ``float`` and ``bool`` are kept unchanged;
+    * lists and tuples are joined with ``"; "`` in their original order;
+    * sets and frozensets are joined with ``"; "`` after sorting their string
+      forms, so the result does not depend on set iteration order;
+    * ``None`` becomes ``"N/A"``;
+    * anything else is replaced by ``str(value)``.
+
+    Args:
+        metadata_dict: Raw metadata mapping. Any non-dict input yields ``{}``.
+
+    Returns:
+        A new dict; the input mapping is not modified.
+    """
+    if not isinstance(metadata_dict, dict):
+        return {}  # Nothing sensible to sanitize
+    sanitized: Dict[str, Any] = {}
+    for key, value in metadata_dict.items():
+        key_str = str(key)
+        if isinstance(value, (str, int, float, bool)):
+            sanitized[key_str] = value
+        elif isinstance(value, (list, tuple)):
+            sanitized[key_str] = "; ".join(map(str, value))
+        elif isinstance(value, (set, frozenset)):
+            # Sets have no stable iteration order; sort so the stored metadata
+            # is identical from one run to the next.
+            sanitized[key_str] = "; ".join(sorted(map(str, value)))
+        elif value is None:
+            sanitized[key_str] = "N/A"
+        else:
+            sanitized[key_str] = str(value)
+    return sanitized
+# --- Create Granular LangChain Documents from Structured Data ---
+# Every resume section below appends Document objects to structured_docs.
+# doc_id_counter supplies the "source_doc_id" metadata value; it is stored as a
+# string and advanced once per resume entry.
+structured_docs = []
+doc_id_counter = 0
+print("Processing structured data into granular documents...")
+# CONTACT INFO: a single document summarising all contact fields. Missing
+# fields are rendered as 'N/A'; the section is skipped when absent or empty.
+contact_info = resume_data.get("CONTACT INFO", {})
+if contact_info:
+    contact_text = f"Contact Info: Phone: {contact_info.get('phone', 'N/A')}, Location: {contact_info.get('location', 'N/A')}, Email: {contact_info.get('email', 'N/A')}, GitHub: {contact_info.get('github_user', 'N/A')}, LinkedIn: {contact_info.get('linkedin_user', 'N/A')}"
+    metadata = {"category": "CONTACT INFO", "source_doc_id": str(doc_id_counter)} # Ensure ID is string
+    structured_docs.append(Document(page_content=contact_text, metadata=metadata))
+    doc_id_counter += 1
+# EDUCATION: one document per entry, with the entry's fields mirrored into the
+# metadata.
+education_list = resume_data.get("EDUCATION", [])
+for i, entry in enumerate(education_list):
+    edu_text = f"Education: {entry.get('degree', '')} in {entry.get('major', '')} from {entry.get('institution', '')} ({entry.get('dates', '')})."
+    metadata = {
+        "category": "EDUCATION",
+        "institution": entry.get('institution', 'N/A'), # Ensure N/A or actual string
+        "degree": entry.get('degree', 'N/A'),
+        "major": entry.get('major', 'N/A'),
+        "dates": entry.get('dates', 'N/A'),
+        "item_index": i,
+        "source_doc_id": str(doc_id_counter) # Ensure ID is string
+    }
+    # Use the shared helper, as the later sections do, so that an explicit
+    # JSON null is stored as "N/A" (not the string "None") and list values are
+    # joined with "; " rather than stored as a Python repr.
+    metadata = sanitize_metadata(metadata)
+    structured_docs.append(Document(page_content=edu_text.strip(), metadata=metadata))
+    doc_id_counter += 1
+# TECHNICAL STRENGTHS: one document per sub-category whose value is a
+# non-empty list; other sub-categories are skipped.
+# NOTE(review): .items() assumes this section is a JSON object - confirm
+# against the resume schema.
+tech_strengths = resume_data.get("TECHNICAL STRENGTHS", {})
+for sub_category, skills in tech_strengths.items():
+    if isinstance(skills, list) and skills:
+        # NOTE(review): ', '.join assumes every skill is a string - confirm.
+        skills_text = f"Technical Strengths - {sub_category}: {', '.join(skills)}"
+        metadata = {"category": "TECHNICAL STRENGTHS", "sub_category": sub_category, "source_doc_id": str(doc_id_counter)}
+        # Stringify any value that is not already a str/int/float/bool.
+        metadata = {k: (v if isinstance(v, (str, int, float, bool)) else str(v)) for k, v in metadata.items()}
+        structured_docs.append(Document(page_content=skills_text, metadata=metadata))
+        doc_id_counter += 1
+# Process WORK EXPERIENCE (Using relevant_skills)
+# Each entry yields one header document (point_index -1) and, when the entry
+# has description points, one extra document per point. The header and all of
+# its point documents share the same source_doc_id, which is advanced once
+# per entry.
+work_list = resume_data.get("WORK EXPERIENCE", [])
+for i, entry in enumerate(work_list):
+    title = entry.get('title', 'N/A')
+    org = entry.get('organization', 'N/A')
+    dates = entry.get('dates', 'N/A')
+    points = entry.get('description_points', [])
+    # Skills come pre-associated with the entry in the JSON; they are stored
+    # as a single "; "-separated string, or "N/A" when none are listed.
+    skills_list = entry.get('relevant_skills', []) # Get pre-associated skills
+    skills_str = "; ".join(skills_list) if skills_list else "N/A"
+    # Header text; it is also prepended to every point document below.
+    entry_context = f"Work Experience: {title} at {org} ({dates})"
+    if not points:
+        # No description points: the header document is the only one emitted.
+        base_metadata = {
+            "category": "WORK EXPERIENCE", "title": title, "organization": org,
+            "dates": dates, "item_index": i, "point_index": -1,
+            "source_doc_id": str(doc_id_counter),
+            "skills": skills_str # Skills inherited from the entry
+        }
+        structured_docs.append(Document(page_content=entry_context, metadata=sanitize_metadata(base_metadata)))
+        doc_id_counter += 1
+    else:
+        # Create one doc for the header/context info
+        base_metadata = {
+            "category": "WORK EXPERIENCE", "title": title, "organization": org,
+            "dates": dates, "item_index": i, "point_index": -1, # Indicate context doc
+            "source_doc_id": str(doc_id_counter),
+            "skills": skills_str # Skills inherited from the entry
+        }
+        structured_docs.append(Document(page_content=entry_context, metadata=sanitize_metadata(base_metadata)))
+        # Create separate docs for each point, inheriting skills
+        # NOTE(review): point.strip() assumes every point is a string - confirm.
+        for j, point in enumerate(points):
+            point_text = f"{entry_context}:\n- {point.strip()}"
+            point_metadata = {
+                "category": "WORK EXPERIENCE", "title": title, "organization": org,
+                "dates": dates, "item_index": i, "point_index": j,
+                "source_doc_id": str(doc_id_counter), # Link back to original entry ID
+                "skills": skills_str # Skills inherited from the entry
+            }
+            structured_docs.append(Document(page_content=point_text, metadata=sanitize_metadata(point_metadata)))
+        doc_id_counter += 1 # Increment ID only once per WORK EXPERIENCE entry
+# Process PROJECTS (Using technologies field, mapping to 'skills' metadata key)
+# Same header-plus-points layout as the other list sections: one header
+# document (point_index -1) per project and one document per description
+# point, all sharing the project's source_doc_id.
+project_list = resume_data.get("PROJECTS", [])
+for i, entry in enumerate(project_list):
+    name = entry.get('name', 'Unnamed Project')
+    # Projects list their stack under 'technologies' in the JSON; it is stored
+    # under the 'skills' metadata key so all categories share one key name.
+    skills_list = entry.get('technologies', [])
+    skills_str = "; ".join(skills_list) if skills_list else "N/A"
+    points = entry.get('description_points', [])
+    # Include skills string in context text as well for embedding
+    # (skills_str is already "N/A" when the list is empty, so the conditional
+    # inside the f-string yields the same text either way).
+    entry_context = f"Project: {name} (Skills: {skills_str if skills_list else 'N/A'})"
+    if not points:
+        # No description points: the header document is the only one emitted.
+        base_metadata = {
+            "category": "PROJECTS", "name": name,
+            "item_index": i, "point_index": -1,
+            "source_doc_id": str(doc_id_counter),
+            "skills": skills_str # From the entry's 'technologies'
+        }
+        structured_docs.append(Document(page_content=entry_context, metadata=sanitize_metadata(base_metadata)))
+        doc_id_counter += 1
+    else:
+        # Create one doc for the header/context info
+        base_metadata = {
+            "category": "PROJECTS", "name": name,
+            "item_index": i, "point_index": -1, # Indicate context doc
+            "source_doc_id": str(doc_id_counter),
+            "skills": skills_str # From the entry's 'technologies'
+        }
+        structured_docs.append(Document(page_content=entry_context, metadata=sanitize_metadata(base_metadata)))
+        # Create separate docs for each point, inheriting skills
+        # NOTE(review): point.strip() assumes every point is a string - confirm.
+        for j, point in enumerate(points):
+            point_text = f"{entry_context}:\n- {point.strip()}"
+            point_metadata = {
+                "category": "PROJECTS", "name": name,
+                "item_index": i, "point_index": j,
+                "source_doc_id": str(doc_id_counter),
+                "skills": skills_str # From the entry's 'technologies'
+            }
+            structured_docs.append(Document(page_content=point_text, metadata=sanitize_metadata(point_metadata)))
+        doc_id_counter += 1 # Increment ID only once per PROJECT entry
+# Process ONLINE CERTIFICATIONS (Using relevant_skills)
+# One header document (point_index -1) per certification plus one document
+# per retained description point, all sharing the entry's source_doc_id.
+cert_list = resume_data.get("ONLINE CERTIFICATIONS", [])
+for i, entry in enumerate(cert_list):
+    name = entry.get('name', 'N/A')
+    issuer = entry.get('issuer', 'N/A')
+    date = entry.get('date', 'N/A')
+    points = entry.get('description_points', [])
+    # Skills are stored as a single "; "-separated string, or "N/A" when none
+    # are listed.
+    skills_list = entry.get('relevant_skills', []) # Get pre-associated skills
+    skills_str = "; ".join(skills_list) if skills_list else "N/A"
+    # Header text; it is also prepended to every point document below.
+    entry_context = f"Certification: {name} from {issuer} ({date})"
+    if not points:
+        # No description points: the header document is the only one emitted.
+        base_metadata = {
+            "category": "ONLINE CERTIFICATIONS", "name": name, "issuer": issuer,
+            "date": date, "item_index": i, "point_index": -1,
+            "source_doc_id": str(doc_id_counter),
+            "skills": skills_str # Skills inherited from the entry
+        }
+        structured_docs.append(Document(page_content=entry_context, metadata=sanitize_metadata(base_metadata)))
+        doc_id_counter += 1
+    else:
+        # Create one doc for the header/context info
+        base_metadata = {
+            "category": "ONLINE CERTIFICATIONS", "name": name, "issuer": issuer,
+            "date": date, "item_index": i, "point_index": -1, # Indicate context doc
+            "source_doc_id": str(doc_id_counter),
+            "skills": skills_str # Skills inherited from the entry
+        }
+        structured_docs.append(Document(page_content=entry_context, metadata=sanitize_metadata(base_metadata)))
+        # Create separate docs for each point, inheriting skills
+        for j, point in enumerate(points):
+            # Points ending in ':' are skipped (presumably sub-headings with no
+            # content of their own). point_index keeps the original position,
+            # so skipped points leave gaps in the numbering.
+            if point.strip().endswith(':'): continue
+            # Drop any leading en dashes, hyphens and spaces from the source
+            # text before adding the uniform "- " bullet.
+            point_text = f"{entry_context}:\n- {point.strip().lstrip('–- ')}"
+            point_metadata = {
+                "category": "ONLINE CERTIFICATIONS", "name": name, "issuer": issuer,
+                "date": date, "item_index": i, "point_index": j,
+                "source_doc_id": str(doc_id_counter),
+                "skills": skills_str # Skills inherited from the entry
+            }
+            structured_docs.append(Document(page_content=point_text, metadata=sanitize_metadata(point_metadata)))
+        doc_id_counter += 1 # Increment ID only once per CERTIFICATION entry
+# Process COURSES (Using relevant_skills)
+# One header document (point_index -1) per course plus one document per
+# description point, all sharing the course's source_doc_id.
+course_list = resume_data.get("COURSES", [])
+for i, entry in enumerate(course_list):
+    # 'code' defaults to an empty string, unlike the other fields ('N/A').
+    code = entry.get('code', '')
+    name = entry.get('name', 'N/A')
+    inst = entry.get('institution', 'N/A')
+    term = entry.get('term', 'N/A')
+    points = entry.get('description_points', [])
+    # Skills are stored as a single "; "-separated string, or "N/A" when none
+    # are listed.
+    skills_list = entry.get('relevant_skills', []) # Get pre-associated skills
+    skills_str = "; ".join(skills_list) if skills_list else "N/A"
+    # Header text; it is also prepended to every point document below.
+    entry_context = f"Course: {code}: {name} at {inst} ({term})"
+    if not points:
+        # No description points: the header document is the only one emitted.
+        base_metadata = {
+            "category": "COURSES", "code": code, "name": name, "institution": inst,
+            "term": term, "item_index": i, "point_index": -1,
+            "source_doc_id": str(doc_id_counter),
+            "skills": skills_str # Skills inherited from the entry
+        }
+        structured_docs.append(Document(page_content=entry_context, metadata=sanitize_metadata(base_metadata)))
+        doc_id_counter += 1
+    else:
+        # Create one doc for the header/context info
+        base_metadata = {
+            "category": "COURSES", "code": code, "name": name, "institution": inst,
+            "term": term, "item_index": i, "point_index": -1, # Indicate context doc
+            "source_doc_id": str(doc_id_counter),
+            "skills": skills_str # Skills inherited from the entry
+        }
+        structured_docs.append(Document(page_content=entry_context, metadata=sanitize_metadata(base_metadata)))
+        # Create separate docs for each point, inheriting skills
+        # NOTE(review): point.strip() assumes every point is a string - confirm.
+        for j, point in enumerate(points):
+            point_text = f"{entry_context}:\n- {point.strip()}"
+            point_metadata = {
+                "category": "COURSES", "code": code, "name": name, "institution": inst,
+                "term": term, "item_index": i, "point_index": j,
+                "source_doc_id": str(doc_id_counter),
+                "skills": skills_str # Skills inherited from the entry
+            }
+            structured_docs.append(Document(page_content=point_text, metadata=sanitize_metadata(point_metadata)))
+        doc_id_counter += 1 # Increment ID only once per COURSE entry
+# Process EXTRACURRICULAR ACTIVITIES (No skills assumed here)
+# One header document (point_index -1) per organization plus one document per
+# description point, all sharing the entry's source_doc_id. No "skills"
+# metadata key is written for this category.
+extra_list = resume_data.get("EXTRACURRICULAR ACTIVITIES", [])
+for i, entry in enumerate(extra_list):
+    org = entry.get('organization', 'N/A')
+    points = entry.get('description_points', [])
+    # Header text; it is also prepended to every point document below.
+    entry_context = f"Extracurricular: {org}"
+    if not points:
+        # No description points: the header document is the only one emitted.
+        metadata = {
+            "category": "EXTRACURRICULAR ACTIVITIES", "organization": org,
+            "item_index": i, "point_index": -1,
+            "source_doc_id": str(doc_id_counter)
+        }
+        structured_docs.append(Document(page_content=entry_context, metadata=sanitize_metadata(metadata)))
+        doc_id_counter += 1
+    else:
+        # Create one doc for the header/context info
+        base_metadata = {
+            "category": "EXTRACURRICULAR ACTIVITIES", "organization": org,
+            "item_index": i, "point_index": -1, # Indicate context doc
+            "source_doc_id": str(doc_id_counter)
+        }
+        structured_docs.append(Document(page_content=entry_context, metadata=sanitize_metadata(base_metadata)))
+        # Create separate docs for each point
+        # NOTE(review): point.strip() assumes every point is a string - confirm.
+        for j, point in enumerate(points):
+            point_text = f"{entry_context}:\n- {point.strip()}"
+            point_metadata = {
+                "category": "EXTRACURRICULAR ACTIVITIES", "organization": org,
+                "item_index": i, "point_index": j,
+                "source_doc_id": str(doc_id_counter)
+            }
+            structured_docs.append(Document(page_content=point_text, metadata=sanitize_metadata(point_metadata)))
+        doc_id_counter += 1 # Increment ID only once per EXTRACURRICULAR entry
+# Abort when none of the known sections produced a document. Exit with
+# status 1 so the failure is visible to whatever started the process; the
+# site-provided exit() helper would report success (status 0).
+if not structured_docs:
+    print("Error: Failed to create any documents from the resume data. Check processing logic.")
+    raise SystemExit(1)
+print(f"Created {len(structured_docs)} granular Document objects.")
+# Print the first document as a quick sanity check of the generated format.
+print("\nSample Document:")
+print(structured_docs[0])
+# --- Embeddings Model ---
+# A single embeddings object is created here and handed to the Chroma vector
+# store below, both when populating a new collection and when reopening an
+# existing one.
+print("Initializing embeddings model...")
+embeddings_model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
+embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+print(f"Embeddings model '{embeddings_model_name}' initialized.")
+# --- ChromaDB Vector Store Setup ---
+# Probe the persistent store to decide whether the collection must be
+# (re)built. Outcome: collection_exists is True only when the named collection
+# is present AND holds at least one record.
+# NOTE(review): a non-empty collection is reused without comparing it to the
+# current structured_docs, so edits to the resume JSON do not appear to be
+# re-indexed until the persisted collection is removed - confirm this is
+# intended.
+CHROMA_PERSIST_DIR = "/data/cv_chroma_db_structured" # Use a different dir if needed
+CHROMA_COLLECTION_NAME = "cv_structured_collection"
+print(f"Connecting to ChromaDB client at '{CHROMA_PERSIST_DIR}'...")
+client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIR)
+vectorstore = None
+collection_exists = False
+collection_count = 0
+try:
+    # NOTE(review): relies on list_collections() returning objects with a
+    # .name attribute - confirm against the installed chromadb version. If it
+    # raises, the except branch below falls back to "create".
+    existing_collections = [col.name for col in client.list_collections()]
+    if CHROMA_COLLECTION_NAME in existing_collections:
+        collection = client.get_collection(name=CHROMA_COLLECTION_NAME)
+        collection_count = collection.count()
+        if collection_count > 0:
+             collection_exists = True
+             print(f"Collection '{CHROMA_COLLECTION_NAME}' already exists with {collection_count} documents.")
+        else:
+             # An empty collection is dropped so the populate step starts clean;
+             # failing to drop it is only a warning.
+             print(f"Collection '{CHROMA_COLLECTION_NAME}' exists but is empty. Will attempt to create/populate.")
+             collection_exists = False
+             try:
+                 client.delete_collection(name=CHROMA_COLLECTION_NAME)
+                 print(f"Deleted empty collection '{CHROMA_COLLECTION_NAME}'.")
+             except Exception as delete_e:
+                 print(f"Warning: Could not delete potentially empty collection '{CHROMA_COLLECTION_NAME}': {delete_e}")
+    else: print(f"Collection '{CHROMA_COLLECTION_NAME}' does not exist. Will create.")
+except Exception as e:
+    # Best effort: any probing error is treated as "collection missing".
+    print(f"Error checking/preparing ChromaDB collection: {e}. Assuming need to create.")
+    collection_exists = False
+# Populate Vector Store ONLY IF NEEDED
+# Either build the collection from structured_docs or reopen the persisted
+# one. Every failure path ends the process with exit status 1 (SystemExit)
+# rather than the site-provided exit() helper, which exits with status 0.
+if not collection_exists:
+    print("\nPopulating ChromaDB vector store (this may take a moment)...")
+    if not structured_docs:
+        print("Error: No documents to add to vector store.")
+        raise SystemExit(1)
+    try:
+        vectorstore = Chroma.from_documents(
+            documents=structured_docs,
+            embedding=embeddings, # Use the initialized embeddings function
+            collection_name=CHROMA_COLLECTION_NAME,
+            persist_directory=CHROMA_PERSIST_DIR
+        )
+        vectorstore.persist()
+        print("Vector store populated and persisted.")
+    except Exception as e:
+        print(f"\n--- Error during ChromaDB storage: {e} ---")
+        print("Check metadata types (should be str, int, float, bool).")
+        raise SystemExit(1)
+else: # Load existing store
+    print(f"\nLoading existing vector store from '{CHROMA_PERSIST_DIR}'...")
+    try:
+        vectorstore = Chroma(
+            persist_directory=CHROMA_PERSIST_DIR,
+            embedding_function=embeddings,
+            collection_name=CHROMA_COLLECTION_NAME
+        )
+        print("Existing vector store loaded successfully.")
+    except Exception as e:
+        print(f"\n--- Error loading existing ChromaDB store: {e} ---")
+        raise SystemExit(1)
+# Final safety net in case neither branch produced a usable store.
+if not vectorstore:
+    print("Error: Vector store could not be loaded or created. Exiting.")
+    raise SystemExit(1)
+# --- Load CTransformers model ---
+# Loads a local GGUF model through CTransformers. On a missing file or a load
+# error, llm stays None and the script continues; answer_resume_question()
+# checks for that and returns the retrieved context instead of a generated
+# answer.
+# NOTE(review): the log messages say "Fine-Tuned", but the path points at a
+# file named zephyr-7b-beta.Q4_K_M.gguf - confirm which model is intended.
+model_path_gguf = "/data/zephyr-7b-beta.Q4_K_M.gguf" # MAKE SURE THIS PATH IS CORRECT
+print(f"Initializing Fine-Tuned CTransformers LLM from: {model_path_gguf}")
+# Generation settings passed straight to CTransformers. gpu_layers=0 requests
+# no GPU offload; the remaining keys are sampling/length options.
+config = {
+    'max_new_tokens': 512, 'temperature': 0.1, 'context_length': 2048,
+    'gpu_layers': 0, 'stream': False, 'threads': -1, 'top_k': 40,
+    'top_p': 0.9, 'repetition_penalty': 1.1
+    }
+llm = None
+if not os.path.exists(model_path_gguf):
+     print(f"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+     print(f"ERROR: GGUF Model file not found at: {model_path_gguf}")
+     print(f"Please download the model and place it at the correct path, or update model_path_gguf.")
+     print(f"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+     print("LLM initialization skipped.")
+else:
+    try:
+        # NOTE(review): model_type='llama' is used for a Zephyr model file -
+        # confirm this is the type ctransformers expects for this GGUF.
+        llm = CTransformers(model=model_path_gguf, model_type='llama', config=config)
+        print("Fine-Tuned CTransformers LLM initialized.")
+    except Exception as e:
+        print(f"Error initializing CTransformers: {e}")
+        print("LLM initialization failed.")
+        # Deliberately not exiting here: llm remains None and start-up continues.
+        # exit()
+# --- RAG Setup ---
+def format_docs(docs):
+    """Join the text of the retrieved documents into one context string.
+
+    Items of ``docs`` that are not ``Document`` instances are ignored. The
+    remaining ``page_content`` values are concatenated in order, separated by
+    a blank line.
+    """
+    contents = []
+    for candidate in docs:
+        if not isinstance(candidate, Document):
+            continue
+        contents.append(candidate.page_content)
+    return "\n\n".join(contents)
+# --- RAG Function using ChromaDB ---
+def answer_resume_question(user_question):
+    """Answer a question about the CV using retrieval-augmented generation.
+
+    Steps:
+      1. Reject empty or non-string input without touching the vector store.
+      2. Retrieve the ``k_limit`` most similar documents from ``vectorstore``.
+      3. If no LLM is loaded, return the retrieved context verbatim.
+      4. Otherwise prompt the LLM with that context and return its answer.
+
+    Args:
+        user_question: The question text as entered in the UI.
+
+    Returns:
+        A user-facing string: the generated answer, the raw context when no
+        LLM is available, or a short explanatory/error message. Exceptions
+        raised during retrieval or generation are logged and converted into
+        an apology message rather than propagated to the UI.
+    """
+    k_limit = 5  # Number of documents to retrieve
+    preview_chars = 200  # Max characters of each document echoed to the log
+    print(f"\nReceived question: {user_question}")
+    # Guard first: an empty query would otherwise be embedded and sent to the
+    # LLM, and a non-string would fail inside the retrieval call.
+    if not isinstance(user_question, str) or not user_question.strip():
+        return "Please enter a question about the CV."
+    if not vectorstore:
+        return "Error: Vector store is not available."
+    print(f"Performing similarity search (top {k_limit})...")
+    try:
+        # 1. Retrieve documents using ChromaDB similarity search
+        retrieved_docs = vectorstore.similarity_search(user_question, k=k_limit)
+        if not retrieved_docs:
+            print("No relevant documents found via similarity search.")
+            return "I couldn't find relevant information in the CV for your query."
+        print(f"Retrieved {len(retrieved_docs)} documents.")
+        # Log a one-line preview of each hit. Whitespace, including the "\n"
+        # embedded in point documents, is collapsed regardless of platform, and
+        # "..." is appended only when the text was actually cut.
+        for i, doc in enumerate(retrieved_docs):
+            preview = " ".join(doc.page_content.split())
+            if len(preview) > preview_chars:
+                preview = preview[:preview_chars] + "..."
+            print(f"  -> Top {i+1} Doc (Cat: {doc.metadata.get('category')}, SrcID: {doc.metadata.get('source_doc_id')}) Content: {preview}")
+        # 2. Combine content
+        combined_context = format_docs(retrieved_docs)
+        # 3. Without an LLM, fall back to returning the retrieved context
+        if not llm:
+            return "LLM is not available, cannot generate a final answer. Relevant context found:\n\n" + combined_context
+        # 4. Final Answer Generation Step
+        qa_template = """
+Based *only* on the following context from Jaynil Jaiswal's CV, provide a detailed and comprehensive answer to the question.
+If the context does not contain the information needed to answer the question fully, please state that clearly using phrases like 'Based on the context provided, I cannot answer...' or 'The provided context does not contain information about...'.
+Do not make up any information or provide generic non-answers. You are free to selectively use sources from the context to answer the question.
+Context:
+{context}
+Question: {question}
+Answer:"""
+        qa_prompt = PromptTemplate.from_template(qa_template)
+        formatted_qa_prompt = qa_prompt.format(context=combined_context, question=user_question)
+        print("Generating final answer...")
+        answer = llm.invoke(formatted_qa_prompt).strip()
+        print(f"LLM Response: {answer}")
+        return answer
+    except Exception as e:
+        # UI boundary: log the failure and return a friendly message instead
+        # of surfacing a traceback in the Gradio output box.
+        print(f"Error during RAG execution: {e}")
+        return "Sorry, I encountered an error while processing your question."
+# --- End Modification ---
+# --- Gradio Interface ---
+# Single-textbox UI: the question typed by the user is passed to
+# answer_resume_question() and its returned string is shown in the answer box.
+# Flagging is disabled.
+iface = gr.Interface(
+    fn=answer_resume_question,
+    inputs=gr.Textbox(label="💬 Ask about my CV", placeholder="E.g. What was done at Oracle? List my projects.", lines=2),
+    outputs=gr.Textbox(label="💡 Answer", lines=8),
+    title="📚 CV RAG Chatbot (ChromaDB + Granular Docs)",
+    description="Ask questions about the CV! (Uses local GGUF model via CTransformers)",
+    theme="soft",
+    allow_flagging="never"
+)
+# --- Run Gradio ---
+# The interface is served only when both the vector store and the LLM are
+# available. When either is missing the process now ends with exit status 1,
+# so a failed start is not reported as a clean exit.
+# NOTE(review): answer_resume_question() returns the retrieved context when
+# llm is None, but this gate never launches without an LLM, so that fallback
+# is unreachable from the UI - consider launching in retrieval-only mode.
+if __name__ == "__main__":
+    print("Launching Gradio interface...")
+    if not vectorstore:
+        print("Could not launch: Vector store failed to load.")
+        raise SystemExit(1)
+    if not llm:
+        print("Could not launch: LLM failed to load. Check model path and dependencies.")
+        raise SystemExit(1)
+    # Bind to all interfaces on port 7860.
+    iface.launch(server_name="0.0.0.0", server_port=7860)