import streamlit as st
import os
import tempfile
from pathlib import Path
import time
from typing import List, Dict, Tuple
import pandas as pd
from streamlit.runtime.uploaded_file_manager import UploadedFile
from anthropic import Anthropic
import pymongo
from dotenv import load_dotenv
import fitz  # PyMuPDF
import voyageai
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from pinecone import Index

# Load environment variables
load_dotenv()

# VoyageAI constants
VOYAGEAI_BATCH_SIZE = 128
VOYAGEAI_VECTOR_DIM = 512

# Pinecone index name
PINECONE_ID = "intratalent-v2"

# Initialize MongoDB client
MONGO_URI = os.getenv('MONGO_URI')
mongo_client = pymongo.MongoClient(MONGO_URI)
db = mongo_client['intratalent']
resume_collection = db['resumes']

# Initialize Anthropic client
anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))

# Initialize Streamlit app
st.set_page_config(
    page_title="IntraTalent Resume Processor",
    page_icon="📄",
    layout="wide"
)


def extract_text_from_pdf(pdf_content: bytes) -> str:
    """Extract text from PDF content."""
    try:
        # Create a temporary file to store the PDF content
        with tempfile.NamedTemporaryFile(mode='w+b', suffix='.pdf', delete=False) as temp_file:
            temp_file.write(pdf_content)
            temp_file_path = temp_file.name

        # Extract text from PDF
        doc = fitz.open(temp_file_path)
        text = ""
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            text += page.get_text() + "\n"
        doc.close()

        # Clean up temporary file
        os.unlink(temp_file_path)

        return text
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""


def extract_info_with_claude(resume_text: str) -> str:
    """Extract information from resume text using Claude."""
    st.write("🤖 Sending request to Claude API...")
    prompt = """
    Extract the following information from the given resume:
    1. Full Name
    2. List of all experiences with their descriptions (copy exactly from resume)

    Please format the output as follows:
    Name: [Full Name]
    Projects:
    1. [Experience/Project Name]: [Experience/Project Description]
    2. [Experience/Project Name]: [Experience/Project Description]
    ...

    Extract all experiences, including projects, leadership, work experience, research, etc.
    Don't include hyphens and put the entire description on one line.

    Here's the resume text:
    {resume_text}
    """.format(resume_text=resume_text)

    try:
        message = anthropic.messages.create(
            model="claude-3-haiku-20240307",
            max_tokens=4096,
            system="You are a helpful assistant that extracts information from resumes.",
            messages=[{
                "role": "user",
                "content": prompt
            }]
        )
        extracted_info = message.content[0].text
        st.write("✅ Received response from Claude:")
        st.code(extracted_info, language="text")
    except Exception as e:
        extracted_info = f"An error occurred: {e}"
        st.error(f"❌ API Error: {e}")

    return extracted_info
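
# For reference, Claude's reply is expected to follow the shape the prompt
# above requests; parse_resume() below depends on this layout. The names and
# descriptions here are illustrative, not from a real resume:
#
#   Name: Jane Doe
#   Projects:
#   1. Chess Engine: Built a chess engine in C++ with alpha-beta pruning
#   2. Research Assistant: Studied distributed training of language models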

def get_pinecone_index(database_id: str) -> Index:
    """Connect to the Pinecone index, creating it first if it doesn't exist."""
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

    # list_indexes() returns index descriptions, so compare against the names
    if database_id not in pc.list_indexes().names():
        pc.create_index(
            database_id,
            dimension=VOYAGEAI_VECTOR_DIM,
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            ),
            metric='cosine'
        )

    # Connect to the index and return the handle
    return pc.Index(database_id)


def add_to_voyage(person_name: str, person_projects: List[Dict]) -> None:
    """Embed each project description with VoyageAI and upsert it to Pinecone."""
    embeds = []
    metas = []
    ids = []

    index = get_pinecone_index(PINECONE_ID)
    vo = voyageai.Client(api_key=os.getenv('VOYAGEAI_API_KEY'))

    for i, project in enumerate(person_projects):
        # Embed the description (the embed API takes a list of texts)
        embed = vo.embed(
            texts=[project["description"]],
            model='voyage-3-lite',
            truncation=False
        ).embeddings[0]
        embeds.append(embed)

        # Pinecone metadata must be a dict; keep the person and project names
        metas.append({"person": person_name, "project": project["name"]})

        # Vector IDs must be unique strings, so qualify them with the name
        ids.append(f"{person_name}-{i}")

    # Create the list of (id, vector, metadata) tuples to be upserted
    to_upsert = list(zip(ids, embeds, metas))
    for i in range(0, len(ids), VOYAGEAI_BATCH_SIZE):
        i_end = min(i + VOYAGEAI_BATCH_SIZE, len(ids))
        index.upsert(vectors=to_upsert[i:i_end])

    # View the index statistics
    st.write(index.describe_index_stats())


def parse_resume(uploaded_file: UploadedFile) -> Tuple[str, List[Dict]]:
    """Parse a resume file and return name and projects."""
    try:
        st.write(f"📝 Processing resume: {uploaded_file.name}")
        resume_content = uploaded_file.getvalue()

        st.write("📊 Extracting text from PDF...")
        resume_text = extract_text_from_pdf(resume_content)
        st.write("📄 Extracted text from PDF:")
        st.code(resume_text)

        extracted_info = extract_info_with_claude(resume_text)
        st.write("🔍 Parsing extracted information...")

        # Parse the extracted information
        lines = extracted_info.split('\n')
        name = lines[0].split(': ')[1] if len(lines) > 0 and ': ' in lines[0] else "Unknown"
        st.write(f"👤 Extracted name: {name}")

        projects = []
        project_started = False
        for line in lines:
            if line.strip() == "Projects:":
                project_started = True
                continue
            if project_started and line.strip():
                project_parts = line.split(': ', 1)
                if len(project_parts) == 2:
                    project_name = project_parts[0].split('. ', 1)[-1]  # Remove the leading number
                    project_description = project_parts[1]
                    projects.append({"name": project_name, "description": project_description})

        st.write("📋 Extracted projects:")
        st.json(projects)

        # Store in MongoDB
        resume_data = {
            "name": name,
            "projects": projects,
            "full_content": resume_text
        }
        resume_collection.insert_one(resume_data)
        st.write("💾 Stored data in MongoDB")

        add_to_voyage(name, projects)
        st.write("💾 Stored embeddings in Pinecone")

        return name, projects
    except Exception as e:
        st.error(f"❌ Error processing resume: {e}")
        return "Unknown", []
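
# --- Similarity search sketch ---------------------------------------------
# main() below stubs out the search step ("Feature coming soon"). This is a
# minimal sketch of how that search could work against the index populated by
# add_to_voyage() above; the function name `search_projects` and the default
# `top_k` are illustrative choices, not part of the original script. It reuses
# the same Voyage model and the "person"/"project" metadata keys written at
# indexing time.
def search_projects(query: str, top_k: int = 5) -> List[Dict]:
    """Embed a free-text query and return the closest stored projects."""
    vo = voyageai.Client(api_key=os.getenv('VOYAGEAI_API_KEY'))
    index = get_pinecone_index(PINECONE_ID)

    # Embed the query with the same model used for the project descriptions
    query_embed = vo.embed(
        texts=[query],
        model='voyage-3-lite',
        truncation=False
    ).embeddings[0]

    # Nearest-neighbour lookup over the stored project vectors
    response = index.query(
        vector=query_embed,
        top_k=top_k,
        include_metadata=True
    )

    # Each match carries the metadata dict written by add_to_voyage()
    return [
        {
            "person": match.metadata.get("person", "Unknown"),
            "project": match.metadata.get("project", "Unknown"),
            "score": match.score,
        }
        for match in response.matches
    ]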

def process_resumes(uploaded_files: List[UploadedFile]) -> Dict:
    """Process multiple resumes and return results."""
    results = {}
    progress_bar = st.progress(0)

    for idx, file in enumerate(uploaded_files):
        st.write(f"\n---\n### Processing file {idx + 1} of {len(uploaded_files)}")

        if file.type != "application/pdf":
            st.warning(f"⚠️ Skipping {file.name}: Not a PDF file")
            continue

        try:
            name, projects = parse_resume(file)
            results[file.name] = {
                "name": name,
                "projects": projects
            }
            # Update progress
            progress_bar.progress((idx + 1) / len(uploaded_files))
            st.write(f"✅ Successfully processed {file.name}")
        except Exception as e:
            st.error(f"❌ Error processing {file.name}: {e}")

    return results


def display_results(results: Dict):
    """Display processed resume results in an organized manner."""
    if not results:
        return

    st.subheader("📊 Processed Resumes")

    for filename, data in results.items():
        with st.expander(f"📄 {data['name']} ({filename})"):
            st.write("🏷️ File details:")
            st.json({
                "filename": filename,
                "name": data['name'],
                "number_of_projects": len(data['projects'])
            })

            if data['projects']:
                st.write("📋 Projects:")
                df = pd.DataFrame(data['projects'])
                st.dataframe(
                    df,
                    column_config={
                        "name": "Project Name",
                        "description": "Description"
                    },
                    hide_index=True
                )
            else:
                st.info("ℹ️ No projects found in this resume")


def main():
    st.title("🎯 IntraTalent Resume Processor")

    # File uploader section
    st.header("📤 Upload Resumes")
    uploaded_files = st.file_uploader(
        "Upload up to 10 resumes (PDF only)",
        type=['pdf'],
        accept_multiple_files=True,
        key="resume_uploader"
    )

    # Validate number of files
    if uploaded_files and len(uploaded_files) > 10:
        st.error("⚠️ Maximum 10 files allowed. Please remove some files.")
        return

    # Process button
    if uploaded_files and st.button("🔄 Process Resumes"):
        with st.spinner("Processing resumes..."):
            st.write("🚀 Starting resume processing...")
            results = process_resumes(uploaded_files)
            st.session_state['processed_results'] = results
            st.write("✨ Processing complete!")
            display_results(results)

    # Query section
    st.header("🔍 Search Projects")
    query = st.text_area(
        "Enter your project requirements",
        placeholder="Example: Looking for team members with experience in machine learning and computer vision...",
        height=100
    )

    if query and st.button("🔎 Search"):
        if 'processed_results' not in st.session_state:
            st.warning("⚠️ Please process some resumes first!")
            return

        with st.spinner("Searching for matches..."):
            st.write("🔄 Preparing to search...")
            # Here you would implement the embedding and similarity search;
            # see the search_projects() sketch above for one possible approach
            st.success("✅ Search completed!")

            # Display results in a nice format
            st.subheader("🎯 Top Matches")
            # Placeholder for search results
            st.info("🔜 Feature coming soon: Will display matching projects and candidates based on similarity search")


if __name__ == "__main__":
    main()
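
# Running the app (a sketch; adjust the filename to wherever this script lives):
#   streamlit run app.py
#
# Expected environment variables (loaded from .env by load_dotenv above):
#   MONGO_URI          - MongoDB connection string
#   ANTHROPIC_API_KEY  - Anthropic API key for Claude
#   PINECONE_API_KEY   - Pinecone API key
#   VOYAGEAI_API_KEY   - VoyageAI API key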