import streamlit as st
import os
import tempfile
from pathlib import Path
import time
from typing import List, Dict, Tuple
import pandas as pd
from streamlit.runtime.uploaded_file_manager import UploadedFile
from anthropic import Anthropic
import pymongo
from dotenv import load_dotenv
import fitz  # PyMuPDF
import voyageai
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from pinecone import Index

# Load environment variables
load_dotenv()

# VoyageAI constants
VOYAGEAI_BATCH_SIZE = 128
VOYAGEAI_VECTOR_DIM = 512

# Pinecone index name
PINECONE_ID = "intratalent-v2"

# Initialize MongoDB client
MONGO_URI = os.getenv('MONGO_URI')
mongo_client = pymongo.MongoClient(MONGO_URI)
db = mongo_client['intratalent']
resume_collection = db['resumes']

# Initialize Anthropic client
anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
# Initialize Streamlit app | |
st.set_page_config( | |
page_title="IntraTalent Resume Processor", | |
page_icon="π", | |
layout="wide" | |
) | |

def extract_text_from_pdf(pdf_content: bytes) -> str:
    """Extract text from PDF content."""
    try:
        # Create a temporary file to store the PDF content
        with tempfile.NamedTemporaryFile(mode='w+b', suffix='.pdf', delete=False) as temp_file:
            temp_file.write(pdf_content)
            temp_file_path = temp_file.name

        # Extract text from PDF
        doc = fitz.open(temp_file_path)
        text = ""
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            text += page.get_text() + "\n"
        doc.close()

        # Clean up temporary file
        os.unlink(temp_file_path)
        return text
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""

def extract_info_with_claude(resume_text: str) -> str:
    """Extract information from resume text using Claude."""
    st.write("🤖 Sending request to Claude API...")
    prompt = """
    Extract the following information from the given resume:
    1. Full Name
    2. List of all experiences with their descriptions (copy exactly from resume)
    Please format the output as follows:
    Name: [Full Name]
    Projects:
    1. [Experience/Project Name]: [Experience/Project Description]
    2. [Experience/Project Name]: [Experience/Project Description]
    ...
    Extract all experiences, including projects, leadership, work experience, research, etc. Don't include hyphens and put the entire description on one line.
    Here's the resume text:
    {resume_text}
    """.format(resume_text=resume_text)

    try:
        message = anthropic.messages.create(
            model="claude-3-haiku-20240307",
            max_tokens=4096,
            system="You are a helpful assistant that extracts information from resumes.",
            messages=[{
                "role": "user",
                "content": prompt
            }]
        )
        extracted_info = message.content[0].text
        st.write("✅ Received response from Claude:")
        st.code(extracted_info, language="text")
    except Exception as e:
        extracted_info = f"An error occurred: {e}"
        st.error(f"❌ API Error: {e}")
    return extracted_info

def get_pinecone_index(database_id: str) -> Index:
    """Connect to the Pinecone index, creating it first if needed."""
    # Initialize connection to Pinecone
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
    # If the index does not exist, create it
    if database_id not in pc.list_indexes().names():
        pc.create_index(
            database_id,
            dimension=VOYAGEAI_VECTOR_DIM,
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            ),
            metric='cosine'
        )
    # Connect to the index and return the handle
    index = pc.Index(database_id)
    return index

def add_to_voyage(person_name: str, person_projects: list) -> None:
    """Embed each project description with VoyageAI and upsert into Pinecone."""
    embeds = []
    metas = []
    ids = []
    index = get_pinecone_index(PINECONE_ID)
    vo = voyageai.Client(api_key=os.getenv('VOYAGEAI_API_KEY'))
    for i, project in enumerate(person_projects):
        # Embed the description (the API expects a list of texts)
        embed = vo.embed(
            texts=[project["description"]],
            model='voyage-3-lite',
            truncation=False
        ).embeddings[0]
        embeds.append(embed)
        # Pinecone metadata must be a dict; store the person's name + project name
        metas.append({"text": f"{person_name} {project['name']}"})
        # Give each vector a unique string id, namespaced by person so resumes
        # don't overwrite each other's vectors
        ids.append(f"{person_name}-{i}")
    # Create the list of (id, vector, metadata) tuples to be upserted
    to_upsert = list(zip(ids, embeds, metas))
    # Upsert in batches
    for i in range(0, len(ids), VOYAGEAI_BATCH_SIZE):
        i_end = min(i + VOYAGEAI_BATCH_SIZE, len(ids))
        index.upsert(vectors=to_upsert[i:i_end])
    # View the index statistics
    st.write(index.describe_index_stats())
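
# A hedged sketch (not wired into add_to_voyage above): VoyageAI accepts a
# list of texts per embed call, so descriptions could be embedded in batches
# instead of one request per project. Assumes VOYAGEAI_BATCH_SIZE (128) stays
# within the API's per-request limit; the function name is hypothetical.
def embed_descriptions_batched(vo: voyageai.Client, descriptions: List[str]) -> List[List[float]]:
    embeds = []
    for start in range(0, len(descriptions), VOYAGEAI_BATCH_SIZE):
        batch = descriptions[start:start + VOYAGEAI_BATCH_SIZE]
        embeds.extend(
            vo.embed(texts=batch, model='voyage-3-lite', truncation=False).embeddings
        )
    return embeds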

def parse_resume(uploaded_file: UploadedFile) -> Tuple[str, List[Dict]]:
    """Parse a resume file and return name and projects."""
    try:
        st.write(f"📄 Processing resume: {uploaded_file.name}")
        resume_content = uploaded_file.getvalue()
        st.write("📝 Extracting text from PDF...")
        resume_text = extract_text_from_pdf(resume_content)
        st.write("📄 Extracted text from PDF:")
        st.code(resume_text)

        extracted_info = extract_info_with_claude(resume_text)
        st.write("🔍 Parsing extracted information...")

        # Parse the extracted information
        lines = extracted_info.split('\n')
        name = lines[0].split(': ')[1] if len(lines) > 0 and ': ' in lines[0] else "Unknown"
        st.write(f"👤 Extracted name: {name}")

        projects = []
        project_started = False
        for line in lines:
            if line.strip() == "Projects:":
                project_started = True
                continue
            if project_started and line.strip():
                project_parts = line.split(': ', 1)
                if len(project_parts) == 2:
                    project_name = project_parts[0].split('. ', 1)[-1]  # Remove the number
                    project_description = project_parts[1]
                    projects.append({"name": project_name, "description": project_description})

        st.write("📋 Extracted projects:")
        st.json(projects)

        # Store in MongoDB
        resume_data = {
            "name": name,
            "projects": projects,
            "full_content": resume_text
        }
        resume_collection.insert_one(resume_data)

        # Embed and index the projects
        add_to_voyage(name, projects)
        st.write("💾 Stored data in MongoDB and Pinecone")
        return name, projects
    except Exception as e:
        st.error(f"❌ Error processing resume: {e}")
        return "Unknown", []

def process_resumes(uploaded_files: List[UploadedFile]) -> Dict:
    """Process multiple resumes and return results."""
    results = {}
    progress_bar = st.progress(0)

    for idx, file in enumerate(uploaded_files):
        st.write(f"\n---\n### Processing file {idx + 1} of {len(uploaded_files)}")
        if file.type != "application/pdf":
            st.warning(f"⚠️ Skipping {file.name}: Not a PDF file")
            continue
        try:
            name, projects = parse_resume(file)
            results[file.name] = {
                "name": name,
                "projects": projects
            }
            # Update progress
            progress_bar.progress((idx + 1) / len(uploaded_files))
            st.write(f"✅ Successfully processed {file.name}")
        except Exception as e:
            st.error(f"❌ Error processing {file.name}: {e}")
    return results

def display_results(results: Dict):
    """Display processed resume results in an organized manner."""
    if not results:
        return

    st.subheader("📊 Processed Resumes")
    for filename, data in results.items():
        with st.expander(f"📄 {data['name']} ({filename})"):
            st.write("🏷️ File details:")
            st.json({
                "filename": filename,
                "name": data['name'],
                "number_of_projects": len(data['projects'])
            })
            if data['projects']:
                st.write("📋 Projects:")
                df = pd.DataFrame(data['projects'])
                st.dataframe(
                    df,
                    column_config={
                        "name": "Project Name",
                        "description": "Description"
                    },
                    hide_index=True
                )
            else:
                st.info("ℹ️ No projects found in this resume")
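
# A minimal sketch of the similarity search that main() below leaves as a
# placeholder: embed the query with the same VoyageAI model used for the
# project descriptions, then query the Pinecone index. The function name,
# the top_k default, and reliance on the "text" metadata key set in
# add_to_voyage() are assumptions; this is not wired into the UI yet.
def search_projects(query: str, top_k: int = 5) -> List[Dict]:
    vo = voyageai.Client(api_key=os.getenv('VOYAGEAI_API_KEY'))
    index = get_pinecone_index(PINECONE_ID)
    # Embed the query text (single-element list, matching the indexing side)
    query_embed = vo.embed(
        texts=[query],
        model='voyage-3-lite',
        truncation=False
    ).embeddings[0]
    # Retrieve the top_k nearest project vectors along with their metadata
    response = index.query(vector=query_embed, top_k=top_k, include_metadata=True)
    return [
        {"id": m.id, "score": m.score, "text": m.metadata.get("text", "")}
        for m in response.matches
    ]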

def main():
    st.title("🎯 IntraTalent Resume Processor")

    # File uploader section
    st.header("📤 Upload Resumes")
    uploaded_files = st.file_uploader(
        "Upload up to 10 resumes (PDF only)",
        type=['pdf'],
        accept_multiple_files=True,
        key="resume_uploader"
    )

    # Validate number of files
    if uploaded_files and len(uploaded_files) > 10:
        st.error("⚠️ Maximum 10 files allowed. Please remove some files.")
        return

    # Process button
    if uploaded_files and st.button("🚀 Process Resumes"):
        with st.spinner("Processing resumes..."):
            st.write("🚀 Starting resume processing...")
            results = process_resumes(uploaded_files)
            st.session_state['processed_results'] = results
            st.write("✨ Processing complete!")
            display_results(results)

    # Query section
    st.header("🔍 Search Projects")
    query = st.text_area(
        "Enter your project requirements",
        placeholder="Example: Looking for team members with experience in machine learning and computer vision...",
        height=100
    )

    if query and st.button("🔍 Search"):
        if 'processed_results' not in st.session_state:
            st.warning("⚠️ Please process some resumes first!")
            return
        with st.spinner("Searching for matches..."):
            st.write("🔍 Preparing to search...")
            # Here you would implement the embedding and similarity search,
            # along the lines of the search_projects sketch above
            st.success("✅ Search completed!")

            # Display results in a nice format
            st.subheader("🎯 Top Matches")
            # Placeholder for search results
            st.info("🚧 Feature coming soon: Will display matching projects and candidates based on similarity search")

if __name__ == "__main__":
    main()