# NOTE: page-capture residue, not program code -- "Spaces: Sleeping".
import os | |
import re | |
import torch # Explicitly imported if you want to use torch directly | |
import tempfile | |
from io import BytesIO | |
import streamlit as st | |
from PIL import Image | |
from transformers import pipeline | |
from pdf2image import convert_from_bytes | |
##################################### | |
# Load the OCR Pipeline (Uses Torch) | |
##################################### | |
@st.cache_resource
def _load_ocr_pipeline():
    """Build the image-to-text pipeline used for OCR.

    Streamlit re-executes this script from top to bottom on every widget
    interaction. Wrapping the construction in ``st.cache_resource`` means the
    model is loaded once and the same pipeline object is reused by later
    reruns instead of being rebuilt each time.
    """
    # Make sure that you're using an updated version of the transformers library (>=4.x)
    return pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")


try:
    ocr_pipeline = _load_ocr_pipeline()
    st.write("Model loaded successfully!")
except Exception as e:
    # Without a model nothing below can work: report the failure in the UI
    # and halt this script run.
    st.error(f"Error loading model: {e}")
    st.stop()
##################################### | |
# Utility: Convert PDF to Images | |
##################################### | |
def convert_pdf_to_images(pdf_bytes):
    """Render the pages of a PDF, given as raw bytes, into images.

    Args:
        pdf_bytes: The PDF document content as bytes.

    Returns:
        Whatever ``convert_from_bytes`` produces for the document, or an
        empty list when the conversion raises (the error is shown in the
        Streamlit UI instead of propagating).
    """
    try:
        return convert_from_bytes(pdf_bytes)
    except Exception as err:
        st.error(f"PDF conversion error: {err}")
    return []
##################################### | |
# Pipeline: Extract Text with OCR Pipeline | |
##################################### | |
def _ocr_result_text(result):
    """Return the text held by one OCR pipeline result, or ``None`` if absent.

    The result is expected to be a non-empty list whose first element is a
    dict. The legacy ``"text"`` key is honoured first; ``"generated_text"``
    is the key the transformers image-to-text pipeline documents for its
    output, so it is accepted as well.
    """
    if not isinstance(result, list) or not result:
        return None
    first = result[0]
    if not isinstance(first, dict):
        return None
    for key in ("text", "generated_text"):
        if key in first:
            return first[key]
    return None


def extract_text_from_file(file_obj):
    """Run the OCR pipeline over an uploaded resume and return its text.

    Args:
        file_obj: File-like object exposing ``name`` and ``read()``. A
            ``.pdf`` extension selects the PDF path; anything else is opened
            as an image.

    Returns:
        For PDFs, the text of every page that produced text, each followed
        by a newline. For images, the recognised text, ``""`` when the
        pipeline produced none, or an ``"Error processing image: ..."``
        message when opening or recognising the image raised.
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()

    if file_extension == ".pdf":
        page_images = convert_pdf_to_images(file_obj.read())
        page_texts = (_ocr_result_text(ocr_pipeline(img)) for img in page_images)
        # Pages without recognisable text are skipped rather than padded.
        return "".join(f"{text}\n" for text in page_texts if text is not None)

    try:
        text = _ocr_result_text(ocr_pipeline(Image.open(file_obj)))
    except Exception as e:
        return f"Error processing image: {e}"
    return text if text is not None else ""
##################################### | |
# Information Extraction Functions | |
##################################### | |
def extract_resume_info(text):
    """Pull labelled candidate details out of raw resume text with regexes.

    Args:
        text: Resume text. ``None`` or an empty string yields a result in
            which every field is ``None``.

    Returns:
        A dict with the keys ``"Name"``, ``"Age"``, ``"Job Experience"``,
        ``"Skills"`` and ``"Expected Industry/Direction"``. ``"Skills"`` is a
        list of strings when found; the other fields are strings. Any field
        that could not be located is ``None``.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }
    if not text:
        return info

    # Extract name, e.g., "Name: John Doe".
    # The captured value is limited to letters and horizontal whitespace so
    # it cannot run across a line break into the next field's label, and the
    # leading \b keeps labels such as "Username:" from matching.
    name_match = re.search(r"\b[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)", text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Fallback: first run of two or more capitalised words on one line.
        potential_names = re.findall(
            r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b", text
        )
        if potential_names:
            info["Name"] = potential_names[0]

    # Extract age. Word boundaries stop "Page: 2" or "Language: 3" from being
    # read as an age, and stop a longer number from being truncated.
    age_match = re.search(r"\b[Aa]ge[:\-]\s*(\d{1,2})\b", text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Extract job experience (years), falling back to a labelled line.
    exp_match = re.search(
        r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE
    )
    if exp_match:
        info["Job Experience"] = exp_match.group(1) + " years"
    else:
        exp_line = re.search(
            r"(Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE
        )
        if exp_line:
            info["Job Experience"] = exp_line.group(2).strip()

    # Extract skills as a list of trimmed, non-empty entries.
    skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
    if skills_match:
        pieces = re.split(r",|\n", skills_match.group(1))
        info["Skills"] = [piece.strip() for piece in pieces if piece.strip()]

    # Extract expected industry/direction.
    industry_match = re.search(
        r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE
    )
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(2).strip()

    return info
##################################### | |
# Candidate Comparison Function | |
##################################### | |
def _keyword_set(text):
    """Lower-case *text* and split it into a set of bare words.

    Commas, semicolons and whitespace act as separators; surrounding
    parentheses and a trailing "." or ":" are dropped so that list
    punctuation does not prevent otherwise equal words from matching.
    """
    words = (
        word.strip("()").rstrip(".:")
        for word in re.split(r"[\s,;]+", (text or "").lower())
    )
    return {word for word in words if word}


def compare_candidate_with_company(resume_info, company_requirements):
    """Decide whether a parsed resume overlaps with the company's criteria.

    Args:
        resume_info: Dict of parsed resume fields. Only
            ``"Expected Industry/Direction"`` (str or ``None``) and
            ``"Skills"`` (list of str or ``None``) are read; both may be
            missing.
        company_requirements: Free-text criteria, e.g.
            ``"Technology, Python, Software Development"``. ``None`` is
            treated as empty.

    Returns:
        A dict with ``"Common Keywords"`` (sorted list of words shared by the
        candidate's industry/direction and the requirements) and
        ``"Suitable"`` (``"Yes"`` when any keyword or skill matches,
        otherwise ``"No"``).
    """
    requirements = (company_requirements or "").lower()
    company_keywords = _keyword_set(requirements)
    # Whole comma/semicolon-separated entries, so that a multi-word skill
    # such as "software development" can match as a unit.
    company_phrases = {
        phrase.strip() for phrase in re.split(r"[,;\n]+", requirements)
    }
    company_phrases.discard("")

    # The key may be present with a None value, so a .get() default alone is
    # not enough to guarantee a string here.
    candidate_industry = resume_info.get("Expected Industry/Direction") or ""
    common = _keyword_set(candidate_industry) & company_keywords
    suitable = bool(common)

    # Check skills matching if available
    candidate_skills = {
        skill.lower().strip() for skill in (resume_info.get("Skills") or [])
    }
    if candidate_skills & (company_keywords | company_phrases):
        suitable = True

    return {
        "Common Keywords": sorted(common),
        "Suitable": "Yes" if suitable else "No",
    }
##################################### | |
# Main Processing Logic | |
##################################### | |
def process_resume(file_obj, company_requirements):
    """Run the full resume workflow: OCR, field parsing, then matching.

    Args:
        file_obj: The uploaded resume file, or ``None`` when nothing was
            uploaded.
        company_requirements: Free-text company criteria to compare against.

    Returns:
        A ``(resume_text, resume_info, comparison)`` tuple, or
        ``(None, None, None)`` when ``file_obj`` is ``None``.
    """
    if file_obj is not None:
        text = extract_text_from_file(file_obj)
        parsed = extract_resume_info(text)
        verdict = compare_candidate_with_company(parsed, company_requirements)
        return text, parsed, verdict
    return None, None, None
##################################### | |
# Streamlit UI | |
##################################### | |
# Page header and short description of what the app does.
st.title("Resume Extraction and Candidate Matching")
st.markdown("""
This app uses an image-to-text pipeline (powered by `YouLiXiya/tinyllava-v1.0-1.1b-hf` and PyTorch) to
extract details from uploaded resume files (PDF or image formats). It then parses critical candidate
information and compares it against company requirements.
""")

# Inputs: the resume file and the free-text criteria to match against.
uploaded_file = st.file_uploader(
    "Upload Resume (PDF or Image)", type=["pdf", "png", "jpg", "jpeg"]
)
company_requirements = st.text_input(
    "Enter Company Requirements/Criteria (e.g., industry, skills)",
    placeholder="Example: Technology, Python, Software Development",
)

if st.button("Process Resume"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info, comparison = process_resume(
                uploaded_file, company_requirements
            )

        st.subheader("Extracted Resume Text")
        # Streamlit discourages empty widget labels; give the widget a real
        # label and hide it, since the subheader above already titles it.
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )

        st.subheader("Parsed Resume Information")
        st.json(resume_info)

        st.subheader("Comparison with Company Requirements")
        st.json(comparison)