Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

File size: 4,515 Bytes

import os
import re
import streamlit as st
from PIL import Image
from transformers import pipeline
from pdfminer.high_level import extract_text

# Build the image-to-text model a single time; st.cache_resource keeps the instance.
@st.cache_resource(show_spinner=False)
def load_ocr_pipeline():
    """Create the Hugging Face image-to-text pipeline used for image resumes.

    Returns:
        The pipeline object built by ``transformers.pipeline``.

    If construction fails for any reason, the error is displayed in the
    app via ``st.error`` and the current script run is halted with
    ``st.stop()``.
    """
    try:
        # Requires a reasonably recent transformers release (>=4.x).
        return pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
    except Exception as load_error:
        st.error(f"Error loading model: {load_error}")
        st.stop()

# Module-level pipeline instance; extract_text_from_file() reads this global
# when it processes image uploads.
ocr_pipeline = load_ocr_pipeline()
# Reached only after load_ocr_pipeline() has returned (it calls st.stop() on failure).
st.write("Model loaded successfully!")

#####################################
# Extract Text from File Function
#####################################
def extract_text_from_file(file_obj):
    """Extract raw text from an uploaded resume file.

    Files whose name ends in ".pdf" (case-insensitive) are read with
    pdfminer's ``extract_text``. Every other file is opened as an image
    with PIL and passed to the module-level ``ocr_pipeline``.

    Args:
        file_obj: A file-like object exposing a ``name`` attribute.

    Returns:
        str: The extracted text; an empty string when the model returned
        nothing usable; or a message beginning with "Error processing"
        when extraction raised an exception.
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()

    if file_extension == ".pdf":
        try:
            # Use pdfminer.six to extract text from PDF files.
            return extract_text(file_obj)
        except Exception as e:
            return f"Error processing PDF: {e}"

    try:
        img = Image.open(file_obj)
        result = ocr_pipeline(img)
    except Exception as e:
        return f"Error processing image: {e}"
    return _text_from_pipeline_result(result)


def _text_from_pipeline_result(result):
    """Return the generated string from an image-to-text result, or ""."""
    # A single dict is normalised to the usual list-of-dicts shape.
    if isinstance(result, dict):
        result = [result]
    # An empty or non-sequence result means "no text", not an indexing error.
    if not isinstance(result, (list, tuple)) or not result:
        return ""
    first = result[0]
    if not isinstance(first, dict):
        return ""
    # The image-to-text pipeline reports its output under "generated_text";
    # "text" is kept as a fallback for pipelines that use that key instead.
    for key in ("generated_text", "text"):
        value = first.get(key)
        if isinstance(value, str):
            return value
    return ""

#####################################
# Information Extraction Functions
#####################################
def extract_basic_resume_info(text):
    """Extract basic resume fields from plain text using regex heuristics.

    Args:
        text: The resume text. ``None`` or an empty string yields a result
            in which every field is ``None``.

    Returns:
        dict: Keys "Name", "Age", "Job Experience", "Skills" and
        "Expected Industry/Direction". "Skills" is a list of strings when
        found; every other found value is a string. Fields that could not
        be located are ``None``.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }
    if not text:
        return info

    # Explicit label, e.g. "Name: John Doe". The lookbehind stops the label
    # from matching inside longer words ("Username:"), and the capture is
    # limited to letters, spaces and tabs so it cannot run onto the next line.
    name_match = re.search(
        r"(?<![A-Za-z])[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)", text
    )
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Heuristic: first run of capitalized words on a single line.
        fallback_name = re.search(
            r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b", text
        )
        if fallback_name:
            info["Name"] = fallback_name.group(0)

    # Age label; the lookbehind rejects "Page:" / "Language:" and the
    # lookahead rejects longer numbers such as years ("Age: 2019").
    age_match = re.search(r"(?<![A-Za-z])[Aa]ge[:\-]\s*(\d{1,2})(?!\d)", text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Job experience: prefer an explicit "<n> years of experience" phrase,
    # otherwise fall back to the rest of an "Experience:"/"Background:" line.
    exp_match = re.search(
        r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE
    )
    if exp_match:
        info["Job Experience"] = exp_match.group(1) + " years"
    else:
        exp_line = re.search(
            r"(Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE
        )
        if exp_line:
            info["Job Experience"] = exp_line.group(2).strip()

    # Skills, e.g. "Skills: Python, Java, SQL" -> ["Python", "Java", "SQL"].
    skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
    if skills_match:
        pieces = re.split(r",|\n", skills_match.group(1))
        info["Skills"] = [piece.strip() for piece in pieces if piece.strip()]

    # Expected industry/direction: rest of the first matching labelled line.
    industry_match = re.search(
        r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE
    )
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(2).strip()

    return info

#####################################
# Main Processing Logic
#####################################
def process_resume(file_obj):
    """Run text extraction and heuristic parsing on an uploaded resume.

    Args:
        file_obj: The uploaded file, or ``None`` when nothing was uploaded.

    Returns:
        tuple: ``(resume_text, resume_info)``; ``(None, None)`` when
        ``file_obj`` is ``None``.
    """
    if file_obj is not None:
        # Raw text first (PDF or image), then the regex-based field parsing.
        raw_text = extract_text_from_file(file_obj)
        return raw_text, extract_basic_resume_info(raw_text)
    return None, None

#####################################
# Streamlit Interface
#####################################
# Page header and short usage instructions.
st.title("Resume Extraction and Basic Info Parsing")
st.markdown("""
Upload a resume file (PDF, PNG, JPG, or JPEG) to extract basic text and candidate information.
""")

# Only the extensions listed here can be selected in the uploader.
uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])

if st.button("Extract Info"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info = process_resume(uploaded_file)
        st.subheader("Extracted Resume Text")
        # Streamlit discourages empty widget labels; supply a real label and
        # hide it, since the subheader above already titles this area.
        st.text_area(
            "Extracted resume text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )
        st.subheader("Parsed Basic Resume Information")
        st.json(resume_info)