import os
import re

import streamlit as st
from PIL import Image
from transformers import pipeline
from pdfminer.high_level import extract_text


# Load and cache the OCR model once at startup.
@st.cache_resource(show_spinner=False)
def load_ocr_pipeline():
    """Create the image-to-text pipeline used to read image resumes.

    The result is cached by ``st.cache_resource`` so the model is not rebuilt
    on every Streamlit rerun. If the pipeline cannot be created, the error is
    shown in the app and the script run is stopped.
    """
    try:
        # Initialize the OCR pipeline from transformers. Change the model as needed.
        return pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
    except Exception as e:
        st.error(f"Error loading model: {e}")
        st.stop()


ocr_pipeline = load_ocr_pipeline()
st.write("Model loaded successfully!")


#####################################
# Text Extraction Function
#####################################
def _text_from_ocr_result(result):
    """Return the text of the first OCR prediction, or "" if there is none."""
    if not isinstance(result, list) or not result:
        return ""
    first = result[0]
    if not isinstance(first, dict):
        return ""
    # The transformers image-to-text pipeline reports its output under
    # "generated_text"; "text" is still accepted for other pipelines.
    for key in ("generated_text", "text"):
        value = first.get(key)
        if isinstance(value, str):
            return value
    return ""


def extract_text_from_file(file_obj):
    """Extract plain text from an uploaded PDF or image file.

    Args:
        file_obj: A file-like object with a ``name`` attribute; the file
            extension in ``name`` selects PDF extraction or image OCR.

    Returns:
        The extracted text. Failures are not raised: a message of the form
        ``"Error processing PDF: ..."`` or ``"Error processing image: ..."``
        is returned instead. An OCR result without any text yields ``""``.
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()

    if file_extension == ".pdf":
        try:
            # Use pdfminer.six to extract text from PDF files.
            return extract_text(file_obj)
        except Exception as e:
            return f"Error processing PDF: {e}"

    try:
        img = Image.open(file_obj)
        result = ocr_pipeline(img)
    except Exception as e:
        return f"Error processing image: {e}"
    return _text_from_ocr_result(result)


#####################################
# Resume Information Extraction Functions
#####################################
# The (?<![A-Za-z]) guards keep labels from matching inside longer words
# such as "Username:" or "Page:".
# The captured name is limited to letters, spaces and tabs so it cannot run
# on into the following line.
_NAME_RE = re.compile(r"(?<![A-Za-z])[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)")
# Fallback heuristic: two or more capitalized words on the same line.
_NAME_FALLBACK_RE = re.compile(r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b")
# Up to three digits, and never a prefix of a longer number (e.g. a year).
_AGE_RE = re.compile(r"(?<![A-Za-z])[Aa]ge[:\-]\s*(\d{1,3})(?!\d)")
_EXPERIENCE_YEARS_RE = re.compile(
    r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", re.IGNORECASE
)
_EXPERIENCE_LINE_RE = re.compile(
    r"(?:Experience|Background)[:\-]\s*(.*)", re.IGNORECASE
)
_SKILLS_RE = re.compile(r"[Ss]kills[:\-]\s*(.+)")
_INDUSTRY_RE = re.compile(
    r"(?:Industry|Interest|Direction)[:\-]\s*(.+)", re.IGNORECASE
)


def extract_basic_resume_info(text):
    """Extract basic resume information from free text.

    Looks for labelled fields ("Name:", "Age:", "Skills:", ...) using regular
    expressions.

    Args:
        text: The resume text. ``None`` is treated as an empty string.

    Returns:
        A dict with the keys "Name", "Age", "Job Experience", "Skills" and
        "Expected Industry/Direction". A field that is not found is ``None``.
        "Skills" is a list of strings when found; every other field is a
        string.
    """
    text = text or ""
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }

    # Extract Name: trigger words like 'Name:'
    name_match = _NAME_RE.search(text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Fallback: heuristic for sequences of capitalized words.
        fallback_match = _NAME_FALLBACK_RE.search(text)
        if fallback_match:
            info["Name"] = fallback_match.group(0)

    # Extract Age:
    age_match = _AGE_RE.search(text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Extract Job Experience (years)
    years_match = _EXPERIENCE_YEARS_RE.search(text)
    if years_match:
        info["Job Experience"] = years_match.group(1) + " years"
    else:
        exp_line = _EXPERIENCE_LINE_RE.search(text)
        if exp_line:
            info["Job Experience"] = exp_line.group(1).strip()

    # Extract Skills (e.g., "Skills: Python, Java, SQL")
    skills_match = _SKILLS_RE.search(text)
    if skills_match:
        # The capture never contains a newline, so commas are the only
        # separator that needs handling.
        info["Skills"] = [
            skill.strip()
            for skill in skills_match.group(1).split(",")
            if skill.strip()
        ]

    # Extract Expected Industry/Direction (e.g., "Interest: Software Development")
    industry_match = _INDUSTRY_RE.search(text)
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(1).strip()

    return info


#####################################
# Main Resume Processing Logic
#####################################
def process_resume(file_obj):
    """Extract the text of an uploaded resume and parse basic details from it.

    Args:
        file_obj: The uploaded file, or ``None``.

    Returns:
        A ``(resume_text, resume_info)`` tuple, or ``(None, None)`` when
        ``file_obj`` is ``None``.
    """
    if file_obj is None:
        return None, None
    # Extract text based on file type (PDF or image).
    resume_text = extract_text_from_file(file_obj)
    # Parse basic resume details from the extracted text.
    resume_info = extract_basic_resume_info(resume_text)
    return resume_text, resume_info


#####################################
# Streamlit Interface
#####################################
st.title("Resume Extraction and Basic Info Parsing")
st.markdown("""
Upload a resume file in PDF or image format (PNG, JPG, JPEG) to extract text and candidate information.
""")

uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])

if st.button("Extract Info"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info = process_resume(uploaded_file)
        st.subheader("Extracted Resume Text")
        # A non-empty label avoids Streamlit's empty-label warning; it is
        # hidden because the subheader above already titles the widget.
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )
        st.subheader("Parsed Basic Resume Information")
        st.json(resume_info)