"""Streamlit app: OCR an uploaded resume image and parse basic candidate info."""

import os
import re
from io import BytesIO

import streamlit as st
import torch  # Explicitly imported if you want to use torch directly
from PIL import Image
from transformers import pipeline

# Label-based patterns. The leading \b keeps labels from matching inside longer
# words ("Filename:", "Page:", "Language:").
_NAME_LABEL_RE = re.compile(r"\b[Nn]ame[:\-][ \t]*([A-Za-z][A-Za-z \t]*)")
# Fallback: first run of two or more capitalized words on the same line.
_CAPITALIZED_WORDS_RE = re.compile(r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b")
# Trailing \b rejects longer numbers (e.g. a year) instead of truncating them.
_AGE_RE = re.compile(r"\b[Aa]ge[:\-]\s*(\d{1,3})\b")
# Accepts "5 years", "5+ years", "1 year", "3 yrs", optionally followed by "of".
_EXPERIENCE_YEARS_RE = re.compile(
    r"(\d+)\+?\s*(?:years?|yrs?)\s+(?:of\s+)?experience", re.IGNORECASE
)
_EXPERIENCE_LINE_RE = re.compile(
    r"\b(?:Experience|Background)[:\-]\s*(.*)", re.IGNORECASE
)
_SKILLS_RE = re.compile(r"\b[Ss]kills[:\-]\s*(.+)")
_INDUSTRY_RE = re.compile(
    r"\b(?:Industry|Interest|Direction)[:\-]\s*(.+)", re.IGNORECASE
)


# Use st.cache_resource (Streamlit 1.18+) to load and cache the OCR pipeline once
@st.cache_resource(show_spinner=False)
def load_ocr_pipeline():
    """Create the image-to-text pipeline used for OCR.

    The result is cached by ``st.cache_resource`` so the model is only built
    once. If construction fails, the error is shown in the app and the script
    run is stopped via ``st.stop()``.
    """
    try:
        # Ensure your transformers library is up-to-date (>=4.x)
        return pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
    except Exception as e:
        st.error(f"Error loading model: {e}")
        st.stop()


# Load the model at startup
ocr_pipeline = load_ocr_pipeline()
st.write("Model loaded successfully!")


#####################################
# Pipeline: Extract Text with OCR Pipeline
#####################################
def extract_text_from_file(file_obj) -> str:
    """Run the OCR pipeline on an image file and return the recognized text.

    Args:
        file_obj: A path or binary file-like object accepted by
            ``PIL.Image.open``.

    Returns:
        The text of the first pipeline result, or ``""`` when the image could
        not be processed or the pipeline produced no text. Failures are
        reported with ``st.error`` rather than returned as text, so an error
        message is never parsed as resume content downstream.
    """
    try:
        img = Image.open(file_obj)
        result = ocr_pipeline(img)
    except Exception as e:
        st.error(f"Error processing image: {e}")
        return ""

    # Guard against an empty list or an unexpected result shape.
    if not (isinstance(result, list) and result and isinstance(result[0], dict)):
        return ""

    first = result[0]
    # The transformers image-to-text pipeline documents its output key as
    # "generated_text"; "text" is still accepted as a fallback.
    text = first.get("generated_text") or first.get("text") or ""
    return text if isinstance(text, str) else str(text)


#####################################
# Information Extraction Functions
#####################################
def extract_basic_resume_info(text) -> dict:
    """Extract basic resume info: Name, Age, Job Experience, Skills, Expected Industry/Direction.

    Args:
        text: Raw resume text. ``None`` or an empty string is allowed.

    Returns:
        A dict with the keys ``"Name"``, ``"Age"``, ``"Job Experience"``,
        ``"Skills"`` and ``"Expected Industry/Direction"``. Each value is
        ``None`` when nothing was found. ``"Skills"`` is a list of strings;
        every other found value is a string.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }
    if not text:
        return info

    # Extract name (e.g., "Name: John Doe"). The capture is limited to the
    # label's own line so it cannot swallow the next line's label.
    name_match = _NAME_LABEL_RE.search(text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Heuristic: pick the first sequence of capitalized words
        heuristic_match = _CAPITALIZED_WORDS_RE.search(text)
        if heuristic_match:
            info["Name"] = heuristic_match.group(0)

    # Extract age
    age_match = _AGE_RE.search(text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Extract job experience (years); fall back to a labelled line.
    exp_match = _EXPERIENCE_YEARS_RE.search(text)
    if exp_match:
        years = exp_match.group(1)
        unit = "year" if int(years) == 1 else "years"
        info["Job Experience"] = f"{years} {unit}"
    else:
        exp_line = _EXPERIENCE_LINE_RE.search(text)
        if exp_line:
            info["Job Experience"] = exp_line.group(1).strip() or None

    # Extract skills (e.g., "Skills: Python, Java, SQL")
    skills_match = _SKILLS_RE.search(text)
    if skills_match:
        skills = [s.strip() for s in skills_match.group(1).split(",") if s.strip()]
        info["Skills"] = skills or None

    # Extract expected industry/direction
    industry_match = _INDUSTRY_RE.search(text)
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(1).strip() or None

    return info


#####################################
# Main Processing Logic
#####################################
def process_resume(file_obj):
    """OCR an uploaded resume image and parse its basic fields.

    Args:
        file_obj: The uploaded image, or ``None``.

    Returns:
        A ``(resume_text, resume_info)`` tuple, or ``(None, None)`` when
        ``file_obj`` is ``None``.
    """
    if file_obj is None:
        return None, None

    # Extract text using only the image-based OCR pipeline
    resume_text = extract_text_from_file(file_obj)
    # Parse basic resume info
    resume_info = extract_basic_resume_info(resume_text)
    return resume_text, resume_info


#####################################
# Streamlit Interface
#####################################
st.title("Resume Extraction and Basic Info Parsing")
st.markdown("""
Upload an image file (PNG, JPG, or JPEG) to extract basic text and candidate information.
""")

uploaded_file = st.file_uploader(
    "Upload Resume (Image Only)", type=["png", "jpg", "jpeg"]
)

if st.button("Extract Info"):
    if uploaded_file is None:
        st.error("Please upload an image file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info = process_resume(uploaded_file)

        st.subheader("Extracted Resume Text")
        # Streamlit warns on empty widget labels, so give the widget a real
        # label and hide it; the subheader above serves as the visible title.
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )
        st.subheader("Parsed Basic Resume Information")
        st.json(resume_info)