# CR7CAD's picture
# Update app.py
# 294af95 verified
# raw
# history blame
# 4.29 kB
import os
import re
import torch # Explicitly imported if you want to use torch directly
from io import BytesIO
import streamlit as st
from PIL import Image
from transformers import pipeline
# st.cache_resource (Streamlit 1.18+) keeps a single pipeline instance around
# so the model is not reloaded on every script rerun.
@st.cache_resource(show_spinner=False)
def load_ocr_pipeline():
    """Build the image-to-text pipeline used for OCR.

    Returns:
        The pipeline object created by ``transformers.pipeline``.

    If construction fails, the error is reported in the Streamlit UI and
    ``st.stop()`` is called instead of returning.
    """
    try:
        # Requires a reasonably recent transformers release (>=4.x).
        return pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
    except Exception as load_error:
        st.error(f"Error loading model: {load_error}")
        st.stop()
# Load the model at startup. load_ocr_pipeline is wrapped in st.cache_resource,
# so this module-level global is what extract_text_from_file calls later.
ocr_pipeline = load_ocr_pipeline()
# Written to the page each time this line is reached, i.e. whenever the loader
# returned instead of stopping the script.
st.write("Model loaded successfully!")
#####################################
# Pipeline: Extract Text with OCR Pipeline
#####################################
def extract_text_from_file(file_obj):
    """Run the OCR pipeline on an image and return the text it produced.

    Args:
        file_obj: Anything ``PIL.Image.open`` accepts (the app passes the
            uploaded file object).

    Returns:
        The text from the first pipeline result, ``""`` when the pipeline
        returned nothing usable, or a string starting with
        ``"Error processing image: "`` when opening the image or running the
        pipeline raised.
    """
    try:
        img = Image.open(file_obj)
        result = ocr_pipeline(img)
    except Exception as e:
        # Deliberate best-effort: report the failure as text so the UI can
        # still render something instead of crashing the script run.
        return f"Error processing image: {e}"

    # Guard the shape before indexing: an empty list previously raised
    # IndexError and was reported as an image-processing error.
    if not isinstance(result, list) or not result:
        return ""
    first = result[0]
    if not isinstance(first, dict):
        return ""

    # The transformers image-to-text pipeline stores its output under
    # "generated_text"; "text" is still accepted for backward compatibility.
    for key in ("generated_text", "text"):
        if key in first:
            return first[key]
    return ""
#####################################
# Information Extraction Functions
#####################################
# The "name"/"age" labels are guarded with (?<![A-Za-z]) so that words which
# merely end in the label ("Username:", "Page: 12") are not treated as the label.
# The captured name is restricted to letters, spaces and tabs so it cannot run
# on into the next line of the resume.
_NAME_LABEL_RE = re.compile(r"(?<![A-Za-z])[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)")
# Fallback: two or more capitalized words on the SAME line.
_NAME_HEURISTIC_RE = re.compile(r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b")
_AGE_RE = re.compile(r"(?<![A-Za-z])[Aa]ge[:\-]\s*(\d{1,2})")
_EXPERIENCE_YEARS_RE = re.compile(
    r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", re.IGNORECASE
)
_EXPERIENCE_LINE_RE = re.compile(r"(Experience|Background)[:\-]\s*(.*)", re.IGNORECASE)
_SKILLS_RE = re.compile(r"[Ss]kills[:\-]\s*(.+)")
_INDUSTRY_RE = re.compile(r"(Industry|Interest|Direction)[:\-]\s*(.+)", re.IGNORECASE)


def extract_basic_resume_info(text):
    """Extract basic resume info: Name, Age, Job Experience, Skills, Expected Industry/Direction.

    Args:
        text: Raw resume text. ``None`` or an empty string is accepted and
            yields a result with every field unset.

    Returns:
        A dict with the keys ``"Name"``, ``"Age"``, ``"Job Experience"``,
        ``"Skills"`` and ``"Expected Industry/Direction"``. ``"Skills"`` is a
        list of strings when found; every other found value is a string.
        Fields that could not be found are ``None``.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }
    if not text:
        return info

    # Name: prefer an explicit "Name: John Doe" label, otherwise fall back to
    # the first run of capitalized words.
    name_match = _NAME_LABEL_RE.search(text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        heuristic_match = _NAME_HEURISTIC_RE.search(text)
        if heuristic_match:
            info["Name"] = heuristic_match.group(0)

    # Age: kept as the matched digit string, not converted to int.
    age_match = _AGE_RE.search(text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Job experience: "<N> years of experience" wins; otherwise take whatever
    # follows an "Experience:" / "Background:" label.
    exp_match = _EXPERIENCE_YEARS_RE.search(text)
    if exp_match:
        info["Job Experience"] = exp_match.group(1) + " years"
    else:
        exp_line = _EXPERIENCE_LINE_RE.search(text)
        if exp_line:
            info["Job Experience"] = exp_line.group(2).strip()

    # Skills (e.g., "Skills: Python, Java, SQL"). The capture uses ".", which
    # never spans a newline, so splitting on commas alone is sufficient.
    skills_match = _SKILLS_RE.search(text)
    if skills_match:
        info["Skills"] = [
            skill.strip() for skill in skills_match.group(1).split(",") if skill.strip()
        ]

    # Expected industry/direction.
    industry_match = _INDUSTRY_RE.search(text)
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(2).strip()

    return info
#####################################
# Main Processing Logic
#####################################
def process_resume(file_obj):
    """Turn an uploaded resume image into raw text plus parsed fields.

    Args:
        file_obj: The uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        A ``(resume_text, resume_info)`` tuple, or ``(None, None)`` when
        ``file_obj`` is ``None``.
    """
    if file_obj is not None:
        # Text comes solely from the image-based OCR pipeline.
        ocr_text = extract_text_from_file(file_obj)
        # Pull the basic fields out of the recognized text.
        parsed_fields = extract_basic_resume_info(ocr_text)
        return ocr_text, parsed_fields
    return None, None
#####################################
# Streamlit Interface
#####################################
st.title("Resume Extraction and Basic Info Parsing")
st.markdown("""
Upload an image file (PNG, JPG, or JPEG) to extract basic text and candidate information.
""")
uploaded_file = st.file_uploader("Upload Resume (Image Only)", type=["png", "jpg", "jpeg"])

# Work is only done on an explicit button press.
if st.button("Extract Info"):
    if uploaded_file is None:
        st.error("Please upload an image file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info = process_resume(uploaded_file)
        st.subheader("Extracted Resume Text")
        # Streamlit discourages empty widget labels; give the widget a real
        # label and hide it, since the subheader above already titles it.
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )
        st.subheader("Parsed Basic Resume Information")
        st.json(resume_info)