Spaces:
Sleeping
Sleeping
File size: 4,515 Bytes
cf8a522 8e1d297 9753cc9 8e1d297 9753cc9 586dcd2 8e1d297 9753cc9 8e1d297 9753cc9 8e1d297 586dcd2 8e1d297 6088e9d 8e1d297 6088e9d 8e1d297 6088e9d 8e1d297 6088e9d 8e1d297 6088e9d 8e1d297 9753cc9 8e1d297 9753cc9 586dcd2 6088e9d 8e1d297 586dcd2 8e1d297 586dcd2 8e1d297 9753cc9 8e1d297 9753cc9 8e1d297 6088e9d 8e1d297 9753cc9 8e1d297 6088e9d 8e1d297 586dcd2 6088e9d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import os
import re
import streamlit as st
from PIL import Image
from transformers import pipeline
from pdfminer.high_level import extract_text
# Load and cache the OCR model once at startup
@st.cache_resource(show_spinner=False)
def load_ocr_pipeline():
    """Create the image-to-text pipeline used for OCR.

    The function is wrapped in ``st.cache_resource``, so the pipeline object
    is reused instead of being rebuilt on each call.

    Returns:
        The ``transformers`` pipeline object. If construction raises, the
        error is reported with ``st.error`` and ``st.stop()`` is called
        instead of returning.
    """
    try:
        # Ensure your transformers library is up-to-date (>=4.x)
        return pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
    except Exception as load_error:
        st.error(f"Error loading model: {load_error}")
        st.stop()
# Module-level OCR pipeline; extract_text_from_file() calls it for non-PDF uploads.
ocr_pipeline = load_ocr_pipeline()
# Only reached when load_ocr_pipeline() returned without raising.
st.write("Model loaded successfully!")
#####################################
# Extract Text from File Function
#####################################
def extract_text_from_file(file_obj):
    """Return the text content of an uploaded PDF or image file.

    Files whose name ends in ``.pdf`` are read with pdfminer's
    ``extract_text``. Any other file is opened as an image with PIL and
    passed to the module-level ``ocr_pipeline``.

    Args:
        file_obj: File-like object exposing a ``name`` attribute (used only
            to determine the extension).

    Returns:
        The extracted text. An empty string is returned when the pipeline
        produces no usable output. If extraction raises, the returned string
        is an ``"Error processing PDF: ..."`` or
        ``"Error processing image: ..."`` message rather than an exception.
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()

    if file_extension == ".pdf":
        try:
            # Use pdfminer.six to extract text from PDF files.
            return extract_text(file_obj)
        except Exception as e:
            return f"Error processing PDF: {e}"

    try:
        img = Image.open(file_obj)
        result = ocr_pipeline(img)
    except Exception as e:
        return f"Error processing image: {e}"

    # An empty or unexpectedly shaped result means "no text", not an error.
    if not isinstance(result, list) or not result:
        return ""
    first = result[0]
    if not isinstance(first, dict):
        return ""

    # The transformers image-to-text pipeline reports its output under
    # "generated_text"; "text" is kept as a fallback for compatibility with
    # the key this function originally looked for.
    for key in ("generated_text", "text"):
        if key in first:
            return first[key]
    return ""
#####################################
# Information Extraction Functions
#####################################
def extract_basic_resume_info(text):
    """Extract basic resume info: Name, Age, Job Experience, Skills, Expected Industry/Direction.

    All extraction is regex-based and best-effort; fields that cannot be
    found stay ``None``.

    Args:
        text: Raw resume text. ``None`` or an empty string yields a result
            with every field set to ``None``.

    Returns:
        dict with the keys ``"Name"`` (str), ``"Age"`` (str of digits),
        ``"Job Experience"`` (str), ``"Skills"`` (list of str) and
        ``"Expected Industry/Direction"`` (str); each value is ``None`` when
        no match was found.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }
    if not text:
        return info

    # Extract name (e.g., "Name: John Doe").
    # The leading \b stops labels such as "Username:" from matching, and the
    # capture allows only letters, spaces and tabs so it cannot run on into
    # the following line.
    name_match = re.search(r"\b[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)", text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Heuristic: pick the first sequence of capitalized words that sit on
        # the same line (words are joined by spaces/tabs only).
        heuristic_match = re.search(
            r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b", text
        )
        if heuristic_match:
            info["Name"] = heuristic_match.group(0)

    # Extract age. \b keeps "Language: 12" from being read as an age, and the
    # lookahead rejects numbers with more than two digits instead of
    # truncating them.
    age_match = re.search(r"\b[Aa]ge[:\-]\s*(\d{1,2})(?!\d)", text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Extract job experience (years); also accepts the "5+ years" spelling.
    exp_match = re.search(
        r"(\d+)\+?\s*(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE
    )
    if exp_match:
        info["Job Experience"] = exp_match.group(1) + " years"
    else:
        # Fall back to the free text following an "Experience:" or
        # "Background:" label.
        exp_line = re.search(
            r"(Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE
        )
        if exp_line:
            info["Job Experience"] = exp_line.group(2).strip()

    # Extract skills (e.g., "Skills: Python, Java, SQL")
    skills_match = re.search(r"\b[Ss]kills[:\-]\s*(.+)", text)
    if skills_match:
        skills_text = skills_match.group(1)
        info["Skills"] = [
            s.strip() for s in re.split(r",|\n", skills_text) if s.strip()
        ]

    # Extract expected industry/direction
    industry_match = re.search(
        r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE
    )
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(2).strip()

    return info
#####################################
# Main Processing Logic
#####################################
def process_resume(file_obj):
    """Run text extraction and heuristic parsing on an uploaded resume.

    Args:
        file_obj: Uploaded file object, or ``None`` when nothing was uploaded.

    Returns:
        A ``(resume_text, resume_info)`` tuple, or ``(None, None)`` when
        ``file_obj`` is ``None``.
    """
    if file_obj is not None:
        # Extract text based on file type (PDF or image), then parse it.
        extracted = extract_text_from_file(file_obj)
        return extracted, extract_basic_resume_info(extracted)
    return None, None
#####################################
# Streamlit Interface
#####################################
# Page header and upload widget.
st.title("Resume Extraction and Basic Info Parsing")
st.markdown("""
Upload a resume file (PDF, PNG, JPG, or JPEG) to extract basic text and candidate information.
""")
uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])

# Extraction runs only when the button is pressed.
if st.button("Extract Info"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info = process_resume(uploaded_file)
        st.subheader("Extracted Resume Text")
        # Streamlit discourages empty widget labels for accessibility reasons;
        # give the widget a real label and hide it, since the subheader above
        # already titles this section.
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )
        st.subheader("Parsed Basic Resume Information")
        st.json(resume_info)