Spaces:
Sleeping
Sleeping
File size: 4,660 Bytes
cf8a522 8e1d297 9753cc9 8e1d297 08361f0 586dcd2 08361f0 586dcd2 8e1d297 08361f0 8e1d297 9753cc9 8e1d297 08361f0 8e1d297 586dcd2 08361f0 8e1d297 08361f0 8e1d297 08361f0 8e1d297 08361f0 8e1d297 08361f0 8e1d297 08361f0 8e1d297 08361f0 8e1d297 08361f0 8e1d297 6088e9d 8e1d297 6088e9d 8e1d297 08361f0 8e1d297 08361f0 586dcd2 6088e9d 8e1d297 586dcd2 8e1d297 586dcd2 8e1d297 08361f0 8e1d297 9753cc9 8e1d297 6088e9d 8e1d297 9753cc9 8e1d297 6088e9d 8e1d297 586dcd2 6088e9d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import os
import re
import streamlit as st
from PIL import Image
from transformers import pipeline
from pdfminer.high_level import extract_text
# Build the OCR model a single time; st.cache_resource keeps the returned
# object so the expensive load is not repeated.
@st.cache_resource(show_spinner=False)
def load_ocr_pipeline():
    """Create the Transformers image-to-text pipeline used for OCR.

    Returns:
        The pipeline object built by ``transformers.pipeline``.

    If construction fails for any reason, the error is shown in the app
    with ``st.error`` and the script run is halted with ``st.stop``.
    """
    try:
        # Swap the model id here to use a different checkpoint.
        return pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
    except Exception as exc:
        st.error(f"Error loading model: {exc}")
        st.stop()
# Module-level OCR pipeline; the image branch of extract_text_from_file reads
# this global. load_ocr_pipeline is wrapped in st.cache_resource.
ocr_pipeline = load_ocr_pipeline()
# Only reached when loading succeeded: load_ocr_pipeline calls st.stop() on failure.
st.write("Model loaded successfully!")
#####################################
# Text Extraction Function
#####################################
def extract_text_from_file(file_obj):
    """Return the text content of an uploaded PDF or image file.

    Args:
        file_obj: A file-like object with a ``name`` attribute. The file
            extension (case-insensitive) selects the extraction path:
            ``.pdf`` goes through pdfminer's ``extract_text``; anything
            else is opened with PIL and passed to the module-level
            ``ocr_pipeline``.

    Returns:
        The extracted text. An empty string when the OCR pipeline yields
        nothing usable. On failure, a human-readable message starting with
        ``"Error processing PDF: "`` or ``"Error processing image: "`` is
        returned instead of raising (best-effort behaviour kept for the UI).
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()

    if file_extension == ".pdf":
        try:
            # Use pdfminer.six to extract text from PDF files.
            return extract_text(file_obj)
        except Exception as e:
            return f"Error processing PDF: {e}"

    full_text = ""
    try:
        img = Image.open(file_obj)
        result = ocr_pipeline(img)
        # Guard against an empty list so "no output" is reported as empty
        # text rather than as an IndexError message.
        first = result[0] if isinstance(result, list) and result else None
        if isinstance(first, dict):
            # The Transformers image-to-text pipeline reports its output
            # under "generated_text"; "text" is still honoured for
            # pipelines/models that use that key.
            full_text = first.get("generated_text") or first.get("text") or ""
    except Exception as e:
        full_text = f"Error processing image: {e}"
    return full_text
#####################################
# Resume Information Extraction Functions
#####################################
def extract_basic_resume_info(text):
    """
    Extract basic resume information from plain text using regex heuristics.

    Args:
        text: The resume text. Empty or ``None`` input yields a result in
            which every field is ``None``.

    Returns:
        A dict with these keys (each ``None`` when nothing was found):
        - "Name": value after a "Name:" / "Name-" label, limited to a single
          line; otherwise the first run of two or more capitalized words
          on one line.
        - "Age": one- or two-digit number after an "Age:" label, as a string.
        - "Job Experience": "<N> years" when a phrase such as
          "5 years of experience" is present; otherwise the rest of the line
          after an "Experience:" / "Background:" label.
        - "Skills": list of comma- or semicolon-separated items after a
          "Skills:" label.
        - "Expected Industry/Direction": rest of the line after an
          "Industry:" / "Interest:" / "Direction:" label.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }
    if not text:
        return info

    # Name: the label must be a whole word (so "Username:" is ignored) and
    # the captured value stays on one line. Using \s inside the capture would
    # swallow the newline and the next line's label ("John Doe\nAge").
    name_match = re.search(
        r"\bname[:\-]\s*([A-Za-z][A-Za-z \t]*)", text, re.IGNORECASE
    )
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Fallback: first sequence of capitalized words on a single line.
        fallback = re.search(r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b", text)
        if fallback:
            info["Name"] = fallback.group(0)

    # Age: \b before the label rejects "Page: 12"; \b after the digits
    # rejects three-digit numbers instead of truncating them to two digits.
    age_match = re.search(r"\bage[:\-]\s*(\d{1,2})\b", text, re.IGNORECASE)
    if age_match:
        info["Age"] = age_match.group(1)

    # Job experience: prefer an explicit year count, else a labelled line.
    exp_match = re.search(
        r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE
    )
    if exp_match:
        info["Job Experience"] = exp_match.group(1) + " years"
    else:
        exp_line = re.search(
            r"(?:Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE
        )
        if exp_line:
            # A bare label with nothing after it counts as "not found".
            info["Job Experience"] = exp_line.group(1).strip() or None

    # Skills (e.g., "Skills: Python, Java, SQL"). The capture is a single
    # line ('.' does not match newlines), so only in-line separators matter.
    skills_match = re.search(r"\bskills[:\-]\s*(.+)", text, re.IGNORECASE)
    if skills_match:
        info["Skills"] = [
            item.strip()
            for item in re.split(r"[,;]", skills_match.group(1))
            if item.strip()
        ]

    # Expected industry/direction (e.g., "Interest: Software Development").
    industry_match = re.search(
        r"(?:Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE
    )
    if industry_match:
        info["Expected Industry/Direction"] = (
            industry_match.group(1).strip() or None
        )

    return info
#####################################
# Main Resume Processing Logic
#####################################
def process_resume(file_obj):
    """Run text extraction and basic-info parsing on an uploaded resume.

    Args:
        file_obj: The uploaded file, or ``None`` when nothing was uploaded.

    Returns:
        A ``(resume_text, resume_info)`` tuple; ``(None, None)`` when
        ``file_obj`` is ``None``.
    """
    if file_obj is not None:
        # Pull the raw text out of the PDF/image, then parse fields from it.
        text = extract_text_from_file(file_obj)
        return text, extract_basic_resume_info(text)
    return None, None
#####################################
# Streamlit Interface
#####################################
# Page header and short usage instructions.
st.title("Resume Extraction and Basic Info Parsing")
st.markdown("""
Upload a resume file in PDF or image format (PNG, JPG, JPEG) to extract text and candidate information.
""")

# The uploader restricts selectable files to the formats the extractor handles.
uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])

if st.button("Extract Info"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info = process_resume(uploaded_file)

        st.subheader("Extracted Resume Text")
        # Streamlit discourages empty widget labels; give the widget a real
        # label and hide it, since the subheader above already titles it.
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )

        st.subheader("Parsed Basic Resume Information")
        st.json(resume_info)