File size: 4,660 Bytes
cf8a522
8e1d297
 
 
 
9753cc9
8e1d297
08361f0
586dcd2
 
 
08361f0
586dcd2
 
 
 
 
 
 
 
8e1d297
 
08361f0
8e1d297
 
 
9753cc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e1d297
 
 
08361f0
8e1d297
586dcd2
08361f0
 
 
 
 
 
 
 
8e1d297
 
 
 
 
 
 
 
08361f0
8e1d297
 
 
 
08361f0
8e1d297
 
 
 
08361f0
8e1d297
 
 
 
08361f0
8e1d297
 
 
 
 
 
 
 
08361f0
8e1d297
 
 
 
 
 
08361f0
8e1d297
 
 
 
 
 
 
08361f0
8e1d297
6088e9d
8e1d297
6088e9d
8e1d297
08361f0
8e1d297
08361f0
586dcd2
6088e9d
8e1d297
 
586dcd2
8e1d297
586dcd2
8e1d297
08361f0
8e1d297
 
9753cc9
8e1d297
6088e9d
8e1d297
9753cc9
8e1d297
 
6088e9d
8e1d297
 
586dcd2
6088e9d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import re
import streamlit as st
from PIL import Image
from transformers import pipeline
from pdfminer.high_level import extract_text

# Load and cache the OCR model once at startup.
@st.cache_resource(show_spinner=False)
def load_ocr_pipeline():
    """
    Construct the image-to-text pipeline used for OCR.

    The function is wrapped in ``st.cache_resource`` (with the spinner
    disabled). If construction raises, the error is shown through
    ``st.error`` and ``st.stop()`` is called.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        "image-to-text" task.
    """
    try:
        # Swap the model identifier here to use a different checkpoint.
        return pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
    except Exception as load_error:
        st.error(f"Error loading model: {load_error}")
        st.stop()

# Module-level handle to the OCR pipeline; extract_text_from_file reads this
# global when it processes image uploads.
ocr_pipeline = load_ocr_pipeline()
# Reached only after load_ocr_pipeline() has returned.
st.write("Model loaded successfully!")

#####################################
# Text Extraction Function
#####################################
def extract_text_from_file(file_obj):
    """
    Extract the text content of an uploaded resume file.

    Files whose name ends in ".pdf" (case-insensitive) are parsed with
    pdfminer's ``extract_text``; every other file is opened as an image with
    PIL and passed through the module-level ``ocr_pipeline``.

    Args:
        file_obj: A file-like object exposing a ``name`` attribute
            (presumably a Streamlit UploadedFile -- confirm against callers).

    Returns:
        str: The extracted text; "" when the OCR output carries no text
        field; or a message starting with "Error processing PDF:" /
        "Error processing image:" when extraction raised an exception.
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()

    if file_extension == ".pdf":
        try:
            # Use pdfminer.six to extract text from PDF files.
            return extract_text(file_obj)
        except Exception as e:
            return f"Error processing PDF: {e}"

    try:
        img = Image.open(file_obj)
        result = ocr_pipeline(img)
    except Exception as e:
        return f"Error processing image: {e}"

    # The transformers image-to-text pipeline reports its output under
    # "generated_text"; "text" is still accepted so any pipeline that uses
    # that key keeps working. An empty result list yields "" instead of an
    # IndexError-derived error message.
    if isinstance(result, list) and result and isinstance(result[0], dict):
        first_entry = result[0]
        for key in ("generated_text", "text"):
            value = first_entry.get(key)
            if isinstance(value, str):
                return value
    return ""

#####################################
# Resume Information Extraction Functions
#####################################
def extract_basic_resume_info(text):
    """
    Extract basic resume information from free-form resume text.

    The following fields are looked up with regular expressions:
    - Name
    - Age
    - Job Experience
    - Skills
    - Expected Industry/Direction

    Args:
        text: The resume text to scan. Empty or None input yields a result
            with every field left as None.

    Returns:
        dict: Keys "Name", "Age", "Job Experience", "Skills" and
        "Expected Industry/Direction". Each value is None when nothing was
        found; "Skills" is a list of strings, the others are strings.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }

    # Nothing to parse (e.g. OCR produced no text).
    if not text:
        return info

    # Extract Name: trigger words like 'Name:'.
    # \b keeps labels such as "Username:" from matching, and the captured
    # value is limited to letters/spaces/tabs so it cannot run past the end
    # of the line into the next field.
    name_match = re.search(r"\b[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)", text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Fallback: first run of capitalized words on a single line.
        fallback_match = re.search(
            r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b", text
        )
        if fallback_match:
            info["Name"] = fallback_match.group(0)

    # Extract Age: \b prevents "Page: 12" from being read as an age.
    age_match = re.search(r"\b[Aa]ge[:\-]\s*(\d{1,2})", text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Extract Job Experience (years), e.g. "5 years of experience".
    exp_match = re.search(
        r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE
    )
    if exp_match:
        info["Job Experience"] = exp_match.group(1) + " years"
    else:
        # Otherwise take the rest of an "Experience:" / "Background:" line.
        exp_line = re.search(
            r"(Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE
        )
        if exp_line:
            info["Job Experience"] = exp_line.group(2).strip()

    # Extract Skills (e.g., "Skills: Python, Java, SQL")
    skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
    if skills_match:
        skills_text = skills_match.group(1)
        info["Skills"] = [
            s.strip() for s in re.split(r",|\n", skills_text) if s.strip()
        ]

    # Extract Expected Industry/Direction (e.g., "Interest: Software Development")
    industry_match = re.search(
        r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE
    )
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(2).strip()

    return info

#####################################
# Main Resume Processing Logic
#####################################
def process_resume(file_obj):
    """
    Run text extraction and field parsing for one uploaded resume.

    Args:
        file_obj: The uploaded file, or None when nothing was uploaded.

    Returns:
        tuple: ``(resume_text, resume_info)`` as produced by
        extract_text_from_file and extract_basic_resume_info, or
        ``(None, None)`` when ``file_obj`` is None.
    """
    if file_obj is not None:
        # Pull the raw text first (PDF or image), then parse fields from it.
        raw_text = extract_text_from_file(file_obj)
        return raw_text, extract_basic_resume_info(raw_text)
    return None, None

#####################################
# Streamlit Interface
#####################################
# Page header and short usage instructions.
st.title("Resume Extraction and Basic Info Parsing")
st.markdown("""
Upload a resume file in PDF or image format (PNG, JPG, JPEG) to extract text and candidate information.
""")

# Only the extensions handled by extract_text_from_file are accepted.
uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])

if st.button("Extract Info"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info = process_resume(uploaded_file)
        st.subheader("Extracted Resume Text")
        # Widgets should not be given an empty label; pass a real one and
        # hide it, since the subheader above already titles this area.
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )
        st.subheader("Parsed Basic Resume Information")
        st.json(resume_info)