File size: 4,515 Bytes
cf8a522
8e1d297
 
 
 
9753cc9
8e1d297
9753cc9
586dcd2
 
 
 
 
 
 
 
 
 
 
 
8e1d297
 
9753cc9
8e1d297
 
 
9753cc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e1d297
 
 
 
 
586dcd2
 
8e1d297
 
 
 
 
 
 
 
6088e9d
8e1d297
 
 
 
6088e9d
8e1d297
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6088e9d
8e1d297
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6088e9d
8e1d297
6088e9d
8e1d297
9753cc9
8e1d297
9753cc9
586dcd2
6088e9d
8e1d297
 
586dcd2
8e1d297
586dcd2
8e1d297
9753cc9
8e1d297
 
9753cc9
8e1d297
6088e9d
8e1d297
9753cc9
8e1d297
 
6088e9d
8e1d297
 
586dcd2
6088e9d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import re
import streamlit as st
from PIL import Image
from transformers import pipeline
from pdfminer.high_level import extract_text

# Load and cache the OCR model once at startup
@st.cache_resource(show_spinner=False)
def load_ocr_pipeline():
    """Build the image-to-text pipeline used for OCR on uploaded images.

    The function is wrapped in ``st.cache_resource`` so the model is
    constructed once and reused instead of being rebuilt on each call.

    Returns:
        The ``transformers`` pipeline object for the "image-to-text" task.

    If the pipeline cannot be created, the error is shown in the Streamlit
    page and the script run is stopped via ``st.stop()``.
    """
    # NOTE: requires a reasonably recent transformers release (>=4.x).
    model_id = "YouLiXiya/tinyllava-v1.0-1.1b-hf"
    try:
        return pipeline("image-to-text", model=model_id)
    except Exception as load_error:
        # Without a model nothing else on the page can work, so report the
        # failure to the user and halt the script here.
        st.error(f"Error loading model: {load_error}")
        st.stop()

# Module-level pipeline shared by extract_text_from_file; load_ocr_pipeline is
# decorated with st.cache_resource, so this call reuses the cached model.
ocr_pipeline = load_ocr_pipeline()
# Only reached when loading succeeded: on failure load_ocr_pipeline calls
# st.stop() before returning.
st.write("Model loaded successfully!")

#####################################
# Extract Text from File Function
#####################################
def extract_text_from_file(file_obj):
    """Return the text content of an uploaded resume file.

    Files whose name ends in ``.pdf`` (case-insensitive) are read with
    pdfminer's ``extract_text``. Any other file is opened as an image with
    PIL and passed to the module-level ``ocr_pipeline``.

    Args:
        file_obj: Binary file-like object exposing a ``name`` attribute
            (for example the object returned by ``st.file_uploader``).

    Returns:
        str: The extracted text. An empty string is returned when the OCR
        pipeline produces no usable output. If extraction raises, a message
        of the form ``"Error processing PDF: ..."`` or
        ``"Error processing image: ..."`` is returned instead of raising.
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()

    if file_extension == ".pdf":
        try:
            # Use pdfminer.six to extract text from PDF files.
            return extract_text(file_obj)
        except Exception as e:
            return f"Error processing PDF: {e}"

    try:
        img = Image.open(file_obj)
        result = ocr_pipeline(img)
    except Exception as e:
        return f"Error processing image: {e}"

    # The image-to-text pipeline returns a list of dicts keyed by
    # "generated_text"; the legacy "text" key is still accepted as a fallback.
    # An empty or unexpected result yields "" rather than an error message.
    first = result[0] if isinstance(result, list) and result else None
    if isinstance(first, dict):
        for key in ("generated_text", "text"):
            if key in first:
                return first[key]
    return ""

#####################################
# Information Extraction Functions
#####################################
def extract_basic_resume_info(text):
    """Extract basic resume info: Name, Age, Job Experience, Skills, Expected Industry/Direction.

    All fields are found with simple regular-expression heuristics over the
    raw resume text, so results are best-effort.

    Args:
        text: Resume text to scan. ``None`` or an empty string is accepted
            and yields a result with every field unset.

    Returns:
        dict: Keys ``"Name"``, ``"Age"``, ``"Job Experience"``, ``"Skills"``
        and ``"Expected Industry/Direction"``. ``"Skills"`` is a list of
        strings when found; the other values are strings. Any field that
        could not be located is ``None``.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }
    if not text:
        return info

    # Labelled name (e.g., "Name: John Doe"). The word boundary keeps labels
    # such as "Username:" from matching, and the captured value is limited to
    # letters, spaces and tabs so it cannot run on into the following line.
    name_match = re.search(r"\b[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)", text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Heuristic: first run of two or more capitalized words on one line.
        heuristic_match = re.search(
            r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b", text
        )
        if heuristic_match:
            info["Name"] = heuristic_match.group(0)

    # Age: one or two digits after an "Age" label. The boundary rejects
    # "Page: 12"; the lookahead rejects longer numbers such as "Age: 1990".
    age_match = re.search(r"\b[Aa]ge[:\-]\s*(\d{1,2})(?!\d)", text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Job experience: prefer an explicit "<n> years of experience" phrase,
    # otherwise fall back to whatever follows an Experience/Background label.
    exp_match = re.search(
        r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE
    )
    if exp_match:
        info["Job Experience"] = exp_match.group(1) + " years"
    else:
        exp_line = re.search(
            r"(Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE
        )
        if exp_line:
            info["Job Experience"] = exp_line.group(2).strip()

    # Skills (e.g., "Skills: Python, Java, SQL"): the capture is a single
    # line, so splitting on commas is sufficient.
    skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
    if skills_match:
        info["Skills"] = [
            skill.strip()
            for skill in skills_match.group(1).split(",")
            if skill.strip()
        ]

    # Expected industry/direction: rest of the line after the label.
    industry_match = re.search(
        r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE
    )
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(2).strip()

    return info

#####################################
# Main Processing Logic
#####################################
def process_resume(file_obj):
    """Run text extraction and heuristic parsing for one uploaded resume.

    Args:
        file_obj: The uploaded file, or ``None`` when nothing was uploaded.

    Returns:
        tuple: ``(resume_text, resume_info)`` where ``resume_text`` comes
        from ``extract_text_from_file`` and ``resume_info`` from
        ``extract_basic_resume_info``; ``(None, None)`` when ``file_obj``
        is ``None``.
    """
    if file_obj is not None:
        # Pull the raw text first (PDF or image), then parse it heuristically.
        raw_text = extract_text_from_file(file_obj)
        parsed_info = extract_basic_resume_info(raw_text)
        return raw_text, parsed_info
    return None, None

#####################################
# Streamlit Interface
#####################################
# Page header and a short usage hint.
st.title("Resume Extraction and Basic Info Parsing")
st.markdown("""
Upload a resume file (PDF, PNG, JPG, or JPEG) to extract basic text and candidate information.
""")

# Only the file types handled by the extraction step are offered.
uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])

if st.button("Extract Info"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info = process_resume(uploaded_file)
        st.subheader("Extracted Resume Text")
        # Widgets should not be given an empty label; the subheader above
        # already titles this area, so supply a real label and hide it.
        st.text_area(
            "Extracted resume text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )
        st.subheader("Parsed Basic Resume Information")
        st.json(resume_info)