Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -5,11 +5,11 @@ from PIL import Image
|
|
5 |
from transformers import pipeline
|
6 |
from pdfminer.high_level import extract_text
|
7 |
|
8 |
-
# Load and cache the OCR model once at startup
|
9 |
@st.cache_resource(show_spinner=False)
|
10 |
def load_ocr_pipeline():
|
11 |
try:
|
12 |
-
#
|
13 |
ocr_pipe = pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
|
14 |
return ocr_pipe
|
15 |
except Exception as e:
|
@@ -20,7 +20,7 @@ ocr_pipeline = load_ocr_pipeline()
|
|
20 |
st.write("Model loaded successfully!")
|
21 |
|
22 |
#####################################
|
23 |
-
#
|
24 |
#####################################
|
25 |
def extract_text_from_file(file_obj):
|
26 |
full_text = ""
|
@@ -43,10 +43,17 @@ def extract_text_from_file(file_obj):
|
|
43 |
return full_text
|
44 |
|
45 |
#####################################
|
46 |
-
# Information Extraction Functions
|
47 |
#####################################
|
48 |
def extract_basic_resume_info(text):
|
49 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
info = {
|
51 |
"Name": None,
|
52 |
"Age": None,
|
@@ -55,22 +62,22 @@ def extract_basic_resume_info(text):
|
|
55 |
"Expected Industry/Direction": None,
|
56 |
}
|
57 |
|
58 |
-
# Extract
|
59 |
name_match = re.search(r"[Nn]ame[:\-]\s*([A-Za-z\s]+)", text)
|
60 |
if name_match:
|
61 |
info["Name"] = name_match.group(1).strip()
|
62 |
else:
|
63 |
-
#
|
64 |
potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', text)
|
65 |
if potential_names:
|
66 |
info["Name"] = potential_names[0]
|
67 |
|
68 |
-
# Extract
|
69 |
age_match = re.search(r"[Aa]ge[:\-]\s*(\d{1,2})", text)
|
70 |
if age_match:
|
71 |
info["Age"] = age_match.group(1)
|
72 |
|
73 |
-
# Extract
|
74 |
exp_match = re.search(r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE)
|
75 |
if exp_match:
|
76 |
info["Job Experience"] = exp_match.group(1) + " years"
|
@@ -79,14 +86,14 @@ def extract_basic_resume_info(text):
|
|
79 |
if exp_line:
|
80 |
info["Job Experience"] = exp_line.group(2).strip()
|
81 |
|
82 |
-
# Extract
|
83 |
skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
|
84 |
if skills_match:
|
85 |
skills_text = skills_match.group(1)
|
86 |
skills = [s.strip() for s in re.split(r",|\n", skills_text) if s.strip()]
|
87 |
info["Skills"] = skills
|
88 |
|
89 |
-
# Extract
|
90 |
industry_match = re.search(r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE)
|
91 |
if industry_match:
|
92 |
info["Expected Industry/Direction"] = industry_match.group(2).strip()
|
@@ -94,15 +101,15 @@ def extract_basic_resume_info(text):
|
|
94 |
return info
|
95 |
|
96 |
#####################################
|
97 |
-
# Main Processing Logic
|
98 |
#####################################
|
99 |
def process_resume(file_obj):
|
100 |
if file_obj is None:
|
101 |
return None, None
|
102 |
|
103 |
-
# Extract text based on file type (PDF or image)
|
104 |
resume_text = extract_text_from_file(file_obj)
|
105 |
-
# Parse basic resume
|
106 |
resume_info = extract_basic_resume_info(resume_text)
|
107 |
return resume_text, resume_info
|
108 |
|
@@ -111,7 +118,7 @@ def process_resume(file_obj):
|
|
111 |
#####################################
|
112 |
st.title("Resume Extraction and Basic Info Parsing")
|
113 |
st.markdown("""
|
114 |
-
Upload a resume file
|
115 |
""")
|
116 |
|
117 |
uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])
|
|
|
5 |
from transformers import pipeline
|
6 |
from pdfminer.high_level import extract_text
|
7 |
|
8 |
+
# Load and cache the OCR model once at startup.
|
9 |
@st.cache_resource(show_spinner=False)
|
10 |
def load_ocr_pipeline():
|
11 |
try:
|
12 |
+
# Initialize the OCR pipeline from transformers. Change the model as needed.
|
13 |
ocr_pipe = pipeline("image-to-text", model="YouLiXiya/tinyllava-v1.0-1.1b-hf")
|
14 |
return ocr_pipe
|
15 |
except Exception as e:
|
|
|
20 |
st.write("Model loaded successfully!")
|
21 |
|
22 |
#####################################
|
23 |
+
# Text Extraction Function
|
24 |
#####################################
|
25 |
def extract_text_from_file(file_obj):
|
26 |
full_text = ""
|
|
|
43 |
return full_text
|
44 |
|
45 |
#####################################
|
46 |
+
# Resume Information Extraction Functions
|
47 |
#####################################
|
48 |
def extract_basic_resume_info(text):
|
49 |
+
"""
|
50 |
+
Extract basic resume information, such as:
|
51 |
+
- Name
|
52 |
+
- Age
|
53 |
+
- Job Experience
|
54 |
+
- Skills
|
55 |
+
- Expected Industry/Direction
|
56 |
+
"""
|
57 |
info = {
|
58 |
"Name": None,
|
59 |
"Age": None,
|
|
|
62 |
"Expected Industry/Direction": None,
|
63 |
}
|
64 |
|
65 |
+
# Extract Name: trigger words like 'Name:'
|
66 |
name_match = re.search(r"[Nn]ame[:\-]\s*([A-Za-z\s]+)", text)
|
67 |
if name_match:
|
68 |
info["Name"] = name_match.group(1).strip()
|
69 |
else:
|
70 |
+
# Fallback: heuristic for sequences of capitalized words.
|
71 |
potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', text)
|
72 |
if potential_names:
|
73 |
info["Name"] = potential_names[0]
|
74 |
|
75 |
+
# Extract Age:
|
76 |
age_match = re.search(r"[Aa]ge[:\-]\s*(\d{1,2})", text)
|
77 |
if age_match:
|
78 |
info["Age"] = age_match.group(1)
|
79 |
|
80 |
+
# Extract Job Experience (years)
|
81 |
exp_match = re.search(r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE)
|
82 |
if exp_match:
|
83 |
info["Job Experience"] = exp_match.group(1) + " years"
|
|
|
86 |
if exp_line:
|
87 |
info["Job Experience"] = exp_line.group(2).strip()
|
88 |
|
89 |
+
# Extract Skills (e.g., "Skills: Python, Java, SQL")
|
90 |
skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
|
91 |
if skills_match:
|
92 |
skills_text = skills_match.group(1)
|
93 |
skills = [s.strip() for s in re.split(r",|\n", skills_text) if s.strip()]
|
94 |
info["Skills"] = skills
|
95 |
|
96 |
+
# Extract Expected Industry/Direction (e.g., "Interest: Software Development")
|
97 |
industry_match = re.search(r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE)
|
98 |
if industry_match:
|
99 |
info["Expected Industry/Direction"] = industry_match.group(2).strip()
|
|
|
101 |
return info
|
102 |
|
103 |
#####################################
|
104 |
+
# Main Resume Processing Logic
|
105 |
#####################################
|
106 |
def process_resume(file_obj):
|
107 |
if file_obj is None:
|
108 |
return None, None
|
109 |
|
110 |
+
# Extract text based on file type (PDF or image).
|
111 |
resume_text = extract_text_from_file(file_obj)
|
112 |
+
# Parse basic resume details from the extracted text.
|
113 |
resume_info = extract_basic_resume_info(resume_text)
|
114 |
return resume_text, resume_info
|
115 |
|
|
|
118 |
#####################################
|
119 |
st.title("Resume Extraction and Basic Info Parsing")
|
120 |
st.markdown("""
|
121 |
+
Upload a resume file in PDF or image format (PNG, JPG, JPEG) to extract text and candidate information.
|
122 |
""")
|
123 |
|
124 |
uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])
|