Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,71 +1,167 @@
|
|
1 |
import os
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
from
|
6 |
-
from PIL import Image
|
7 |
-
|
8 |
-
# Initialize Flask app
|
9 |
-
app = Flask(__name__)
|
10 |
-
|
11 |
-
# Set upload folder
|
12 |
-
UPLOAD_FOLDER = 'uploads'
|
13 |
-
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
14 |
-
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
15 |
-
|
16 |
-
# Allowed file extensions
|
17 |
-
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'pdf'}
|
18 |
-
|
19 |
-
# Load TrOCR Model
|
20 |
-
ocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-small-printed")
|
21 |
-
|
22 |
-
def allowed_file(filename):
|
23 |
-
"""Check if the file has an allowed extension."""
|
24 |
-
return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
|
25 |
-
|
26 |
-
def extract_text_from_image(image_path):
|
27 |
-
"""Extract text from a single image using TrOCR."""
|
28 |
-
image = Image.open(image_path).convert("RGB")
|
29 |
-
text = ocr_pipeline(image)[0]['generated_text']
|
30 |
-
return text
|
31 |
-
|
32 |
-
def extract_text_from_pdf(pdf_path):
|
33 |
-
"""Convert PDF to images and extract text from each page."""
|
34 |
-
images = convert_from_path(pdf_path)
|
35 |
-
extracted_text = ""
|
36 |
-
|
37 |
-
for img in images:
|
38 |
-
text = extract_text_from_image(img)
|
39 |
-
extracted_text += text + "\n"
|
40 |
-
|
41 |
-
return extracted_text.strip()
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
else:
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
import re
|
3 |
+
import torch # Explicit import if you plan to use torch methods directly
|
4 |
+
import tempfile
|
5 |
+
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
import streamlit as st
|
8 |
+
from PIL import Image
|
9 |
+
from transformers import pipeline
|
10 |
+
from pdf2image import convert_from_bytes
|
11 |
+
|
12 |
+
#####################################
|
13 |
+
# Load the OCR Pipeline (Uses Torch)
|
14 |
+
#####################################
|
15 |
+
# Load the OCR model once at module import; the first Streamlit page load
# may block while the model weights download.
try:
    # NOTE(review): mgp-str-base is a scene-text-recognition model --
    # presumably compatible with the "image-to-text" task; confirm the
    # task/model pairing against the transformers pipeline docs.
    ocr_pipeline = pipeline("image-to-text", model="alibaba-damo/mgp-str-base")
    st.write("Model loaded successfully!")
except Exception as e:
    # Without a model the app cannot do anything useful: report the
    # failure in the UI and halt this script run.
    st.error(f"Error loading model: {e}")
    st.stop()
|
21 |
+
|
22 |
+
#####################################
|
23 |
+
# Utility: Convert PDF to Images
|
24 |
+
#####################################
|
25 |
+
def convert_pdf_to_images(pdf_bytes):
    """Rasterize a PDF given as raw bytes into a list of page images.

    On conversion failure, reports the error in the Streamlit UI and
    returns an empty list instead of raising.
    """
    try:
        return convert_from_bytes(pdf_bytes)
    except Exception as e:
        st.error(f"PDF conversion error: {e}")
        return []
|
32 |
+
|
33 |
+
#####################################
|
34 |
+
# Pipeline: Extract Text with OCR Pipeline
|
35 |
+
#####################################
|
36 |
+
def extract_text_from_file(file_obj):
    """Run OCR over an uploaded resume file and return the extracted text.

    Args:
        file_obj: Uploaded-file object (e.g. a Streamlit ``UploadedFile``)
            exposing ``.name`` and file-like ``.read()`` access.

    Returns:
        str: Recognized text (one line per PDF page), an empty string when
        nothing was recognized, or an error message when image processing
        fails.
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()
    full_text = ""

    if file_extension == ".pdf":
        # PDFs must be rasterized page-by-page before OCR.
        file_bytes = file_obj.read()
        images = convert_pdf_to_images(file_bytes)
        for img in images:
            result = ocr_pipeline(img)
            # BUG FIX: HF image-to-text pipelines return
            # [{"generated_text": ...}] -- the previous check for a "text"
            # key never matched, so every page's OCR output was dropped.
            # Keep "text" as a fallback for models that emit that key.
            if isinstance(result, list) and result:
                page_text = result[0].get("generated_text") or result[0].get("text")
                if page_text:
                    full_text += page_text + "\n"
    else:
        try:
            img = Image.open(file_obj)
            result = ocr_pipeline(img)
            if isinstance(result, list) and result:
                full_text = result[0].get("generated_text") or result[0].get("text") or ""
        except Exception as e:
            # Surface the failure as text so the UI still shows something.
            full_text = f"Error processing image: {e}"

    return full_text
|
57 |
+
|
58 |
+
#####################################
|
59 |
+
# Information Extraction Functions
|
60 |
+
#####################################
|
61 |
+
def extract_resume_info(text):
    """Parse candidate details out of raw resume text with regex heuristics.

    Args:
        text: Plain text extracted from a resume.

    Returns:
        dict: Keys "Name", "Age", "Job Experience", "Skills",
        "Expected Industry/Direction". Values are parsed strings
        ("Skills" is a list of strings) or None when a field was not found.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }

    # Extract name, e.g. "Name: John Doe".
    # BUG FIX: the previous class [A-Za-z\s]+ matched across newlines, so
    # "Name: John Doe\nAge: 29" captured "John Doe\nAge". Restricting the
    # class to letters and spaces keeps the match on a single line.
    name_match = re.search(r"[Nn]ame[:\-]\s*([A-Za-z][A-Za-z ]*)", text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Fallback: first "Firstname Lastname"-shaped capitalized pair.
        potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', text)
        if potential_names:
            info["Name"] = potential_names[0]

    # Extract age, e.g. "Age: 29" (capped at two digits).
    age_match = re.search(r"[Aa]ge[:\-]\s*(\d{1,2})", text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Extract job experience as a year count ("5 years of experience")...
    exp_match = re.search(r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE)
    if exp_match:
        info["Job Experience"] = exp_match.group(1) + " years"
    else:
        # ...or fall back to the free text after an "Experience:" heading.
        exp_line = re.search(r"(Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE)
        if exp_line:
            info["Job Experience"] = exp_line.group(2).strip()

    # Extract skills as a comma- or newline-separated list.
    skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = [s.strip() for s in re.split(r",|\n", skills_text) if s.strip()]
        info["Skills"] = skills

    # Extract expected industry/direction.
    industry_match = re.search(r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE)
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(2).strip()

    return info
|
106 |
+
|
107 |
+
#####################################
|
108 |
+
# Candidate Comparison Function
|
109 |
+
#####################################
|
110 |
+
def compare_candidate_with_company(resume_info, company_requirements):
    """Compare parsed resume info against free-text company requirements.

    A candidate is "suitable" when the industry/direction text shares at
    least one whitespace-delimited keyword with the requirements, or when
    at least one listed skill appears in the requirements.

    Args:
        resume_info: Dict produced by ``extract_resume_info`` (field values
            may be None when a field was not found).
        company_requirements: Free-text requirements string.

    Returns:
        dict: {"Common Keywords": list of shared industry keywords,
               "Suitable": "Yes" or "No"}.
    """
    # BUG FIX: ``or ""`` guards against the None defaults that
    # extract_resume_info uses for missing fields -- dict.get's default
    # only applies when the key is ABSENT, not when its value is None,
    # so the old code crashed on None.lower().
    candidate_industry = resume_info.get("Expected Industry/Direction") or ""
    candidate_keywords = set(candidate_industry.lower().split())
    company_keywords = set(company_requirements.lower().split())
    common = candidate_keywords.intersection(company_keywords)
    suitable = len(common) > 0

    # A single overlapping skill also qualifies the candidate.
    if resume_info.get("Skills"):
        candidate_skills = {skill.lower() for skill in resume_info["Skills"]}
        company_skills = set(company_requirements.lower().split())
        common_skills = candidate_skills.intersection(company_skills)
        if len(common_skills) >= 1:
            suitable = True

    return {
        "Common Keywords": list(common) if common else [],
        "Suitable": "Yes" if suitable else "No"
    }
|
129 |
+
|
130 |
+
#####################################
|
131 |
+
# Main Processing Logic
|
132 |
+
#####################################
|
133 |
+
def process_resume(file_obj, company_requirements):
    """End-to-end pipeline: OCR the upload, parse it, and score the match.

    Returns:
        tuple: (raw_text, parsed_info, comparison), or (None, None, None)
        when no file was supplied.
    """
    if file_obj is None:
        return None, None, None

    text = extract_text_from_file(file_obj)
    parsed = extract_resume_info(text)
    verdict = compare_candidate_with_company(parsed, company_requirements)
    return text, parsed, verdict
|
141 |
+
|
142 |
+
#####################################
|
143 |
+
# Streamlit UI
|
144 |
+
#####################################
|
145 |
+
# ---- Streamlit page layout (runs top-to-bottom on every interaction) ----
st.title("Resume Extraction and Candidate Matching")
st.markdown("""
This app uses an image-to-text pipeline (powered by `alibaba-damo/mgp-str-base` and PyTorch) to
extract details from uploaded resume files (PDF or image formats). It then parses critical candidate
information and compares it against company requirements.
""")

# Input widgets: a resume file and free-text company criteria.
uploaded_file = st.file_uploader("Upload Resume (PDF or Image)", type=["pdf", "png", "jpg", "jpeg"])
company_requirements = st.text_input("Enter Company Requirements/Criteria (e.g., industry, skills)",
                                     placeholder="Example: Technology, Python, Software Development")

if st.button("Process Resume"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        # OCR + parsing can be slow; show a spinner while it runs.
        with st.spinner("Processing..."):
            resume_text, resume_info, comparison = process_resume(uploaded_file, company_requirements)
        # Render the three result sections: raw text, parsed fields, verdict.
        st.subheader("Extracted Resume Text")
        st.text_area("", resume_text, height=200)
        st.subheader("Parsed Resume Information")
        st.json(resume_info)
        st.subheader("Comparison with Company Requirements")
        st.json(comparison)
|