CR7CAD committed on
Commit
8e1d297
·
verified ·
1 Parent(s): fda9c54

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +164 -68
app.py CHANGED
@@ -1,71 +1,167 @@
1
  import os
2
- from flask import Flask, request, jsonify
3
- from werkzeug.utils import secure_filename
4
- from transformers import pipeline
5
- from pdf2image import convert_from_path
6
- from PIL import Image
7
-
8
# Flask application object that the upload route is registered on.
app = Flask(__name__)

# Folder that receives uploaded files; it is created at import time and
# also published through the Flask config.
UPLOAD_FOLDER = 'uploads'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Accepted upload extensions (compared lower-case, without the dot).
ALLOWED_EXTENSIONS = {'pdf', 'jpeg', 'jpg', 'png'}

# TrOCR image-to-text pipeline, built once when the module is imported.
ocr_pipeline = pipeline(task="image-to-text", model="microsoft/trocr-small-printed")
21
-
22
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared
    case-insensitively. A name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
25
-
26
def extract_text_from_image(image_path):
    """Extract text from a single image using TrOCR.

    Args:
        image_path: Either something ``PIL.Image.open`` accepts (a path or a
            file object) or an already loaded ``PIL.Image.Image``. Accepting a
            loaded image lets callers that render PDF pages in memory reuse
            this function.

    Returns:
        The ``generated_text`` of the first pipeline result, or an empty
        string when the pipeline returns nothing.
    """
    if isinstance(image_path, Image.Image):
        image = image_path.convert("RGB")
    else:
        # convert() returns a fully loaded copy, so the underlying file can be
        # closed as soon as the conversion is done.
        with Image.open(image_path) as opened:
            image = opened.convert("RGB")

    results = ocr_pipeline(image)
    if not results:
        return ""
    return results[0]['generated_text']
31
-
32
def extract_text_from_pdf(pdf_path):
    """Convert a PDF to page images and extract text from each page.

    ``convert_from_path`` yields in-memory PIL images rather than file paths,
    so each page is handed to the OCR pipeline directly instead of being
    reopened from disk.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines, with surrounding
        whitespace stripped.
    """
    page_texts = []
    for page in convert_from_path(pdf_path):
        results = ocr_pipeline(page.convert("RGB"))
        # Pages for which the pipeline returns nothing contribute an empty line,
        # matching the per-page "text + newline" accumulation.
        page_texts.append(results[0]['generated_text'] if results else "")

    return "\n".join(page_texts).strip()
42
 
43
@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle file upload and text extraction.

    Expects a multipart form field named ``file``. The upload is written to
    the configured upload folder, run through OCR, and removed again before
    the response is returned.

    Returns:
        A JSON response ``{"extracted_text": ...}`` on success, or a JSON
        ``{"error": ...}`` body with status 400 when the file is missing,
        unnamed, or of a disallowed type.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No file selected"}), 400

    invalid_type = {"error": "Invalid file type. Allowed: PNG, JPG, JPEG, PDF."}
    if not allowed_file(file.filename):
        return jsonify(invalid_type), 400

    # secure_filename() can strip characters (even the whole name), so the
    # sanitized name is validated again before it is used as a path.
    filename = secure_filename(file.filename)
    if not allowed_file(filename):
        return jsonify(invalid_type), 400

    # Save uploaded file
    file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(file_path)

    try:
        # Process image or PDF
        if filename.lower().endswith(".pdf"):
            extracted_text = extract_text_from_pdf(file_path)
        else:
            extracted_text = extract_text_from_image(file_path)
    finally:
        # The upload is only needed for OCR; remove it so the folder does not
        # grow with every request, even when extraction fails.
        if os.path.exists(file_path):
            os.remove(file_path)

    return jsonify({"extracted_text": extracted_text})
68
-
69
# Run Flask App
if __name__ == '__main__':
    # The server listens on every interface, so the interactive debugger is
    # only enabled when explicitly requested through FLASK_DEBUG.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes"}
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import re
3
+ import torch # Explicit import if you plan to use torch methods directly
4
+ import tempfile
5
+ from io import BytesIO
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ import streamlit as st
8
+ from PIL import Image
9
+ from transformers import pipeline
10
+ from pdf2image import convert_from_bytes
11
+
12
+ #####################################
13
+ # Load the OCR Pipeline (Uses Torch)
14
+ #####################################
15
@st.cache_resource
def _load_ocr_pipeline():
    """Build the image-to-text pipeline; cached so script reruns reuse it."""
    return pipeline("image-to-text", model="alibaba-damo/mgp-str-base")


try:
    ocr_pipeline = _load_ocr_pipeline()
    st.write("Model loaded successfully!")
except Exception as e:
    # Without a model nothing below can work, so report the failure and halt
    # the script run.
    st.error(f"Error loading model: {e}")
    st.stop()
21
+
22
+ #####################################
23
+ # Utility: Convert PDF to Images
24
+ #####################################
25
def convert_pdf_to_images(pdf_bytes):
    """Render the PDF given as raw bytes into a list of page images.

    Any conversion failure is shown in the Streamlit UI via ``st.error`` and
    an empty list is returned instead of raising.
    """
    try:
        pages = convert_from_bytes(pdf_bytes)
    except Exception as exc:
        st.error(f"PDF conversion error: {exc}")
        return []
    return pages
32
+
33
+ #####################################
34
+ # Pipeline: Extract Text with OCR Pipeline
35
+ #####################################
36
def _ocr_result_text(result):
    """Return the recognised string from a pipeline result, or "" if absent."""
    if not isinstance(result, list) or not result:
        return ""
    first = result[0]
    if not isinstance(first, dict):
        return ""
    # Image-to-text pipelines report their output under "generated_text";
    # "text" is still accepted as a fallback for outputs that use that key.
    return first.get("generated_text") or first.get("text") or ""


def extract_text_from_file(file_obj):
    """Run OCR over an uploaded PDF or image and return the recognised text.

    Args:
        file_obj: File-like object with a ``name`` attribute; the extension
            of ``name`` decides whether it is treated as a PDF or an image.

    Returns:
        For PDFs, the text of every page that produced output, each followed
        by a newline. For images, the recognised text, or a message starting
        with "Error processing image:" when opening or OCR fails.
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()

    if file_extension == ".pdf":
        pages = convert_pdf_to_images(file_obj.read())
        page_texts = (_ocr_result_text(ocr_pipeline(page)) for page in pages)
        return "".join(f"{text}\n" for text in page_texts if text)

    try:
        img = Image.open(file_obj)
        return _ocr_result_text(ocr_pipeline(img))
    except Exception as e:
        return f"Error processing image: {e}"
57
+
58
+ #####################################
59
+ # Information Extraction Functions
60
+ #####################################
61
# Labelled name, e.g. "Name: John Doe". The label is anchored on a word
# boundary (so "Username:" is ignored) and the captured value stays on one line.
_RESUME_NAME_RE = re.compile(r"\b[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)")
# Fallback: first run of two or more capitalised words on a single line.
_RESUME_NAME_FALLBACK_RE = re.compile(r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b")
# "Age: 30"; the word boundaries keep "Page: 12" and partial numbers out.
_RESUME_AGE_RE = re.compile(r"\b[Aa]ge[:\-]\s*(\d{1,3})\b")
_RESUME_EXP_YEARS_RE = re.compile(
    r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", re.IGNORECASE
)
_RESUME_EXP_LINE_RE = re.compile(r"(Experience|Background)[:\-]\s*(.*)", re.IGNORECASE)
_RESUME_SKILLS_RE = re.compile(r"[Ss]kills[:\-]\s*(.+)")
_RESUME_INDUSTRY_RE = re.compile(
    r"(Industry|Interest|Direction)[:\-]\s*(.+)", re.IGNORECASE
)


def extract_resume_info(text):
    """Pull basic candidate fields out of free-form resume text.

    Args:
        text: Resume text (for example OCR output). ``None`` is treated as
            empty text.

    Returns:
        A dict with the keys "Name", "Age", "Job Experience", "Skills" and
        "Expected Industry/Direction". Fields that cannot be found are
        ``None``; "Skills" is a list of strings when present, all other found
        values are strings.
    """
    text = text or ""
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }

    # Extract name, e.g., "Name: John Doe"
    name_match = _RESUME_NAME_RE.search(text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        fallback = _RESUME_NAME_FALLBACK_RE.search(text)
        if fallback:
            info["Name"] = fallback.group(0)

    # Extract age
    age_match = _RESUME_AGE_RE.search(text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Extract job experience: prefer an explicit year count, otherwise take
    # the remainder of an "Experience:" / "Background:" line.
    years_match = _RESUME_EXP_YEARS_RE.search(text)
    if years_match:
        info["Job Experience"] = years_match.group(1) + " years"
    else:
        line_match = _RESUME_EXP_LINE_RE.search(text)
        if line_match:
            info["Job Experience"] = line_match.group(2).strip()

    # Extract skills as a list of comma-separated entries
    skills_match = _RESUME_SKILLS_RE.search(text)
    if skills_match:
        info["Skills"] = [
            skill.strip() for skill in skills_match.group(1).split(",") if skill.strip()
        ]

    # Extract expected industry/direction
    industry_match = _RESUME_INDUSTRY_RE.search(text)
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(2).strip()

    return info
106
+
107
+ #####################################
108
+ # Candidate Comparison Function
109
+ #####################################
110
# Requirement text is tokenised on whitespace, commas and semicolons so that
# "Technology, Python" yields "technology" and "python" (no trailing comma).
_REQ_WORD_SPLIT_RE = re.compile(r"[\s,;]+")
# Comma/semicolon/newline separated entries, kept whole for multi-word skills.
_REQ_PHRASE_SPLIT_RE = re.compile(r"[,;\n]+")


def _requirement_words(text):
    """Return the lower-cased word tokens of *text* as a set."""
    return {word for word in _REQ_WORD_SPLIT_RE.split(text.lower()) if word}


def compare_candidate_with_company(resume_info, company_requirements):
    """Decide whether a parsed resume overlaps with the company requirements.

    Args:
        resume_info: Dict of parsed resume fields. Only
            "Expected Industry/Direction" (string or ``None``) and "Skills"
            (list of strings or ``None``) are read; either may be missing.
        company_requirements: Free-text requirements, typically a
            comma-separated list. ``None`` is treated as empty.

    Returns:
        A dict with "Common Keywords" (sorted list of industry words shared
        with the requirements) and "Suitable" ("Yes" when at least one
        industry word or one skill matches, otherwise "No").
    """
    requirements = company_requirements or ""
    # The key may be present with a None value, so a .get() default alone is
    # not enough to guarantee a string here.
    candidate_industry = resume_info.get("Expected Industry/Direction") or ""

    company_keywords = _requirement_words(requirements)
    common = _requirement_words(candidate_industry) & company_keywords
    suitable = bool(common)

    # Check skills matching if available
    skills = resume_info.get("Skills")
    if skills:
        company_phrases = {
            phrase.strip()
            for phrase in _REQ_PHRASE_SPLIT_RE.split(requirements.lower())
            if phrase.strip()
        }
        accepted = company_keywords | company_phrases
        if any(skill.strip().lower() in accepted for skill in skills):
            suitable = True

    return {
        "Common Keywords": sorted(common),
        "Suitable": "Yes" if suitable else "No",
    }
129
+
130
+ #####################################
131
+ # Main Processing Logic
132
+ #####################################
133
def process_resume(file_obj, company_requirements):
    """Run the full pipeline for one upload: OCR, field parsing, comparison.

    Returns a ``(resume_text, resume_info, comparison)`` tuple, or three
    ``None`` values when no file was supplied.
    """
    if file_obj is None:
        return None, None, None

    text = extract_text_from_file(file_obj)
    parsed = extract_resume_info(text)
    verdict = compare_candidate_with_company(parsed, company_requirements)
    return text, parsed, verdict
141
+
142
+ #####################################
143
+ # Streamlit UI
144
+ #####################################
145
st.title("Resume Extraction and Candidate Matching")
st.markdown("""
This app uses an image-to-text pipeline (powered by `alibaba-damo/mgp-str-base` and PyTorch) to
extract details from uploaded resume files (PDF or image formats). It then parses critical candidate
information and compares it against company requirements.
""")

uploaded_file = st.file_uploader("Upload Resume (PDF or Image)", type=["pdf", "png", "jpg", "jpeg"])
company_requirements = st.text_input(
    "Enter Company Requirements/Criteria (e.g., industry, skills)",
    placeholder="Example: Technology, Python, Software Development",
)

if st.button("Process Resume"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info, comparison = process_resume(uploaded_file, company_requirements)
        st.subheader("Extracted Resume Text")
        # Widgets need a non-empty label; it is hidden because the subheader
        # above already titles this area.
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )
        st.subheader("Parsed Resume Information")
        st.json(resume_info)
        st.subheader("Comparison with Company Requirements")
        st.json(comparison)