Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,71 +1,167 @@
|
|
1 |
import os
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
from
|
6 |
-
from PIL import Image
|
7 |
-
|
8 |
-
# Initialize Flask app
|
9 |
-
app = Flask(__name__)
|
10 |
-
|
11 |
-
# Set upload folder
|
12 |
-
UPLOAD_FOLDER = 'uploads'
|
13 |
-
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
14 |
-
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
15 |
-
|
16 |
-
# Allowed file extensions
|
17 |
-
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'pdf'}
|
18 |
-
|
19 |
-
# Load TrOCR Model
|
20 |
-
ocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-small-printed")
|
21 |
-
|
22 |
-
def allowed_file(filename):
|
23 |
-
"""Check if the file has an allowed extension."""
|
24 |
-
return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
|
25 |
-
|
26 |
-
def extract_text_from_image(image_path):
|
27 |
-
"""Extract text from a single image using TrOCR."""
|
28 |
-
image = Image.open(image_path).convert("RGB")
|
29 |
-
text = ocr_pipeline(image)[0]['generated_text']
|
30 |
-
return text
|
31 |
-
|
32 |
-
def extract_text_from_pdf(pdf_path):
|
33 |
-
"""Convert PDF to images and extract text from each page."""
|
34 |
-
images = convert_from_path(pdf_path)
|
35 |
-
extracted_text = ""
|
36 |
-
|
37 |
-
for img in images:
|
38 |
-
text = extract_text_from_image(img)
|
39 |
-
extracted_text += text + "\n"
|
40 |
-
|
41 |
-
return extracted_text.strip()
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
else:
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
import re
|
3 |
+
import torch # Explicit import if you plan to use torch methods directly
|
4 |
+
import tempfile
|
5 |
+
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
import streamlit as st
|
8 |
+
from PIL import Image
|
9 |
+
from transformers import pipeline
|
10 |
+
from pdf2image import convert_from_bytes
|
11 |
+
|
12 |
+
#####################################
|
13 |
+
# Load the OCR Pipeline (Uses Torch)
|
14 |
+
#####################################
|
15 |
+
# Load the OCR model once at module import; the first Streamlit page load
# may block while the model weights download.
try:
    # NOTE(review): mgp-str-base is a scene-text-recognition model --
    # presumably compatible with the "image-to-text" task; confirm the
    # task/model pairing against the transformers pipeline docs.
    ocr_pipeline = pipeline("image-to-text", model="alibaba-damo/mgp-str-base")
    st.write("Model loaded successfully!")
except Exception as e:
    # Without a model the app cannot do anything useful: report the
    # failure in the UI and halt this script run.
    st.error(f"Error loading model: {e}")
    st.stop()
|
21 |
+
|
22 |
+
#####################################
|
23 |
+
# Utility: Convert PDF to Images
|
24 |
+
#####################################
|
25 |
+
def convert_pdf_to_images(pdf_bytes):
    """Rasterize a PDF given as raw bytes into a list of page images.

    On conversion failure, reports the error in the Streamlit UI and
    returns an empty list instead of raising.
    """
    try:
        return convert_from_bytes(pdf_bytes)
    except Exception as e:
        st.error(f"PDF conversion error: {e}")
        return []
|
32 |
+
|
33 |
+
#####################################
|
34 |
+
# Pipeline: Extract Text with OCR Pipeline
|
35 |
+
#####################################
|
36 |
+
def extract_text_from_file(file_obj):
    """Run OCR over an uploaded resume file and return the extracted text.

    Args:
        file_obj: Uploaded-file object (e.g. a Streamlit ``UploadedFile``)
            exposing ``.name`` and file-like ``.read()`` access.

    Returns:
        str: Recognized text (one line per PDF page), an empty string when
        nothing was recognized, or an error message when image processing
        fails.
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()
    full_text = ""

    if file_extension == ".pdf":
        # PDFs must be rasterized page-by-page before OCR.
        file_bytes = file_obj.read()
        images = convert_pdf_to_images(file_bytes)
        for img in images:
            result = ocr_pipeline(img)
            # BUG FIX: HF image-to-text pipelines return
            # [{"generated_text": ...}] -- the previous check for a "text"
            # key never matched, so every page's OCR output was dropped.
            # Keep "text" as a fallback for models that emit that key.
            if isinstance(result, list) and result:
                page_text = result[0].get("generated_text") or result[0].get("text")
                if page_text:
                    full_text += page_text + "\n"
    else:
        try:
            img = Image.open(file_obj)
            result = ocr_pipeline(img)
            if isinstance(result, list) and result:
                full_text = result[0].get("generated_text") or result[0].get("text") or ""
        except Exception as e:
            # Surface the failure as text so the UI still shows something.
            full_text = f"Error processing image: {e}"

    return full_text
|
57 |
+
|
58 |
+
#####################################
|
59 |
+
# Information Extraction Functions
|
60 |
+
#####################################
|
61 |
+
def extract_resume_info(text):
    """Parse candidate details out of raw resume text with regex heuristics.

    Args:
        text: Plain text extracted from a resume.

    Returns:
        dict: Keys "Name", "Age", "Job Experience", "Skills",
        "Expected Industry/Direction". Values are parsed strings
        ("Skills" is a list of strings) or None when a field was not found.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }

    # Extract name, e.g. "Name: John Doe".
    # BUG FIX: the previous class [A-Za-z\s]+ matched across newlines, so
    # "Name: John Doe\nAge: 29" captured "John Doe\nAge". Restricting the
    # class to letters and spaces keeps the match on a single line.
    name_match = re.search(r"[Nn]ame[:\-]\s*([A-Za-z][A-Za-z ]*)", text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Fallback: first "Firstname Lastname"-shaped capitalized pair.
        potential_names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', text)
        if potential_names:
            info["Name"] = potential_names[0]

    # Extract age, e.g. "Age: 29" (capped at two digits).
    age_match = re.search(r"[Aa]ge[:\-]\s*(\d{1,2})", text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Extract job experience as a year count ("5 years of experience")...
    exp_match = re.search(r"(\d+)\s+(?:years|yrs)\s+(?:of\s+)?experience", text, re.IGNORECASE)
    if exp_match:
        info["Job Experience"] = exp_match.group(1) + " years"
    else:
        # ...or fall back to the free text after an "Experience:" heading.
        exp_line = re.search(r"(Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE)
        if exp_line:
            info["Job Experience"] = exp_line.group(2).strip()

    # Extract skills as a comma- or newline-separated list.
    skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
    if skills_match:
        skills_text = skills_match.group(1)
        skills = [s.strip() for s in re.split(r",|\n", skills_text) if s.strip()]
        info["Skills"] = skills

    # Extract expected industry/direction.
    industry_match = re.search(r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE)
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(2).strip()

    return info
|
106 |
+
|
107 |
+
#####################################
|
108 |
+
# Candidate Comparison Function
|
109 |
+
#####################################
|
110 |
+
def compare_candidate_with_company(resume_info, company_requirements):
    """Compare parsed resume info against free-text company requirements.

    A candidate is "suitable" when the industry/direction text shares at
    least one whitespace-delimited keyword with the requirements, or when
    at least one listed skill appears in the requirements.

    Args:
        resume_info: Dict produced by ``extract_resume_info`` (field values
            may be None when a field was not found).
        company_requirements: Free-text requirements string.

    Returns:
        dict: {"Common Keywords": list of shared industry keywords,
               "Suitable": "Yes" or "No"}.
    """
    # BUG FIX: ``or ""`` guards against the None defaults that
    # extract_resume_info uses for missing fields -- dict.get's default
    # only applies when the key is ABSENT, not when its value is None,
    # so the old code crashed on None.lower().
    candidate_industry = resume_info.get("Expected Industry/Direction") or ""
    candidate_keywords = set(candidate_industry.lower().split())
    company_keywords = set(company_requirements.lower().split())
    common = candidate_keywords.intersection(company_keywords)
    suitable = len(common) > 0

    # A single overlapping skill also qualifies the candidate.
    if resume_info.get("Skills"):
        candidate_skills = {skill.lower() for skill in resume_info["Skills"]}
        company_skills = set(company_requirements.lower().split())
        common_skills = candidate_skills.intersection(company_skills)
        if len(common_skills) >= 1:
            suitable = True

    return {
        "Common Keywords": list(common) if common else [],
        "Suitable": "Yes" if suitable else "No"
    }
|
129 |
+
|
130 |
+
#####################################
|
131 |
+
# Main Processing Logic
|
132 |
+
#####################################
|
133 |
+
def process_resume(file_obj, company_requirements):
    """End-to-end pipeline: OCR the upload, parse it, and score the match.

    Returns:
        tuple: (raw_text, parsed_info, comparison), or (None, None, None)
        when no file was supplied.
    """
    if file_obj is None:
        return None, None, None

    text = extract_text_from_file(file_obj)
    parsed = extract_resume_info(text)
    verdict = compare_candidate_with_company(parsed, company_requirements)
    return text, parsed, verdict
|
141 |
+
|
142 |
+
#####################################
|
143 |
+
# Streamlit UI
|
144 |
+
#####################################
|
145 |
+
# ---- Streamlit page layout (runs top-to-bottom on every interaction) ----
st.title("Resume Extraction and Candidate Matching")
st.markdown("""
This app uses an image-to-text pipeline (powered by `alibaba-damo/mgp-str-base` and PyTorch) to
extract details from uploaded resume files (PDF or image formats). It then parses critical candidate
information and compares it against company requirements.
""")

# Input widgets: a resume file and free-text company criteria.
uploaded_file = st.file_uploader("Upload Resume (PDF or Image)", type=["pdf", "png", "jpg", "jpeg"])
company_requirements = st.text_input("Enter Company Requirements/Criteria (e.g., industry, skills)",
                                     placeholder="Example: Technology, Python, Software Development")

if st.button("Process Resume"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        # OCR + parsing can be slow; show a spinner while it runs.
        with st.spinner("Processing..."):
            resume_text, resume_info, comparison = process_resume(uploaded_file, company_requirements)
        # Render the three result sections: raw text, parsed fields, verdict.
        st.subheader("Extracted Resume Text")
        st.text_area("", resume_text, height=200)
        st.subheader("Parsed Resume Information")
        st.json(resume_info)
        st.subheader("Comparison with Company Requirements")
        st.json(comparison)
|