Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Files Community

CR7CAD committed on Mar 16

Commit

92f45fe

verified ·

1 Parent(s): 08361f0

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -93

app.py CHANGED Viewed

@@ -1,135 +1,121 @@
 import os
-import re
 import streamlit as st
-from PIL import Image
 from transformers import pipeline
-from pdfminer.high_level import extract_text
# Build the OCR model a single time; st.cache_resource reuses it afterwards.
@st.cache_resource(show_spinner=False)
def load_ocr_pipeline():
    """Create the transformers image-to-text pipeline used for OCR.

    Returns:
        The pipeline object. When construction fails, the error is shown in
        the app and the script run is halted with ``st.stop()``.
    """
    # Swap this identifier to try a different image-to-text checkpoint.
    model_id = "YouLiXiya/tinyllava-v1.0-1.1b-hf"
    try:
        return pipeline("image-to-text", model=model_id)
    except Exception as exc:
        st.error(f"Error loading model: {exc}")
        st.stop()


ocr_pipeline = load_ocr_pipeline()
st.write("Model loaded successfully!")
 #####################################
# Text Extraction Function
#####################################
def extract_text_from_file(file_obj):
    """Extract text from an uploaded PDF or image file.

    Files whose name ends in ``.pdf`` are parsed with pdfminer's
    ``extract_text``; everything else is opened as an image and passed to the
    module-level ``ocr_pipeline``.

    Args:
        file_obj: Uploaded file-like object exposing a ``name`` attribute.

    Returns:
        The extracted text, an empty string when the OCR model produced no
        text, or a human-readable ``"Error processing ..."`` message when
        extraction raised.
    """
    file_extension = os.path.splitext(file_obj.name)[1].lower()

    if file_extension == ".pdf":
        try:
            # Use pdfminer.six to extract text from PDF files.
            return extract_text(file_obj)
        except Exception as e:
            return f"Error processing PDF: {e}"

    try:
        img = Image.open(file_obj)
        result = ocr_pipeline(img)
    except Exception as e:
        return f"Error processing image: {e}"

    # An empty result list means "no text", not an indexing error.
    if isinstance(result, list) and result and isinstance(result[0], dict):
        first = result[0]
        # transformers image-to-text pipelines report their output under
        # "generated_text"; "text" is still honoured for custom pipelines.
        for key in ("generated_text", "text"):
            value = first.get(key)
            if value:
                return value
    return ""
 #####################################
# Resume Information Extraction Functions
#####################################
def extract_basic_resume_info(text):
    """Extract basic candidate details from resume text with regex heuristics.

    Args:
        text: Plain text of the resume. ``None`` or an empty string yields a
            result in which every field is ``None``.

    Returns:
        A dict with the keys ``"Name"``, ``"Age"``, ``"Job Experience"``,
        ``"Skills"`` and ``"Expected Industry/Direction"``. ``"Skills"`` is a
        list of strings when found; every other found value is a string.
        Fields that could not be located stay ``None``.
    """
    info = {
        "Name": None,
        "Age": None,
        "Job Experience": None,
        "Skills": None,
        "Expected Industry/Direction": None,
    }
    if not text:
        return info

    # Name: prefer an explicit "Name:" label. The captured value is limited to
    # a single line so the following field is not swallowed, and \b keeps
    # labels such as "Username:" from matching.
    name_match = re.search(r"\b[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)", text)
    if name_match:
        info["Name"] = name_match.group(1).strip()
    else:
        # Fallback: first run of two or more capitalized words on one line.
        candidate = re.search(r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b", text)
        if candidate:
            info["Name"] = candidate.group(0)

    # Age: \b prevents "Page: 2" from being read as an age.
    age_match = re.search(r"\b[Aa]ge[:\-]\s*(\d{1,2})", text)
    if age_match:
        info["Age"] = age_match.group(1)

    # Job experience in years; accepts "5 years", "5+ years", "1 year", "3 yrs".
    exp_match = re.search(
        r"(\d+)\+?\s*(?:years?|yrs?)\s+(?:of\s+)?experience",
        text,
        re.IGNORECASE,
    )
    if exp_match:
        info["Job Experience"] = exp_match.group(1) + " years"
    else:
        exp_line = re.search(
            r"(Experience|Background)[:\-]\s*(.*)", text, re.IGNORECASE
        )
        if exp_line:
            info["Job Experience"] = exp_line.group(2).strip()

    # Skills (e.g., "Skills: Python, Java, SQL"); the match never spans lines,
    # so splitting on commas is sufficient.
    skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
    if skills_match:
        info["Skills"] = [
            skill.strip()
            for skill in skills_match.group(1).split(",")
            if skill.strip()
        ]

    # Expected industry/direction (e.g., "Interest: Software Development").
    industry_match = re.search(
        r"(Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE
    )
    if industry_match:
        info["Expected Industry/Direction"] = industry_match.group(2).strip()

    return info
 #####################################
# Main Resume Processing Logic
#####################################
def process_resume(file_obj):
    """Run text extraction and field parsing on an uploaded resume.

    Args:
        file_obj: Uploaded file object, or ``None`` when nothing was uploaded.

    Returns:
        ``(resume_text, resume_info)``; ``(None, None)`` when ``file_obj`` is
        ``None``.
    """
    if file_obj is not None:
        # Dispatches on the file type (PDF or image) inside the extractor.
        extracted = extract_text_from_file(file_obj)
        # Parse the basic resume details out of the extracted text.
        return extracted, extract_basic_resume_info(extracted)
    return None, None
 #####################################
 # Streamlit Interface
 #####################################
st.title("Resume Extraction and Basic Info Parsing")
st.markdown("""
Upload a resume file in PDF or image format (PNG, JPG, JPEG) to extract text and candidate information.
""")
uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])

# All processing happens only when the user presses the button.
if st.button("Extract Info"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, resume_info = process_resume(uploaded_file)
        st.subheader("Extracted Resume Text")
        # Widgets should not use an empty label; give a real one and hide it,
        # since the subheader above already serves as the visible title.
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=200,
            label_visibility="collapsed",
        )
        st.subheader("Parsed Basic Resume Information")
        st.json(resume_info)

 import os
+import tempfile
 import streamlit as st
 from transformers import pipeline
+import docx
+import textract
#####################################
# Summarization Pipeline Setup
#####################################
@st.cache_resource(show_spinner=False)
def load_summarization_pipeline():
    """Create the transformers summarization pipeline.

    Returns:
        The pipeline object. When construction fails, the error is shown in
        the app and the script run is halted with ``st.stop()``.
    """
    # NOTE(review): the model id suggests a Portuguese PTT5 checkpoint -
    # confirm it suits the language of the resumes being summarized.
    model_id = "recogna-nlp/ptt5-base-summ-xlsum"
    try:
        return pipeline("summarization", model=model_id)
    except Exception as exc:
        st.error(f"Error loading summarization model: {exc}")
        st.stop()


summarizer = load_summarization_pipeline()
st.write("Summarization model loaded successfully!")
 #####################################
# Function to Extract Text from File
#####################################
def extract_text_from_file(file_obj):
    """Extract text from an uploaded .txt, .docx, or .doc file.

    Args:
        file_obj: Uploaded file-like object exposing ``name`` and ``read()``.

    Returns:
        The extracted text, ``"Unsupported file type."`` for any other
        extension, or a human-readable ``"Error ..."`` message when
        extraction raised. Errors are reported in-band as text, so callers
        receive a string in every case.
    """
    ext = os.path.splitext(file_obj.name)[1].lower()

    if ext == ".txt":
        try:
            # "utf-8-sig" decodes plain UTF-8 as well and drops a leading BOM,
            # which editors on Windows commonly prepend.
            return file_obj.read().decode("utf-8-sig")
        except Exception as e:
            return f"Error reading text file: {e}"

    if ext == ".docx":
        try:
            # Use python-docx to read .docx files.
            document = docx.Document(file_obj)
            return "\n".join(para.text for para in document.paragraphs)
        except Exception as e:
            return f"Error processing DOCX file: {e}"

    if ext == ".doc":
        # textract expects a path on disk, so spool the upload to a temp file.
        tmp_filename = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
                # Record the path before writing so a failed write still gets
                # cleaned up in the finally clause below.
                tmp_filename = tmp.name
                tmp.write(file_obj.read())
            return textract.process(tmp_filename).decode("utf-8")
        except Exception as e:
            return f"Error processing DOC file: {e}"
        finally:
            if tmp_filename is not None:
                try:
                    os.remove(tmp_filename)
                except OSError:
                    # Best-effort cleanup; a leftover temp file must not mask
                    # the extraction result.
                    pass

    return "Unsupported file type."
 #####################################
# Function to Summarize Extracted Text
#####################################
def summarize_text(text):
    """Summarize the given text using the module-level summarization pipeline.

    Args:
        text: Text to summarize. ``None``, empty, or whitespace-only input is
            reported as having nothing to summarize.

    Returns:
        The generated summary, ``"No text available to summarize."`` when
        there is no input, or an ``"Error during summarization: ..."`` message
        when the pipeline raised.
    """
    if not text or not text.strip():
        return "No text available to summarize."
    try:
        # truncation=True clips inputs longer than the model's maximum
        # sequence length instead of failing on long resumes. Very long
        # documents would still be better served by chunked summarization.
        # Adjust max_length and min_length as needed.
        summary = summarizer(
            text,
            max_length=150,
            min_length=40,
            do_sample=False,
            truncation=True,
        )
        return summary[0]["summary_text"]
    except Exception as e:
        return f"Error during summarization: {e}"
 #####################################
# Main Processing Logic
#####################################
def process_resume(file_obj):
    """Extract the text of an uploaded resume and summarize it.

    Args:
        file_obj: Uploaded file object, or ``None`` when nothing was uploaded.

    Returns:
        ``(resume_text, summary_text)``; ``(None, None)`` when ``file_obj`` is
        ``None``.
    """
    if file_obj is not None:
        extracted = extract_text_from_file(file_obj)
        return extracted, summarize_text(extracted)
    return None, None
 #####################################
 # Streamlit Interface
 #####################################
st.title("Resume Summarization App")
st.markdown(
    """
    Upload your resume file — supported formats: **.doc**, **.docx**, and **.txt**.
    The app will extract the text content from your resume and generate a summarization.
    """
)
uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])

# All processing happens only when the user presses the button.
if st.button("Summarize Resume"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, summary_text = process_resume(uploaded_file)
        # Widgets should not use an empty label; each text area gets a real,
        # distinct label that is hidden because the subheader above it already
        # serves as the visible title.
        st.subheader("Extracted Resume Text")
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=250,
            label_visibility="collapsed",
        )
        st.subheader("Summarized Resume")
        st.text_area(
            "Summarized Resume",
            summary_text,
            height=150,
            label_visibility="collapsed",
        )