Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

File size: 4,021 Bytes

cf8a522
92f45fe
8e1d297
 
92f45fe
 
8e1d297
92f45fe
 
 
586dcd2
92f45fe
586dcd2
92f45fe
 
586dcd2
92f45fe
586dcd2
 
92f45fe
 
8e1d297
 
92f45fe
8e1d297
 
92f45fe
 
 
 
 
 
9753cc9
92f45fe
 
9753cc9
92f45fe
9753cc9
92f45fe
 
 
9753cc9
92f45fe
 
 
9753cc9
92f45fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e1d297
 
92f45fe
8e1d297
92f45fe
08361f0
92f45fe
 
08361f0
92f45fe
 
 
 
 
 
 
 
 
 
8e1d297
 
92f45fe
8e1d297
6088e9d
8e1d297
6088e9d
8e1d297
 
92f45fe
 
8e1d297
 
586dcd2
8e1d297
92f45fe
 
 
 
 
 
 
8e1d297
92f45fe
8e1d297
92f45fe
8e1d297
9753cc9
8e1d297
 
92f45fe
8e1d297
92f45fe

import os
import tempfile
import streamlit as st
from transformers import pipeline
import docx
import textract

#####################################
# Summarization Pipeline Setup
#####################################
@st.cache_resource(show_spinner=False)
def load_summarization_pipeline():
    """
    Build the Hugging Face summarization pipeline used by the app.

    The function is wrapped in ``st.cache_resource``, so the pipeline object
    is created through Streamlit's resource cache rather than on every call.

    Returns:
        The pipeline object produced by ``transformers.pipeline``.

    On any failure the error is shown in the Streamlit page and the script
    run is halted via ``st.stop()``.
    """
    # NOTE(review): the checkpoint name suggests a Portuguese T5 model
    # fine-tuned on XL-Sum -- confirm it matches the language of the resumes.
    try:
        return pipeline("summarization", model="recogna-nlp/ptt5-base-summ-xlsum")
    except Exception as exc:
        st.error(f"Error loading summarization model: {exc}")
        st.stop()

# Module-level pipeline instance; summarize_text() reads this global name.
# load_summarization_pipeline() is decorated with st.cache_resource, and it
# stops the script itself if the model cannot be loaded.
summarizer = load_summarization_pipeline()
# Status message written to the page each time this line executes.
st.write("Summarization model loaded successfully!")

#####################################
# Function to Extract Text from File
#####################################
def extract_text_from_file(file_obj):
    """
    Extract plain text from an uploaded .txt, .docx, or .doc file.

    Args:
        file_obj: Binary file-like object exposing a ``name`` attribute (its
            extension selects the parser, case-insensitively) and ``read()``.

    Returns:
        str: The extracted text. Failures are never raised to the caller:
        an unsupported extension yields ``"Unsupported file type."`` and a
        read/parse failure yields a message beginning with ``"Error "``.
    """
    ext = os.path.splitext(file_obj.name)[1].lower()

    if ext == ".txt":
        try:
            # "utf-8-sig" decodes ordinary UTF-8 and additionally drops a
            # leading byte-order mark, so BOM-prefixed files do not leak a
            # stray "\ufeff" into the extracted text.
            return file_obj.read().decode("utf-8-sig")
        except Exception as e:
            return f"Error reading text file: {e}"

    if ext == ".docx":
        try:
            # Use python-docx to read .docx files.
            document = docx.Document(file_obj)
            return "\n".join(para.text for para in document.paragraphs)
        except Exception as e:
            return f"Error processing DOCX file: {e}"

    if ext == ".doc":
        # textract expects a filename, so the upload is saved temporarily.
        tmp_filename = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
                # Record the path before writing so the cleanup below still
                # knows what to delete if reading or writing fails.
                tmp_filename = tmp.name
                tmp.write(file_obj.read())
            # The file is closed (and therefore flushed) before textract
            # opens it by name.
            return textract.process(tmp_filename).decode("utf-8")
        except Exception as e:
            return f"Error processing DOC file: {e}"
        finally:
            if tmp_filename is not None:
                try:
                    os.remove(tmp_filename)
                except OSError:
                    # Best-effort cleanup: a failed delete must not mask the
                    # extraction result.
                    pass

    return "Unsupported file type."

#####################################
# Function to Summarize Extracted Text
#####################################
def summarize_text(text, max_length=150, min_length=40):
    """
    Summarize the given text using the module-level summarization pipeline.

    Args:
        text: The text to summarize. ``None``, empty, or whitespace-only
            input is treated as "nothing to summarize".
        max_length: Upper bound on the generated summary length, forwarded
            to the pipeline.
        min_length: Lower bound on the generated summary length, forwarded
            to the pipeline.

    Returns:
        str: The generated summary, ``"No text available to summarize."``
        for blank input, or a message beginning with
        ``"Error during summarization:"`` if the pipeline call fails.
    """
    if not text or not text.strip():
        return "No text available to summarize."

    try:
        # truncation=True asks the pipeline to cut over-long inputs down to
        # the model's maximum input length instead of failing on them.
        # Content past that limit is ignored; summarize in chunks if the
        # tail of long documents matters.
        summary = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        return summary[0]["summary_text"]
    except Exception as e:
        return f"Error during summarization: {e}"

#####################################
# Main Processing Logic
#####################################
def process_resume(file_obj):
    """
    Run the extract-then-summarize flow for one uploaded file.

    Args:
        file_obj: The uploaded file object, or ``None`` when nothing was
            uploaded.

    Returns:
        tuple: ``(resume_text, summary_text)``; ``(None, None)`` when
        ``file_obj`` is ``None``.
    """
    if file_obj is not None:
        extracted = extract_text_from_file(file_obj)
        condensed = summarize_text(extracted)
        return extracted, condensed

    return None, None

#####################################
# Streamlit Interface
#####################################
# Page header and usage instructions.
st.title("Resume Summarization App")
st.markdown(
    """
    Upload your resume file — supported formats: **.doc**, **.docx**, and **.txt**.
    The app will extract the text content from your resume and generate a summarization.
    """
)

# The uploader is restricted to the extensions extract_text_from_file handles.
uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])

# Processing only happens on an explicit button click.
if st.button("Summarize Resume"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, summary_text = process_resume(uploaded_file)
        # Each text area gets a real label (Streamlit discourages empty
        # labels for accessibility) but keeps it collapsed, because the
        # subheader above it already serves as the visible heading.
        st.subheader("Extracted Resume Text")
        st.text_area(
            "Extracted Resume Text",
            resume_text,
            height=250,
            label_visibility="collapsed",
        )
        st.subheader("Summarized Resume")
        st.text_area(
            "Summarized Resume",
            summary_text,
            height=150,
            label_visibility="collapsed",
        )