CR7CAD's picture
Update app.py
92f45fe verified
raw
history blame
4.02 kB
import os
import tempfile
import streamlit as st
from transformers import pipeline
import docx
import textract
#####################################
# Summarization Pipeline Setup
#####################################
@st.cache_resource(show_spinner=False)
def load_summarization_pipeline():
    """
    Build and return the Hugging Face summarization pipeline.

    The function is wrapped in ``st.cache_resource`` with the spinner
    disabled. If constructing the pipeline raises, the error is shown in
    the Streamlit page and the script run is halted with ``st.stop()``.
    """
    try:
        return pipeline(
            "summarization",
            model="recogna-nlp/ptt5-base-summ-xlsum",
        )
    except Exception as load_error:
        # Surface the failure to the user instead of a raw traceback,
        # then abort the current script run.
        st.error(f"Error loading summarization model: {load_error}")
        st.stop()
# Module-level pipeline instance; summarize_text() reads this global.
summarizer = load_summarization_pipeline()
# Only reached when the loader returned: on failure it calls st.stop().
st.write("Summarization model loaded successfully!")
#####################################
# Function to Extract Text from File
#####################################
def extract_text_from_file(file_obj):
    """
    Extract text from .txt, .docx, and .doc files.

    Parameters:
        file_obj: A binary file-like object exposing a ``name`` attribute
            and a ``read()`` method. The extension of ``name`` (case
            insensitive) selects the extraction strategy.

    Returns:
        The extracted text as a string. Failures are reported in-band
        rather than raised: an unrecognised extension yields
        "Unsupported file type." and a read/parse failure yields a message
        beginning with "Error ...".
    """
    filename = file_obj.name
    ext = os.path.splitext(filename)[1].lower()

    # The stream may already have been consumed by an earlier read; rewind
    # it so extraction always starts from the first byte.
    rewind = getattr(file_obj, "seek", None)
    if callable(rewind):
        try:
            rewind(0)
        except (OSError, ValueError):
            # Non-seekable stream: fall back to the current position.
            pass

    if ext == ".txt":
        try:
            # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM,
            # which would otherwise leak into the text as "\ufeff".
            return file_obj.read().decode("utf-8-sig")
        except Exception as e:
            return f"Error reading text file: {e}"

    if ext == ".docx":
        try:
            # python-docx accepts a file-like object directly.
            document = docx.Document(file_obj)
            return "\n".join(para.text for para in document.paragraphs)
        except Exception as e:
            return f"Error processing DOCX file: {e}"

    if ext == ".doc":
        # textract expects a filename, so spill the upload to a temp file.
        tmp_filename = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
                # Record the path first so cleanup works even if the
                # write below fails.
                tmp_filename = tmp.name
                tmp.write(file_obj.read())
            # The file is closed (and therefore flushed) before textract
            # opens it by name.
            return textract.process(tmp_filename).decode("utf-8")
        except Exception as e:
            return f"Error processing DOC file: {e}"
        finally:
            if tmp_filename is not None:
                try:
                    os.remove(tmp_filename)
                except OSError:
                    # Best-effort cleanup; a leftover temp file must not
                    # mask the extraction result.
                    pass

    return "Unsupported file type."
#####################################
# Function to Summarize Extracted Text
#####################################
def summarize_text(text):
    """
    Summarize the given text using the summarization pipeline.
    Adjust max_length and min_length as needed.

    Parameters:
        text: The text to summarize. ``None``, empty, or whitespace-only
            input is treated as "nothing to summarize".

    Returns:
        The generated summary string, "No text available to summarize."
        for blank input, or a message starting with
        "Error during summarization:" if the pipeline call raises.
    """
    # Guard against None as well as blank strings so the caller always
    # gets a message instead of an AttributeError.
    if not text or not text.strip():
        return "No text available to summarize."
    try:
        # truncation=True clips inputs that exceed the model's maximum
        # input length instead of feeding an over-long sequence to the
        # model. Only the leading part of a very long document is
        # summarized; consider chunking if full coverage is required.
        summary = summarizer(
            text,
            max_length=150,
            min_length=40,
            do_sample=False,
            truncation=True,
        )
        return summary[0]["summary_text"]
    except Exception as e:
        return f"Error during summarization: {e}"
#####################################
# Main Processing Logic
#####################################
def process_resume(file_obj):
    """
    Run extraction and summarization for one uploaded file.

    Returns a ``(resume_text, summary_text)`` pair, or ``(None, None)``
    when no file object was supplied.
    """
    if file_obj is not None:
        extracted = extract_text_from_file(file_obj)
        return extracted, summarize_text(extracted)
    return None, None
#####################################
# Streamlit Interface
#####################################
st.title("Resume Summarization App")
st.markdown(
    """
Upload your resume file — supported formats: **.doc**, **.docx**, and **.txt**.
The app will extract the text content from your resume and generate a summarization.
"""
)

# Restrict the picker to the extensions extract_text_from_file understands.
uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])

if st.button("Summarize Resume"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, summary_text = process_resume(uploaded_file)

        # Each text area gets a real label hidden via label_visibility:
        # Streamlit's docs say never to pass an empty label. The subheaders
        # above each box act as the visible headings.
        st.subheader("Extracted Resume Text")
        st.text_area(
            "Extracted resume text",
            resume_text,
            height=250,
            label_visibility="collapsed",
        )
        st.subheader("Summarized Resume")
        st.text_area(
            "Summarized resume",
            summary_text,
            height=150,
            label_visibility="collapsed",
        )