Spaces:
Sleeping
Sleeping
import os | |
import tempfile | |
import streamlit as st | |
from transformers import pipeline | |
import docx | |
import textract | |
##################################### | |
# Summarization Pipeline Setup | |
##################################### | |
def _build_summarizer():
    """Construct the Hugging Face summarization pipeline for the PTT5 XL-Sum model."""
    return pipeline("summarization", model="recogna-nlp/ptt5-base-summ-xlsum")


def load_summarization_pipeline():
    """Return the summarization pipeline, reusing one instance across reruns.

    Streamlit re-executes the whole script on every widget interaction, so
    an uncached load would rebuild the model each time a button is clicked.
    When the installed Streamlit provides ``st.cache_resource`` the pipeline
    is built once and shared; on older releases without it the pipeline is
    built directly, exactly as before.

    Returns:
        The object returned by ``transformers.pipeline``.

    If loading fails, the error is shown with ``st.error`` and the script
    run is halted with ``st.stop()``; nothing is returned in that case.
    """
    try:
        # Looked up at call time so that a Streamlit without cache_resource
        # degrades to the old (uncached) behaviour instead of crashing.
        cache_resource = getattr(st, "cache_resource", None)
        if cache_resource is None:
            return _build_summarizer()
        return cache_resource(_build_summarizer)()
    except Exception as e:
        st.error(f"Error loading summarization model: {e}")
        st.stop()


# Module-level pipeline instance; summarize_text() reads this global.
summarizer = load_summarization_pipeline()
st.write("Summarization model loaded successfully!")
##################################### | |
# Function to Extract Text from File | |
##################################### | |
def _decode_text_bytes(raw):
    """Decode raw bytes as UTF-8 (dropping a BOM), falling back to legacy encodings."""
    # "utf-8-sig" decodes plain UTF-8 too, but strips a leading BOM instead of
    # leaving U+FEFF at the start of the text.
    for encoding in ("utf-8-sig", "cp1252"):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    # latin-1 maps every byte value, so this last resort cannot fail.
    return raw.decode("latin-1")


def extract_text_from_file(file_obj):
    """Extract plain text from an uploaded .txt, .docx, or .doc file.

    Args:
        file_obj: A binary file-like object exposing a ``name`` attribute
            (its extension selects the parser) and a ``read()`` method.

    Returns:
        The extracted text.  Failures are reported in the return value
        rather than raised: an unsupported extension yields
        ``"Unsupported file type."`` and a read/parse failure yields a
        message starting with ``"Error"``.
    """
    ext = os.path.splitext(file_obj.name)[1].lower()

    if ext == ".txt":
        try:
            return _decode_text_bytes(file_obj.read())
        except Exception as e:
            return f"Error reading text file: {e}"

    if ext == ".docx":
        try:
            # python-docx accepts a file-like object directly.
            document = docx.Document(file_obj)
            return "\n".join(para.text for para in document.paragraphs)
        except Exception as e:
            return f"Error processing DOCX file: {e}"

    if ext == ".doc":
        # textract expects a path on disk, so spill the upload to a temp file.
        tmp_filename = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
                # Record the path before writing so a failed write still
                # gets cleaned up in the ``finally`` below.
                tmp_filename = tmp.name
                tmp.write(file_obj.read())
            # The file is closed (and flushed) here, before textract opens it.
            return textract.process(tmp_filename).decode("utf-8")
        except Exception as e:
            return f"Error processing DOC file: {e}"
        finally:
            if tmp_filename is not None:
                try:
                    os.remove(tmp_filename)
                except OSError:
                    # Best-effort cleanup: a leftover temp file must not
                    # mask the extraction result.
                    pass

    return "Unsupported file type."
##################################### | |
# Function to Summarize Extracted Text | |
##################################### | |
def summarize_text(text):
    """Produce a summary of ``text`` with the module-level ``summarizer``.

    Whitespace-only input short-circuits with a fixed message.  Any
    exception raised while summarizing is converted into an error string
    instead of propagating.  Generation is deterministic (no sampling) and
    bounded to 40-150 tokens.
    """
    has_content = bool(text.strip())
    if not has_content:
        return "No text available to summarize."

    generation_options = {"max_length": 150, "min_length": 40, "do_sample": False}
    try:
        # The pipeline may struggle with very long inputs; chunking the
        # document is the usual workaround if that becomes a problem.
        outputs = summarizer(text, **generation_options)
        first_result = outputs[0]
        return first_result["summary_text"]
    except Exception as err:
        return f"Error during summarization: {err}"
##################################### | |
# Main Processing Logic | |
##################################### | |
def process_resume(file_obj):
    """Extract and summarize an uploaded resume.

    Returns a ``(resume_text, summary_text)`` pair, or ``(None, None)``
    when no file was supplied.
    """
    if file_obj is not None:
        extracted = extract_text_from_file(file_obj)
        return extracted, summarize_text(extracted)
    return None, None
##################################### | |
# Streamlit Interface | |
##################################### | |
# Page header and usage instructions.
st.title("Resume Summarization App")
st.markdown(
    """
Upload your resume file — supported formats: **.doc**, **.docx**, and **.txt**.
The app will extract the text content from your resume and generate a summarization.
"""
)

uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])

# Processing only happens on an explicit button click, not on upload.
if st.button("Summarize Resume"):
    if uploaded_file is None:
        st.error("Please upload a file first.")
    else:
        with st.spinner("Processing..."):
            resume_text, summary_text = process_resume(uploaded_file)

        # The subheaders act as the visible titles, so the text areas get a
        # real (accessible) label that is hidden rather than an empty one,
        # which Streamlit warns about.  ``label_visibility`` needs
        # Streamlit >= 1.13.
        st.subheader("Extracted Resume Text")
        st.text_area(
            "Extracted resume text",
            resume_text,
            height=250,
            label_visibility="collapsed",
        )
        st.subheader("Summarized Resume")
        st.text_area(
            "Summarized resume",
            summary_text,
            height=150,
            label_visibility="collapsed",
        )