Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Files Community

ISOM5240FinalProject / app.py

CR7CAD

Update app.py

92f45fe verified 4 months ago

raw

history blame

4.02 kB

	import os
	import tempfile
	import streamlit as st
	from transformers import pipeline
	import docx
	import textract

	#####################################
	# Summarization Pipeline Setup
	#####################################
	@st.cache_resource(show_spinner=False)
	def load_summarization_pipeline():
	    """
	    Build the Hugging Face summarization pipeline used by the app.

	    The function is wrapped in ``st.cache_resource`` with the spinner disabled.

	    Returns:
	        The object returned by ``transformers.pipeline`` for the
	        "summarization" task.

	    On any loading failure the error is shown with ``st.error`` and the
	    script run is halted with ``st.stop()``.
	    """
	    task_name = "summarization"
	    # NOTE(review): the checkpoint name suggests a Portuguese T5 model
	    # fine-tuned on XL-Sum -- confirm it suits the language of the resumes.
	    checkpoint = "recogna-nlp/ptt5-base-summ-xlsum"
	    try:
	        return pipeline(task_name, model=checkpoint)
	    except Exception as load_error:
	        st.error(f"Error loading summarization model: {load_error}")
	        st.stop()

	# Module-level pipeline instance; summarize_text() reads this global.
	# load_summarization_pipeline() is decorated with st.cache_resource.
	summarizer = load_summarization_pipeline()
	# Only reached when loading succeeded: the loader calls st.stop() on failure.
	st.write("Summarization model loaded successfully!")

	#####################################
	# Function to Extract Text from File
	#####################################
	def extract_text_from_file(file_obj):
	    """
	    Extract text from .txt, .docx, and .doc files.

	    Args:
	        file_obj: Binary file-like object exposing a ``name`` attribute
	            (only its extension is inspected, case-insensitively) and a
	            ``read()`` method returning bytes.

	    Returns:
	        The extracted text. Failures are never raised; instead a
	        human-readable message is returned as the text
	        ("Error reading text file: ...", "Error processing DOCX file: ...",
	        "Error processing DOC file: ..." or "Unsupported file type.").
	    """
	    filename = file_obj.name
	    ext = os.path.splitext(filename)[1].lower()
	    text = ""

	    if ext == ".txt":
	        # Decode the uploaded bytes. "utf-8-sig" behaves like UTF-8 but also
	        # drops a leading byte-order mark, so the BOM does not leak into the
	        # extracted text as a stray "\ufeff" character.
	        try:
	            text = file_obj.read().decode("utf-8-sig")
	        except Exception as e:
	            text = f"Error reading text file: {e}"

	    elif ext == ".docx":
	        try:
	            # python-docx accepts a file-like object directly.
	            document = docx.Document(file_obj)
	            text = "\n".join(para.text for para in document.paragraphs)
	        except Exception as e:
	            text = f"Error processing DOCX file: {e}"

	    elif ext == ".doc":
	        # textract expects a path on disk, so the upload is spooled to a
	        # temporary file first.
	        tmp_filename = None
	        try:
	            with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as tmp:
	                # Record the path before writing so the file is still cleaned
	                # up if reading or writing the upload fails.
	                tmp_filename = tmp.name
	                tmp.write(file_obj.read())
	            # The temp file is closed (and therefore flushed) at this point,
	            # so textract can reopen it by name.
	            text = textract.process(tmp_filename).decode("utf-8")
	        except Exception as e:
	            text = f"Error processing DOC file: {e}"
	        finally:
	            if tmp_filename is not None:
	                try:
	                    os.remove(tmp_filename)
	                except OSError:
	                    # Best-effort cleanup: a failed delete must not mask the
	                    # extraction result.
	                    pass
	    else:
	        text = "Unsupported file type."

	    return text

	#####################################
	# Function to Summarize Extracted Text
	#####################################
	def summarize_text(text):
	    """
	    Produce a summary of ``text`` with the module-level summarization pipeline.

	    Args:
	        text: The string to summarize.

	    Returns:
	        The ``summary_text`` of the first pipeline result, the fixed message
	        "No text available to summarize." when ``text`` is empty or only
	        whitespace, or "Error during summarization: ..." if anything raises
	        while summarizing.
	    """
	    has_content = bool(text.strip())
	    if has_content:
	        # Generation settings; tune max_length / min_length as needed.
	        generation_options = {
	            "max_length": 150,
	            "min_length": 40,
	            "do_sample": False,
	        }
	        try:
	            # The pipeline can struggle with very long inputs; chunking the
	            # document is the suggested workaround if that becomes a problem.
	            results = summarizer(text, **generation_options)
	            first_result = results[0]
	            return first_result["summary_text"]
	        except Exception as failure:
	            return f"Error during summarization: {failure}"

	    return "No text available to summarize."

	#####################################
	# Main Processing Logic
	#####################################
	def process_resume(file_obj):
	    """
	    Run text extraction followed by summarization on an uploaded file.

	    Args:
	        file_obj: The uploaded file object, or ``None`` when nothing was
	            uploaded.

	    Returns:
	        A ``(resume_text, summary_text)`` tuple, or ``(None, None)`` when
	        ``file_obj`` is ``None``.
	    """
	    if file_obj is not None:
	        extracted = extract_text_from_file(file_obj)
	        condensed = summarize_text(extracted)
	        return extracted, condensed

	    return None, None

	#####################################
	# Streamlit Interface
	#####################################
	# Page header and usage instructions.
	st.title("Resume Summarization App")
	st.markdown(
	    """
	Upload your resume file — supported formats: .doc, .docx, and .txt.
	The app will extract the text content from your resume and generate a summarization.
	"""
	)

	# The uploader restricts selectable files to the formats that
	# extract_text_from_file() handles.
	uploaded_file = st.file_uploader("Upload Resume", type=["doc", "docx", "txt"])

	if st.button("Summarize Resume"):
	    if uploaded_file is None:
	        st.error("Please upload a file first.")
	    else:
	        with st.spinner("Processing..."):
	            resume_text, summary_text = process_resume(uploaded_file)

	        # Each text area gets a real label, hidden with
	        # label_visibility="collapsed": Streamlit's documentation says never to
	        # pass an empty label, and the subheaders already act as the visible
	        # captions.
	        st.subheader("Extracted Resume Text")
	        st.text_area(
	            "Extracted resume text",
	            resume_text,
	            height=250,
	            label_visibility="collapsed",
	        )
	        st.subheader("Summarized Resume")
	        st.text_area(
	            "Summarized resume",
	            summary_text,
	            height=150,
	            label_visibility="collapsed",
	        )