Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Files Community

ISOM5240FinalProject / app.py

CR7CAD

Update app.py

9753cc9 verified about 2 months ago

raw

history blame

4.52 kB

	import os
	import re
	import streamlit as st
	from PIL import Image
	from transformers import pipeline
	from pdfminer.high_level import extract_text

	# Load and cache the OCR model once at startup
	@st.cache_resource(show_spinner=False)
	def load_ocr_pipeline():
	    """Build the Hugging Face image-to-text pipeline used for OCR.

	    The function is wrapped in ``st.cache_resource``. If construction
	    raises, the error is shown in the Streamlit UI and ``st.stop()`` is
	    called.

	    Returns:
	        The pipeline object created by ``transformers.pipeline``.
	    """
	    try:
	        # Ensure your transformers library is up-to-date (>=4.x)
	        return pipeline(
	            "image-to-text",
	            model="YouLiXiya/tinyllava-v1.0-1.1b-hf",
	        )
	    except Exception as load_error:
	        st.error(f"Error loading model: {load_error}")
	        st.stop()

	# Module-level OCR pipeline; extract_text_from_file reads this global.
	ocr_pipeline = load_ocr_pipeline()
	# load_ocr_pipeline calls st.stop() on failure, so this line is the success path.
	st.write("Model loaded successfully!")

	#####################################
	# Extract Text from File Function
	#####################################
	def extract_text_from_file(file_obj):
	    """Return the text content of an uploaded PDF or image file.

	    Files whose name ends in ``.pdf`` (case-insensitive) are read with
	    pdfminer's ``extract_text``. Every other file is opened as an image
	    with PIL and passed to the module-level ``ocr_pipeline``.

	    Args:
	        file_obj: File-like object exposing a ``name`` attribute.

	    Returns:
	        str: The extracted text. An empty string when the OCR result has
	        no usable text. A message starting with ``"Error processing PDF:"``
	        or ``"Error processing image:"`` when extraction raises.
	    """
	    file_extension = os.path.splitext(file_obj.name)[1].lower()

	    if file_extension == ".pdf":
	        try:
	            # Use pdfminer.six to extract text from PDF files.
	            return extract_text(file_obj)
	        except Exception as e:
	            return f"Error processing PDF: {e}"

	    try:
	        img = Image.open(file_obj)
	        result = ocr_pipeline(img)
	    except Exception as e:
	        return f"Error processing image: {e}"

	    # Guard against an empty list instead of letting result[0] raise.
	    first = result[0] if isinstance(result, list) and result else None
	    if isinstance(first, dict):
	        # Image-to-text pipelines report their output under
	        # "generated_text"; "text" is kept as a fallback for pipelines
	        # that use that key.
	        return first.get("generated_text") or first.get("text") or ""
	    return ""

	#####################################
	# Information Extraction Functions
	#####################################
	def extract_basic_resume_info(text):
	    """Extract basic resume info: Name, Age, Job Experience, Skills, Expected Industry/Direction.

	    All fields are found with regular-expression heuristics over the raw
	    resume text; a field that cannot be found stays ``None``.

	    Args:
	        text: Resume text. ``None`` or an empty string yields all-``None``
	            fields.

	    Returns:
	        dict: Keys ``"Name"``, ``"Age"``, ``"Job Experience"``,
	        ``"Expected Industry/Direction"`` map to ``str`` or ``None``;
	        ``"Skills"`` maps to a list of ``str`` or ``None``.
	    """
	    info = {
	        "Name": None,
	        "Age": None,
	        "Job Experience": None,
	        "Skills": None,
	        "Expected Industry/Direction": None,
	    }
	    if not text:
	        return info

	    # Extract name (e.g., "Name: John Doe"). The captured value is limited
	    # to letters, spaces and tabs so it cannot spill onto the next line.
	    name_match = re.search(r"\b[Nn]ame[:\-]\s*([A-Za-z][A-Za-z \t]*)", text)
	    if name_match:
	        info["Name"] = name_match.group(1).strip()
	    else:
	        # Heuristic: pick the first run of capitalized words on one line.
	        potential_name = re.search(
	            r"\b[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+\b", text
	        )
	        if potential_name:
	            info["Name"] = potential_name.group(0)

	    # Extract age; the word boundary keeps "Page-12" from matching.
	    age_match = re.search(r"\b[Aa]ge[:\-]\s*(\d{1,2})", text)
	    if age_match:
	        info["Age"] = age_match.group(1)

	    # Extract job experience (years), e.g. "5 years of experience",
	    # "5+ years experience", "3 yrs experience".
	    exp_match = re.search(
	        r"(\d+)\+?\s+(?:years|yrs)\s+(?:of\s+)?experience",
	        text,
	        re.IGNORECASE,
	    )
	    if exp_match:
	        info["Job Experience"] = exp_match.group(1) + " years"
	    else:
	        # Fall back to the rest of an "Experience:" / "Background:" line.
	        exp_line = re.search(
	            r"(?:Experience|Background)[:\-]\s*(.+)", text, re.IGNORECASE
	        )
	        if exp_line:
	            info["Job Experience"] = exp_line.group(1).strip()

	    # Extract skills (e.g., "Skills: Python, Java, SQL")
	    skills_match = re.search(r"[Ss]kills[:\-]\s*(.+)", text)
	    if skills_match:
	        skills_text = skills_match.group(1)
	        info["Skills"] = [
	            s.strip() for s in re.split(r"[,\n]", skills_text) if s.strip()
	        ]

	    # Extract expected industry/direction
	    industry_match = re.search(
	        r"(?:Industry|Interest|Direction)[:\-]\s*(.+)", text, re.IGNORECASE
	    )
	    if industry_match:
	        info["Expected Industry/Direction"] = industry_match.group(1).strip()

	    return info

	#####################################
	# Main Processing Logic
	#####################################
	def process_resume(file_obj):
	    """Extract the text of an uploaded resume and parse basic fields from it.

	    Args:
	        file_obj: Uploaded file object, or ``None``.

	    Returns:
	        tuple: ``(resume_text, resume_info)``, or ``(None, None)`` when
	        ``file_obj`` is ``None``.
	    """
	    if file_obj is not None:
	        # Text extraction handles both PDFs and images; parsing is heuristic.
	        text = extract_text_from_file(file_obj)
	        return text, extract_basic_resume_info(text)
	    return None, None

	#####################################
	# Streamlit Interface
	#####################################
	# Page header and upload instructions.
	st.title("Resume Extraction and Basic Info Parsing")
	st.markdown("""
	Upload a resume file (PDF, PNG, JPG, or JPEG) to extract basic text and candidate information.
	""")

	uploaded_file = st.file_uploader("Upload Resume", type=["pdf", "png", "jpg", "jpeg"])

	# Processing runs only when the button is clicked.
	if st.button("Extract Info"):
	    if uploaded_file is None:
	        st.error("Please upload a file first.")
	    else:
	        # The spinner covers only the extraction work; results render after it.
	        with st.spinner("Processing..."):
	            resume_text, resume_info = process_resume(uploaded_file)
	        st.subheader("Extracted Resume Text")
	        # Streamlit discourages an empty widget label; give it a real one
	        # and hide it, since the subheader above already titles the box.
	        st.text_area(
	            "Extracted Resume Text",
	            resume_text,
	            height=200,
	            label_visibility="collapsed",
	        )
	        st.subheader("Parsed Basic Resume Information")
	        st.json(resume_info)