import os
import io
import streamlit as st
import docx
import docx2txt
import tempfile
import time
import re
import concurrent.futures
from functools import lru_cache
from transformers import pipeline
from collections import defaultdict
# Set page title and hide sidebar
st.set_page_config(
page_title="Resume-Google Job Match Analyzer",
initial_sidebar_state="collapsed"
)
# Hide sidebar completely with custom CSS
st.markdown("""
<style>
[data-testid="collapsedControl"] {display: none;}
section[data-testid="stSidebar"] {display: none;}
</style>
""", unsafe_allow_html=True)
# Pre-defined company description for Google (unchanged)
GOOGLE_DESCRIPTION = """...""" # Keep your original content here
#####################################
# Preload Models - Optimized with DistilBART
#####################################
@st.cache_resource(show_spinner=True)
def load_models():
"""Load optimized models at startup"""
with st.spinner("Loading AI models..."):
models = {
'summarizer': pipeline(
"summarization",
model="distilbart-base-cs", # Faster smaller model
max_length=300,
truncation=True,
num_return_sequences=1
)
}
return models
models = load_models()
#####################################
# Function: Extract Text from File - Optimized
#####################################
# No lru_cache here: Streamlit hands us a fresh UploadedFile object on every rerun,
# so caching by object identity would never hit and would only pin old uploads in memory
def extract_text_from_file(file_obj):
    """Optimized text extraction with early exit"""
filename = file_obj.name
ext = os.path.splitext(filename)[1].lower()
text = ""
MAX_TEXT = 15000 # Reduced processing limit
try:
if ext == ".docx":
doc = docx.Document(file_obj)
text = "\n".join(para.text for para in doc.paragraphs[:50] if para.text.strip())[:MAX_TEXT]
elif ext == ".doc":
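            # docx2txt reads the .docx zip container; a legacy binary .doc may
            # raise here and fall through to the error handler below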
with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
temp_file.write(file_obj.getvalue())
text = docx2txt.process(temp_file.name)[:MAX_TEXT]
os.unlink(temp_file.name)
elif ext == ".txt":
text = file_obj.getvalue().decode("utf-8")[:MAX_TEXT]
except Exception as e:
text = f"Error: {str(e)}"
return text
#####################################
# Unified Information Extraction - Optimized
#####################################
@lru_cache(maxsize=16, typed=False)
def extract_info(text):
"""Combined extraction of all candidate info in one pass"""
text_lower = text.lower()
info = {
'name': extract_name_optimized(text),
'age': extract_age_optimized(text_lower),
'industry': extract_industry_optimized(text_lower),
'skills': extract_skills_optimized(text_lower),
'experience': extract_experience_optimized(text)
}
return info
def extract_name_optimized(text):
"""Faster name extraction with reduced checks"""
lines = text.split('\n')[:10]
for line in lines:
if 5 <= len(line) <= 40 and not any(keyword in line.lower() for keyword in ["resume", "cv"]):
return line.strip()
return "Unknown"
def extract_age_optimized(text):
    """Simplified age pattern matching (the digits are always capture group 1)"""
    patterns = [r'\bage\b\s*:?\s*(\d{1,2})', r'\b(\d{1,2})\s+years?\s+old\b']
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return "Not specified"
# Other extract_ functions with similar optimizations...
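# The three helpers below were elided above ("similar optimizations"); these are
# minimal sketches with assumed keyword lists so extract_info() runs end to end --
# swap in the real vocabularies as needed.
def extract_industry_optimized(text):
    """Minimal sketch: first-match lookup against a small assumed industry list"""
    INDUSTRIES = {"software": "Software", "finance": "Finance", "healthcare": "Healthcare",
                  "education": "Education", "marketing": "Marketing"}
    for keyword, label in INDUSTRIES.items():
        if keyword in text:
            return label
    return "Not specified"

def extract_skills_optimized(text):
    """Minimal sketch: intersect the text with a small assumed skill vocabulary"""
    SKILLS = {"python", "java", "c++", "sql", "javascript", "aws", "docker"}
    found = sorted(skill for skill in SKILLS if skill in text)
    return found if found else ["Not specified"]

def extract_experience_optimized(text):
    """Minimal sketch: grab the first 'N years' phrase as total experience"""
    match = re.search(r'(\d{1,2})\+?\s+years?', text, re.IGNORECASE)
    return f"{match.group(1)} years" if match else "Not specified"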
#####################################
# Optimized Summarization
#####################################
def summarize_resume_text(resume_text):
    """Faster summarization with input truncation; regex extraction overlaps model inference"""
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Kick off regex-based info extraction so it runs while the model summarizes
        info_future = executor.submit(extract_info, resume_text)
        base_summary = models['summarizer'](
            resume_text[:1024],
            max_length=150,
            truncation=True
        )[0]['summary_text']
        info = info_future.result()
    return f"**Name**: {info['name']}\n**Age**: {info['age']}\n**Industry**: {info['industry']}\n\n{base_summary}", 0.1
#####################################
# Optimized Scoring System
#####################################
def calculate_google_match_score(summary):
    """Precomputed keyword matching for faster scoring"""
    GOOGLE_KEYWORDS = {
        "Technical Skills": {"python", "java", "c++", "sql", "algorithms"},
        "Advanced Tech": {"ai", "ml", "cloud", "data science"},
        # Add other categories...
    }
    # Assumed per-category weights (should sum to 1); tune alongside the categories above
    WEIGHTS = {"Technical Skills": 0.6, "Advanced Tech": 0.4}
    score = defaultdict(float)
    summary_lower = summary.lower()
    for category, keywords in GOOGLE_KEYWORDS.items():
        # Substring matching so multi-word keywords like "data science" can still hit
        count = sum(1 for kw in keywords if kw in summary_lower)
        score[category] = min(1.0, (count / len(keywords)) * 1.5) if keywords else 0.0
    total = sum(score[cat] * WEIGHTS.get(cat, 0.0) for cat in score)
    return total, score
#####################################
# Streamlit Interface Optimizations
#####################################
st.title("Google Resume Analyzer")
# Initialize progress tracking once per session
if 'progress' not in st.session_state:
    st.session_state.progress = 0
    st.session_state.last_update = time.time()

uploaded_file = st.file_uploader("Upload a resume", type=["docx", "doc", "txt"])
if uploaded_file and st.button("Analyze"):
    with st.spinner("Analyzing resume..."):
# Use session state for progress tracking
start_time = time.time()
# Step 1: Text extraction
text = extract_text_from_file(uploaded_file)
st.session_state.progress = 33
if "Error" in text:
st.error(text)
continue
# Step 2: Information extraction & summarization
summary, _ = summarize_resume_text(text)
st.session_state.progress = 66
# Step 3: Scoring
score, breakdown = calculate_google_match_score(summary)
st.session_state.progress = 100
# Display results
st.subheader("Analysis Complete!")
st.markdown(f"**Match Score**: {score*100:.1f}%")
# Add other displays...
if st.session_state.progress < 100:
    st.progress(st.session_state.progress)
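# Run locally with: streamlit run app.py
# (assumes streamlit, transformers with a torch backend, python-docx, and docx2txt are installed)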