import os
import io
import streamlit as st
import docx
import docx2txt
import tempfile
import time
import re
import concurrent.futures
from functools import lru_cache
from transformers import pipeline
from collections import defaultdict
# Set page title and hide sidebar
st.set_page_config(
page_title="Resume-Google Job Match Analyzer",
initial_sidebar_state="collapsed"
)
# Hide sidebar completely with custom CSS
st.markdown("""
<style>
[data-testid="collapsedControl"] {display: none;}
section[data-testid="stSidebar"] {display: none;}
</style>
""", unsafe_allow_html=True)
# Pre-defined company description for Google (unchanged)
GOOGLE_DESCRIPTION = """...""" # Keep your original content here
#####################################
# Preload Models - Optimized with DistilBART
#####################################
@st.cache_resource(show_spinner=True)
def load_models():
"""Load optimized models at startup"""
with st.spinner("Loading AI models..."):
models = {
'summarizer': pipeline(
"summarization",
model="distilbart-base-cs", # Faster smaller model
max_length=300,
truncation=True,
num_return_sequences=1
)
}
return models
models = load_models()
#####################################
# Function: Extract Text from File - Optimized
#####################################
# No lru_cache here: Streamlit hands us a fresh UploadedFile object on every rerun,
# so caching by object identity would never hit and would only pin old uploads in memory
def extract_text_from_file(file_obj):
    """Optimized text extraction with early exit"""
filename = file_obj.name
ext = os.path.splitext(filename)[1].lower()
text = ""
MAX_TEXT = 15000 # Reduced processing limit
try:
if ext == ".docx":
doc = docx.Document(file_obj)
text = "\n".join(para.text for para in doc.paragraphs[:50] if para.text.strip())[:MAX_TEXT]
elif ext == ".doc":
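            # docx2txt reads the .docx zip container; a legacy binary .doc may
            # raise here and fall through to the error handler below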
with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
temp_file.write(file_obj.getvalue())
text = docx2txt.process(temp_file.name)[:MAX_TEXT]
os.unlink(temp_file.name)
elif ext == ".txt":
text = file_obj.getvalue().decode("utf-8")[:MAX_TEXT]
except Exception as e:
text = f"Error: {str(e)}"
return text
#####################################
# Unified Information Extraction - Optimized
#####################################
@lru_cache(maxsize=16, typed=False)
def extract_info(text):
"""Combined extraction of all candidate info in one pass"""
text_lower = text.lower()
info = {
'name': extract_name_optimized(text),
'age': extract_age_optimized(text_lower),
'industry': extract_industry_optimized(text_lower),
'skills': extract_skills_optimized(text_lower),
'experience': extract_experience_optimized(text)
}
return info
def extract_name_optimized(text):
"""Faster name extraction with reduced checks"""
lines = text.split('\n')[:10]
for line in lines:
if 5 <= len(line) <= 40 and not any(keyword in line.lower() for keyword in ["resume", "cv"]):
return line.strip()
return "Unknown"
def extract_age_optimized(text):
    """Simplified age pattern matching (the digits are always capture group 1)"""
    patterns = [r'\bage\b\s*:?\s*(\d{1,2})', r'\b(\d{1,2})\s+years?\s+old\b']
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return "Not specified"
# Other extract_ functions with similar optimizations...
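# The three helpers below were elided above ("similar optimizations"); these are
# minimal sketches with assumed keyword lists so extract_info() runs end to end --
# swap in the real vocabularies as needed.
def extract_industry_optimized(text):
    """Minimal sketch: first-match lookup against a small assumed industry list"""
    INDUSTRIES = {"software": "Software", "finance": "Finance", "healthcare": "Healthcare",
                  "education": "Education", "marketing": "Marketing"}
    for keyword, label in INDUSTRIES.items():
        if keyword in text:
            return label
    return "Not specified"

def extract_skills_optimized(text):
    """Minimal sketch: intersect the text with a small assumed skill vocabulary"""
    SKILLS = {"python", "java", "c++", "sql", "javascript", "aws", "docker"}
    found = sorted(skill for skill in SKILLS if skill in text)
    return found if found else ["Not specified"]

def extract_experience_optimized(text):
    """Minimal sketch: grab the first 'N years' phrase as total experience"""
    match = re.search(r'(\d{1,2})\+?\s+years?', text, re.IGNORECASE)
    return f"{match.group(1)} years" if match else "Not specified"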
#####################################
# Optimized Summarization
#####################################
def summarize_resume_text(resume_text):
    """Faster summarization with input truncation; regex extraction overlaps model inference"""
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Kick off regex-based info extraction so it runs while the model summarizes
        info_future = executor.submit(extract_info, resume_text)
        base_summary = models['summarizer'](
            resume_text[:1024],
            max_length=150,
            truncation=True
        )[0]['summary_text']
        info = info_future.result()
    return f"**Name**: {info['name']}\n**Age**: {info['age']}\n**Industry**: {info['industry']}\n\n{base_summary}", 0.1
#####################################
# Optimized Scoring System
#####################################
def calculate_google_match_score(summary):
    """Precomputed keyword matching for faster scoring"""
    GOOGLE_KEYWORDS = {
        "Technical Skills": {"python", "java", "c++", "sql", "algorithms"},
        "Advanced Tech": {"ai", "ml", "cloud", "data science"},
        # Add other categories...
    }
    # Assumed per-category weights (should sum to 1); tune alongside the categories above
    WEIGHTS = {"Technical Skills": 0.6, "Advanced Tech": 0.4}
    score = defaultdict(float)
    summary_lower = summary.lower()
    for category, keywords in GOOGLE_KEYWORDS.items():
        # Substring matching so multi-word keywords like "data science" can still hit
        count = sum(1 for kw in keywords if kw in summary_lower)
        score[category] = min(1.0, (count / len(keywords)) * 1.5) if keywords else 0.0
    total = sum(score[cat] * WEIGHTS.get(cat, 0.0) for cat in score)
    return total, score
#####################################
# Streamlit Interface Optimizations
#####################################
st.title("Google Resume Analyzer")
# Initialize progress tracking once per session
if 'progress' not in st.session_state:
    st.session_state.progress = 0
    st.session_state.last_update = time.time()

uploaded_file = st.file_uploader("Upload a resume", type=["docx", "doc", "txt"])
if uploaded_file and st.button("Analyze"):
    with st.spinner("Analyzing resume..."):
# Use session state for progress tracking
start_time = time.time()
# Step 1: Text extraction
text = extract_text_from_file(uploaded_file)
st.session_state.progress = 33
if "Error" in text:
st.error(text)
continue
# Step 2: Information extraction & summarization
summary, _ = summarize_resume_text(text)
st.session_state.progress = 66
# Step 3: Scoring
score, breakdown = calculate_google_match_score(summary)
st.session_state.progress = 100
# Display results
st.subheader("Analysis Complete!")
st.markdown(f"**Match Score**: {score*100:.1f}%")
# Add other displays...
if st.session_state.progress < 100:
    st.progress(st.session_state.progress)
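# Run locally with: streamlit run app.py
# (assumes streamlit, transformers with a torch backend, python-docx, and docx2txt are installed)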