import os
import io
import streamlit as st
import docx
import docx2txt
import tempfile
import time
import re
import concurrent.futures
from functools import lru_cache
from transformers import pipeline
from collections import defaultdict
# Set page title and hide sidebar
st.set_page_config(
    page_title="Resume-Google Job Match Analyzer",
    initial_sidebar_state="collapsed"
)

# Hide sidebar completely with custom CSS
st.markdown("""
<style>
[data-testid="collapsedControl"] {display: none;}
section[data-testid="stSidebar"] {display: none;}
</style>
""", unsafe_allow_html=True)
# Pre-defined company description for Google (unchanged)
GOOGLE_DESCRIPTION = """..."""  # Keep your original content here
#####################################
# Preload Models - Optimized with DistilBART
#####################################
@st.cache_resource  # Cache the pipeline so it loads once, not on every Streamlit rerun
def load_models():
    """Load optimized models at startup"""
    with st.spinner("Loading AI models..."):
        models = {
            'summarizer': pipeline(
                "summarization",
                # Distilled BART: smaller and faster than full BART. Replaces the
                # original "distilbart-base-cs", which does not appear to be a
                # published checkpoint on the Hugging Face Hub.
                model="sshleifer/distilbart-cnn-12-6",
                max_length=300,
                truncation=True,
                num_return_sequences=1
            )
        }
    return models

models = load_models()
#####################################
# Function: Extract Text from File - Optimized
#####################################
def extract_text_from_file(file_obj):
    """Optimized text extraction with early exit"""
    filename = file_obj.name
    ext = os.path.splitext(filename)[1].lower()
    text = ""
    MAX_TEXT = 15000  # Reduced processing limit

    try:
        if ext == ".docx":
            doc = docx.Document(file_obj)
            # Only read the first 50 paragraphs and cap total length
            text = "\n".join(para.text for para in doc.paragraphs[:50] if para.text.strip())[:MAX_TEXT]
        elif ext == ".doc":
            # Note: docx2txt officially targets .docx; true legacy .doc files
            # may need a converter such as antiword or textract instead.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
                temp_file.write(file_obj.getvalue())
            # Process after the with-block so the temp file is flushed and closed
            text = docx2txt.process(temp_file.name)[:MAX_TEXT]
            os.unlink(temp_file.name)
        elif ext == ".txt":
            text = file_obj.getvalue().decode("utf-8")[:MAX_TEXT]
    except Exception as e:
        text = f"Error: {str(e)}"
    return text
#####################################
# Unified Information Extraction - Optimized
#####################################
def extract_info(text):
    """Combined extraction of all candidate info in one pass"""
    text_lower = text.lower()
    info = {
        'name': extract_name_optimized(text),
        'age': extract_age_optimized(text_lower),
        'industry': extract_industry_optimized(text_lower),
        'skills': extract_skills_optimized(text_lower),
        'experience': extract_experience_optimized(text)
    }
    return info
def extract_name_optimized(text):
    """Faster name extraction with reduced checks"""
    lines = text.split('\n')[:10]
    for line in lines:
        line = line.strip()  # Strip first so the length check ignores surrounding whitespace
        if 5 <= len(line) <= 40 and not any(keyword in line.lower() for keyword in ["resume", "cv"]):
            return line
    return "Unknown"
def extract_age_optimized(text):
    """Simplified age pattern matching"""
    # One capturing group per pattern so group(1) is always the number
    # (the original captured "age" as group 1 and returned the wrong field)
    patterns = [r'\bage\b\s*:?\s*(\d{1,2})', r'(\d{1,2})\s+years?\s+old']
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return "Not specified"
# Other extract_ functions with similar optimizations; minimal illustrative
# sketches follow (the keyword lists and heuristics below are assumptions,
# not the original implementations).
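def extract_industry_optimized(text):
    """Sketch: map keyword hits to an industry label (keyword sets are assumed)."""
    INDUSTRY_KEYWORDS = {
        'technology': {'software', 'developer', 'engineer', 'programming'},
        'finance': {'banking', 'investment', 'accounting', 'financial'},
        'healthcare': {'medical', 'clinical', 'hospital', 'patient'},
    }
    words = set(text.split())
    best = max(INDUSTRY_KEYWORDS, key=lambda ind: len(INDUSTRY_KEYWORDS[ind] & words))
    return best if INDUSTRY_KEYWORDS[best] & words else "Not specified"

def extract_skills_optimized(text):
    """Sketch: return known skills appearing as substrings (skill list is assumed)."""
    KNOWN_SKILLS = ['python', 'java', 'c++', 'sql', 'javascript', 'aws', 'docker']
    return [skill for skill in KNOWN_SKILLS if skill in text]

def extract_experience_optimized(text):
    """Sketch: pull an 'N years' style experience mention from the raw text."""
    match = re.search(r'(\d{1,2})\+?\s+years?', text, re.IGNORECASE)
    return f"{match.group(1)} years" if match else "Not specified"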
#####################################
# Optimized Summarization
#####################################
def summarize_resume_text(resume_text):
    """Faster summarization with input truncation"""
    # Run the regex-based info extraction in a worker thread *while* the slower
    # transformer summarization runs; the original submitted the job and called
    # .result() after summarization had already finished, gaining no overlap.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        info_future = executor.submit(extract_info, resume_text)
        base_summary = models['summarizer'](
            resume_text[:1024],  # Truncate input to keep inference fast
            max_length=150,
            truncation=True
        )[0]['summary_text']
        info = info_future.result()
    # Second return value is an unused placeholder kept from the original interface
    return f"**Name**: {info['name']}\n**Age**: {info['age']}\n**Industry**: {info['industry']}\n\n{base_summary}", 0.1
#####################################
# Optimized Scoring System
#####################################
def calculate_google_match_score(summary):
    """Precomputed keyword matching for faster scoring"""
    GOOGLE_KEYWORDS = {
        "Technical Skills": {"python", "java", "c++", "sql", "algorithms"},
        "Advanced Tech": {"ai", "ml", "cloud", "data science"},
        # Add other categories...
    }
    # Assumed equal weights summing to 1.0; adjust per category as needed
    WEIGHTS = {category: 1 / len(GOOGLE_KEYWORDS) for category in GOOGLE_KEYWORDS}
    score = defaultdict(float)
    summary_lower = summary.lower()
    for category, keywords in GOOGLE_KEYWORDS.items():
        # Substring matching so multi-word keywords like "data science" still hit
        # (the original set intersection on split() could never match them)
        count = sum(1 for keyword in keywords if keyword in summary_lower)
        score[category] = min(1, (count / len(keywords)) * 1.5) if keywords else 0.0
    total = sum(score[category] * WEIGHTS[category] for category in score)
    return total, score
#####################################
# Streamlit Interface Optimizations
#####################################
st.title("Google Resume Analyzer")

# The uploader was missing from the original snippet but is referenced below
uploaded_file = st.file_uploader("Upload your resume", type=["docx", "doc", "txt"])

st.session_state.progress = 0
st.session_state.last_update = time.time()
if uploaded_file and st.button("Analyze"):
    with st.spinner("Analyzing resume..."):
        # Use session state for progress tracking
        start_time = time.time()
        progress_bar = st.progress(0)

        # Step 1: Text extraction
        text = extract_text_from_file(uploaded_file)
        st.session_state.progress = 33
        progress_bar.progress(st.session_state.progress)
        if text.startswith("Error"):
            st.error(text)
            st.stop()  # 'continue' is invalid outside a loop; stop this script run instead

        # Step 2: Information extraction & summarization
        summary, _ = summarize_resume_text(text)
        st.session_state.progress = 66
        progress_bar.progress(st.session_state.progress)

        # Step 3: Scoring
        score, breakdown = calculate_google_match_score(summary)
        st.session_state.progress = 100
        progress_bar.progress(st.session_state.progress)

        # Display results
        st.subheader("Analysis Complete!")
        st.markdown(f"**Match Score**: {score*100:.1f}%")
        # Add other displays...
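
# Local run sketch (assumed environment; package names inferred from the imports
# above - the `docx` module is provided by the python-docx package):
#   pip install streamlit transformers torch python-docx docx2txt
#   streamlit run app.py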