Spaces:

CR7CAD
/

ISOM5240FinalProject

Sleeping

App Files Community

CR7CAD committed on Mar 18

Commit

501c91b

verified ·

1 Parent(s): 82f85e7

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -133

app.py CHANGED Viewed

@@ -2,12 +2,10 @@ import os
 import io
 import streamlit as st
 import docx
 import time
-import tempfile
-import torch
-import transformers
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-import docx2txt
 # Set page title and hide sidebar
 st.set_page_config(
@@ -24,33 +22,18 @@ st.markdown("""
 """, unsafe_allow_html=True)
 #####################################
-# Optimized Model Loading
 #####################################
 @st.cache_resource(show_spinner=True)
 def load_models():
-    """Load models at startup with optimizations"""
     with st.spinner("Loading AI models... This may take a minute on first run."):
         models = {}
-        # Use half-precision for all models to reduce memory usage and increase speed
-        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-        device = 0 if torch.cuda.is_available() else -1  # Use GPU if available
-        # Load a smaller summarization model
-        models['summarizer'] = pipeline(
-            "summarization",
-            model="facebook/bart-large-cnn",  # Faster model with good summarization quality
-            torch_dtype=torch_dtype,
-            device=device
-        )
-        # Use a smaller and faster text generation model
-        models['text_generator'] = pipeline(
-            "text-generation",
-            model="distilgpt2",  # Much smaller than GPT-2
-            torch_dtype=torch_dtype,
-            device=device
-        )
         return models
@@ -58,128 +41,91 @@ def load_models():
 models = load_models()
 #####################################
-# Function: Extract Text from File - Optimized
 #####################################
-@st.cache_data
-def extract_text_from_file(file_content, file_name):
     """
-    Extract text from .doc or .docx files.
     Returns the extracted text or an error message if extraction fails.
     """
-    ext = os.path.splitext(file_name)[1].lower()
     text = ""
     if ext == ".docx":
         try:
-            # Use BytesIO to avoid disk I/O
-            doc_file = io.BytesIO(file_content)
-            document = docx.Document(doc_file)
             text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
         except Exception as e:
             text = f"Error processing DOCX file: {e}"
-    elif ext == ".doc":
         try:
-            # For .doc files, we need to save to a temp file
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
-                temp_file.write(file_content)
-                temp_path = temp_file.name
-            # Use docx2txt which is generally faster
-            try:
-                text = docx2txt.process(temp_path)
-            except Exception:
-                text = "Could not process .doc file. Please convert to .docx format."
-            # Clean up temp file
-            os.unlink(temp_path)
         except Exception as e:
-            text = f"Error processing DOC file: {e}"
     else:
-        text = "Unsupported file type. Please upload a .doc or .docx file."
     return text
 #####################################
-# Function: Summarize Resume Text - Optimized
 #####################################
 def summarize_resume_text(resume_text, models):
     """
-    Generates a concise summary of the resume text using an optimized approach.
     """
     start_time = time.time()
     summarizer = models['summarizer']
-    # Truncate text to avoid multiple passes
     max_input_length = 1024  # Model limit
-    truncated_text = resume_text[:max_input_length] if len(resume_text) > max_input_length else resume_text
-    # Get a concise summary in one pass
-    candidate_summary = summarizer(
-        truncated_text,
-        max_length=150,
-        min_length=30,
-        do_sample=False
-    )[0]['summary_text']
     execution_time = time.time() - start_time
     return candidate_summary, execution_time
 #####################################
-# Function: Generate Suitability Assessment - Optimized
 #####################################
-def generate_suitability_assessment(candidate_summary, company_prompt, models):
     """
-    Generate a suitability assessment using text generation - optimized.
     """
     start_time = time.time()
-    text_generator = models['text_generator']
-    # Create a shorter, more focused prompt
-    prompt = f"""Resume: {candidate_summary[:300]}...
-Company: {company_prompt[:300]}...
-Suitability Assessment: This candidate"""
-    # Generate shorter text for faster completion
-    max_length = 50 + len(prompt.split())
-    generated_text = text_generator(
-        prompt,
-        max_length=max_length,
-        num_return_sequences=1,
-        temperature=0.7,
-        top_p=0.9,
-        do_sample=True
-    )[0]['generated_text']
-    # Extract only the assessment part
-    assessment = generated_text[len(prompt):].strip()
-    # Determine a numerical score (simplified for better performance)
-    positive_words = ['excellent', 'perfect', 'great', 'good', 'strong', 'ideal', 'qualified', 'aligns', 'matches', 'suitable']
-    negative_words = ['poor', 'weak', 'bad', 'insufficient', 'inadequate', 'not a good fit', 'misaligned', 'lacks']
-    assessment_lower = assessment.lower()
-    # Calculate score
-    positive_count = sum(1 for word in positive_words if word in assessment_lower)
-    negative_count = sum(1 for word in negative_words if word in assessment_lower)
-    total = positive_count + negative_count
-    if total > 0:
-        score = 0.5 + 0.4 * (positive_count - negative_count) / total
-    else:
-        score = 0.5
-    # Clamp the score
-    score = max(0.1, min(0.9, score))
     execution_time = time.time() - start_time
-    return assessment, score, execution_time
 #####################################
 # Main Streamlit Interface
@@ -187,15 +133,15 @@ Suitability Assessment: This candidate"""
 st.title("Resume Analyzer and Company Suitability Checker")
 st.markdown(
     """
-Upload your resume file in **.doc** or **.docx** format. The app performs the following tasks:
 1. Extracts text from the resume.
 2. Uses a transformer-based model to generate a concise candidate summary.
-3. Evaluates how well the candidate aligns with the company requirements.
 """
 )
 # File uploader
-uploaded_file = st.file_uploader("Upload your resume (.doc or .docx)", type=["doc", "docx"])
 # Company description text area
 company_prompt = st.text_area(
@@ -207,40 +153,35 @@ company_prompt = st.text_area(
 # Process button
 if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
     with st.spinner("Processing..."):
-        # Extract text from resume with caching
-        resume_text = extract_text_from_file(uploaded_file.getvalue(), uploaded_file.name)
-        if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .doc or .docx file.":
             st.error(resume_text)
         else:
-            # Add a progress bar
-            progress_bar = st.progress(0)
             # Generate summary
             summary, summarization_time = summarize_resume_text(resume_text, models)
-            progress_bar.progress(50)
             # Display summary
             st.subheader("Candidate Summary")
             st.write(summary)
             st.info(f"Summarization completed in {summarization_time:.2f} seconds")
-            # Generate suitability assessment
-            assessment, estimated_score, generation_time = generate_suitability_assessment(summary, company_prompt, models)
-            progress_bar.progress(100)
-            # Display assessment
-            st.subheader("Suitability Assessment")
-            st.write(assessment)
-            st.markdown(f"**Estimated Matching Score:** {estimated_score:.2%}")
-            st.info(f"Assessment generated in {generation_time:.2f} seconds")
-            # Provide interpretation based on estimated score
-            if estimated_score >= 0.85:
-                st.success("Excellent match! This candidate's profile is strongly aligned with the company requirements.")
-            elif estimated_score >= 0.70:
-                st.success("Good match! This candidate shows strong potential for the position.")
-            elif estimated_score >= 0.50:
-                st.warning("Moderate match. The candidate meets some requirements but there may be gaps.")
-            else:
-                st.error("Low match. The candidate's profile may not align well with the requirements.")

 import io
 import streamlit as st
 import docx
+from transformers import pipeline
+import numpy as np
+from scipy.spatial.distance import cosine
 import time
 # Set page title and hide sidebar
 st.set_page_config(
 """, unsafe_allow_html=True)
 #####################################
+# Preload Models
 #####################################
 @st.cache_resource(show_spinner=True)
 def load_models():
+    """Load models at startup"""
     with st.spinner("Loading AI models... This may take a minute on first run."):
         models = {}
+        # Load summarization model
+        models['summarizer'] = pipeline("summarization", model="marianna13/flan-t5-base-summarization")
+        # Load feature extraction model for similarity
+        models['feature_extractor'] = pipeline("feature-extraction", model="albert-base-v2")
         return models
 models = load_models()
 #####################################
+# Function: Extract Text from File
 #####################################
+def extract_text_from_file(file_obj):
     """
+    Extract text from .docx files.
     Returns the extracted text or an error message if extraction fails.
     """
+    filename = file_obj.name
+    ext = os.path.splitext(filename)[1].lower()
     text = ""
     if ext == ".docx":
         try:
+            document = docx.Document(file_obj)
             text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
         except Exception as e:
             text = f"Error processing DOCX file: {e}"
+    elif ext == ".txt":
         try:
+            text = file_obj.getvalue().decode("utf-8")
         except Exception as e:
+            text = f"Error processing TXT file: {e}"
     else:
+        text = "Unsupported file type. Please upload a .docx or .txt file."
     return text
 #####################################
+# Function: Summarize Resume Text
 #####################################
 def summarize_resume_text(resume_text, models):
     """
+    Generates a concise summary of the resume text using the selected summarization model.
     """
     start_time = time.time()
     summarizer = models['summarizer']
+    # Handle long text
     max_input_length = 1024  # Model limit
+    if len(resume_text) > max_input_length:
+        # Process in chunks if text is too long
+        chunks = [resume_text[i:i+max_input_length] for i in range(0, min(len(resume_text), 3*max_input_length), max_input_length)]
+        summaries = []
+        for chunk in chunks:
+            chunk_summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
+            summaries.append(chunk_summary)
+        candidate_summary = " ".join(summaries)
+        if len(candidate_summary) > max_input_length:
+            candidate_summary = summarizer(candidate_summary[:max_input_length], max_length=150, min_length=40, do_sample=False)[0]['summary_text']
+    else:
+        candidate_summary = summarizer(resume_text, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
     execution_time = time.time() - start_time
     return candidate_summary, execution_time
 #####################################
+# Function: Compare Candidate Summary to Company Prompt
 #####################################
+def compute_suitability(candidate_summary, company_prompt, models):
     """
+    Compute the similarity between candidate summary and company prompt.
+    Returns a score in the range [0, 1] and execution time.
     """
     start_time = time.time()
+    feature_extractor = models['feature_extractor']
+    # Extract features (embeddings)
+    candidate_features = feature_extractor(candidate_summary)
+    company_features = feature_extractor(company_prompt)
+    # Convert to numpy arrays and flatten if needed
+    candidate_vec = np.mean(np.array(candidate_features[0]), axis=0)
+    company_vec = np.mean(np.array(company_features[0]), axis=0)
+    # Compute cosine similarity (1 - cosine distance)
+    similarity = 1 - cosine(candidate_vec, company_vec)
     execution_time = time.time() - start_time
+    return similarity, execution_time
 #####################################
 # Main Streamlit Interface
 st.title("Resume Analyzer and Company Suitability Checker")
 st.markdown(
     """
+Upload your resume file in **.docx** or **.txt** format. The app performs the following tasks:
 1. Extracts text from the resume.
 2. Uses a transformer-based model to generate a concise candidate summary.
+3. Compares the candidate summary with a company profile to produce a suitability score.
 """
 )
 # File uploader
+uploaded_file = st.file_uploader("Upload your resume (.docx or .txt)", type=["docx", "txt"])
 # Company description text area
 company_prompt = st.text_area(
 # Process button
 if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
     with st.spinner("Processing..."):
+        # Extract text from resume
+        resume_text = extract_text_from_file(uploaded_file)
+        if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .docx or .txt file.":
             st.error(resume_text)
         else:
             # Generate summary
             summary, summarization_time = summarize_resume_text(resume_text, models)
             # Display summary
             st.subheader("Candidate Summary")
             st.write(summary)
             st.info(f"Summarization completed in {summarization_time:.2f} seconds")
+            # Only compute similarity if company description is provided
+            if company_prompt:
+                similarity_score, similarity_time = compute_suitability(summary, company_prompt, models)
+                # Display similarity score
+                st.subheader("Suitability Assessment")
+                st.markdown(f"**Matching Score:** {similarity_score:.2%}")
+                st.info(f"Similarity computation completed in {similarity_time:.2f} seconds")
+                # Provide interpretation
+                if similarity_score >= 0.85:
+                    st.success("Excellent match! This candidate's profile is strongly aligned with the company requirements.")
+                elif similarity_score >= 0.70:
+                    st.success("Good match! This candidate shows strong potential for the position.")
+                elif similarity_score >= 0.50:
+                    st.warning("Moderate match. The candidate meets some requirements but there may be gaps.")
+                else:
+                    st.error("Low match. The candidate's profile may not align well with the requirements.")