CR7CAD committed on
Commit
0807dc8
·
verified ·
1 Parent(s): 6713758

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -71
app.py CHANGED
@@ -2,10 +2,12 @@ import os
2
  import io
3
  import streamlit as st
4
  import docx
5
- from transformers import pipeline
6
  import time
7
  import tempfile
8
- import subprocess
 
 
 
9
 
10
  # Set page title and hide sidebar
11
  st.set_page_config(
@@ -22,18 +24,33 @@ st.markdown("""
22
  """, unsafe_allow_html=True)
23
 
24
  #####################################
25
- # Preload Models
26
  #####################################
27
  @st.cache_resource(show_spinner=True)
28
  def load_models():
29
- """Load models at startup"""
30
  with st.spinner("Loading AI models... This may take a minute on first run."):
31
  models = {}
32
- # Load summarization model
33
- models['summarizer'] = pipeline("summarization", model="marianna13/flan-t5-base-summarization")
34
 
35
- # Load text generation model for suitability assessment
36
- models['text_generator'] = pipeline("text-generation", model="gpt2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  return models
39
 
@@ -41,42 +58,37 @@ def load_models():
41
  models = load_models()
42
 
43
  #####################################
44
- # Function: Extract Text from File
45
  #####################################
46
- def extract_text_from_file(file_obj):
 
47
  """
48
  Extract text from .doc or .docx files.
49
  Returns the extracted text or an error message if extraction fails.
50
  """
51
- filename = file_obj.name
52
- ext = os.path.splitext(filename)[1].lower()
53
  text = ""
54
 
55
  if ext == ".docx":
56
  try:
57
- document = docx.Document(file_obj)
 
 
58
  text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
59
  except Exception as e:
60
  text = f"Error processing DOCX file: {e}"
61
  elif ext == ".doc":
62
  try:
63
- # For .doc files, we need to save to a temp file and use an external tool
64
- # This example uses antiword which needs to be installed in the environment
65
  with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
66
- temp_file.write(file_obj.getvalue())
67
  temp_path = temp_file.name
68
 
69
- # Try using python-docx2txt if available
70
  try:
71
- import docx2txt
72
  text = docx2txt.process(temp_path)
73
- except ImportError:
74
- # Fallback to antiword if installed
75
- try:
76
- text = subprocess.check_output(['antiword', temp_path]).decode('utf-8')
77
- except:
78
- # If all else fails, inform the user
79
- text = "Could not process .doc file. Please convert to .docx format."
80
 
81
  # Clean up temp file
82
  os.unlink(temp_path)
@@ -88,61 +100,52 @@ def extract_text_from_file(file_obj):
88
  return text
89
 
90
  #####################################
91
- # Function: Summarize Resume Text
92
  #####################################
93
  def summarize_resume_text(resume_text, models):
94
  """
95
- Generates a concise summary of the resume text using the selected summarization model.
96
  """
97
  start_time = time.time()
98
 
99
  summarizer = models['summarizer']
100
 
101
- # Handle long text
102
  max_input_length = 1024 # Model limit
 
103
 
104
- if len(resume_text) > max_input_length:
105
- # Process in chunks if text is too long
106
- chunks = [resume_text[i:i+max_input_length] for i in range(0, min(len(resume_text), 3*max_input_length), max_input_length)]
107
- summaries = []
108
-
109
- for chunk in chunks:
110
- chunk_summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
111
- summaries.append(chunk_summary)
112
-
113
- candidate_summary = " ".join(summaries)
114
- if len(candidate_summary) > max_input_length:
115
- candidate_summary = summarizer(candidate_summary[:max_input_length], max_length=150, min_length=40, do_sample=False)[0]['summary_text']
116
- else:
117
- candidate_summary = summarizer(resume_text, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
118
 
119
  execution_time = time.time() - start_time
120
 
121
  return candidate_summary, execution_time
122
 
123
  #####################################
124
- # Function: Generate Suitability Assessment
125
  #####################################
126
  def generate_suitability_assessment(candidate_summary, company_prompt, models):
127
  """
128
- Generate a suitability assessment using text generation.
129
- Returns the generated assessment text and execution time.
130
  """
131
  start_time = time.time()
132
 
133
  text_generator = models['text_generator']
134
 
135
- # Create a prompt for the text generation model that focuses on candidate alignment with company
136
- prompt = f"""
137
- Resume Summary: {candidate_summary}
138
 
139
- Company Description: {company_prompt}
140
 
141
- Suitability Assessment:
142
- Based on an analysis of the candidate's profile compared to the company requirements, this candidate"""
143
 
144
- # Generate text
145
- max_length = 100 + len(prompt.split()) # Limit output length
146
  generated_text = text_generator(
147
  prompt,
148
  max_length=max_length,
@@ -152,28 +155,26 @@ Based on an analysis of the candidate's profile compared to the company requirem
152
  do_sample=True
153
  )[0]['generated_text']
154
 
155
- # Extract only the assessment part (after the prompt)
156
  assessment = generated_text[len(prompt):].strip()
157
 
158
- # Determine a numerical score from the text
159
- # This is a simplified approach - we're looking for positive and negative words
160
- positive_words = ['excellent', 'perfect', 'great', 'good', 'strong', 'ideal', 'qualified', 'aligns well', 'matches', 'suitable']
161
- negative_words = ['poor', 'weak', 'bad', 'insufficient', 'inadequate', 'not a good fit', 'misaligned', 'lacks', 'does not align']
162
 
163
  assessment_lower = assessment.lower()
164
 
165
- # Simple heuristic for score estimation
166
- score = 0.5 # Default middle score
 
167
 
168
- for word in positive_words:
169
- if word in assessment_lower:
170
- score += 0.1 # Increase score for positive words
171
-
172
- for word in negative_words:
173
- if word in assessment_lower:
174
- score -= 0.1 # Decrease score for negative words
175
 
176
- # Clamp the score between 0 and 1
177
  score = max(0.1, min(0.9, score))
178
 
179
  execution_time = time.time() - start_time
@@ -206,22 +207,27 @@ company_prompt = st.text_area(
206
  # Process button
207
  if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
208
  with st.spinner("Processing..."):
209
- # Extract text from resume
210
- resume_text = extract_text_from_file(uploaded_file)
211
 
212
  if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .doc or .docx file.":
213
  st.error(resume_text)
214
  else:
 
 
 
215
  # Generate summary
216
  summary, summarization_time = summarize_resume_text(resume_text, models)
 
217
 
218
  # Display summary
219
  st.subheader("Candidate Summary")
220
  st.write(summary)
221
  st.info(f"Summarization completed in {summarization_time:.2f} seconds")
222
 
223
- # Generate suitability assessment with text generation
224
  assessment, estimated_score, generation_time = generate_suitability_assessment(summary, company_prompt, models)
 
225
 
226
  # Display assessment
227
  st.subheader("Suitability Assessment")
 
2
  import io
3
  import streamlit as st
4
  import docx
 
5
  import time
6
  import tempfile
7
+ import torch
8
+ import transformers
9
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
10
+ import docx2txt
11
 
12
  # Set page title and hide sidebar
13
  st.set_page_config(
 
24
  """, unsafe_allow_html=True)
25
 
26
  #####################################
27
+ # Optimized Model Loading
28
  #####################################
29
@st.cache_resource(show_spinner=True)
def load_models():
    """Load the summarization and text-generation pipelines once per session.

    Cached by st.cache_resource, so Streamlit reruns reuse the same pipeline
    objects instead of reloading weights. Runs in fp16 on GPU device 0 when
    CUDA is available; otherwise fp32 on CPU (device -1).

    Returns:
        dict: {'summarizer': summarization pipeline,
               'text_generator': text-generation pipeline}
    """
    with st.spinner("Loading AI models... This may take a minute on first run."):
        use_gpu = torch.cuda.is_available()
        dtype = torch.float16 if use_gpu else torch.float32
        dev = 0 if use_gpu else -1

        # Both pipelines share the same precision/device settings.
        shared_kwargs = {"torch_dtype": dtype, "device": dev}
        models = {
            "summarizer": pipeline(
                "summarization",
                model="facebook/bart-large-cnn",
                **shared_kwargs,
            ),
            "text_generator": pipeline(
                "text-generation",
                model="distilgpt2",
                **shared_kwargs,
            ),
        }

    return models
56
 
 
58
# Load (or fetch the cached) pipelines once at app startup.
models = load_models()
59
 
60
  #####################################
61
+ # Function: Extract Text from File - Optimized
62
  #####################################
63
+ @st.cache_data
64
+ def extract_text_from_file(file_content, file_name):
65
  """
66
  Extract text from .doc or .docx files.
67
  Returns the extracted text or an error message if extraction fails.
68
  """
69
+ ext = os.path.splitext(file_name)[1].lower()
 
70
  text = ""
71
 
72
  if ext == ".docx":
73
  try:
74
+ # Use BytesIO to avoid disk I/O
75
+ doc_file = io.BytesIO(file_content)
76
+ document = docx.Document(doc_file)
77
  text = "\n".join(para.text for para in document.paragraphs if para.text.strip())
78
  except Exception as e:
79
  text = f"Error processing DOCX file: {e}"
80
  elif ext == ".doc":
81
  try:
82
+ # For .doc files, we need to save to a temp file
 
83
  with tempfile.NamedTemporaryFile(delete=False, suffix='.doc') as temp_file:
84
+ temp_file.write(file_content)
85
  temp_path = temp_file.name
86
 
87
+ # Use docx2txt which is generally faster
88
  try:
 
89
  text = docx2txt.process(temp_path)
90
+ except Exception:
91
+ text = "Could not process .doc file. Please convert to .docx format."
 
 
 
 
 
92
 
93
  # Clean up temp file
94
  os.unlink(temp_path)
 
100
  return text
101
 
102
  #####################################
103
+ # Function: Summarize Resume Text - Optimized
104
  #####################################
105
def summarize_resume_text(resume_text, models):
    """Generate a concise summary of the resume text in a single pass.

    Args:
        resume_text: Raw text extracted from the resume.
        models: Dict containing a 'summarizer' callable with the HF pipeline
            interface: fn(text, max_length=..., min_length=..., do_sample=...)
            -> [{'summary_text': str}].

    Returns:
        tuple: (summary_text, execution_time_seconds).
    """
    start_time = time.time()

    summarizer = models['summarizer']

    # Truncate overly long input so we only make one summarizer call.
    # NOTE(review): this truncates by *characters*; the model's actual limit
    # is in tokens, so the pipeline may truncate further internally.
    max_input_length = 1024
    # Slicing is a no-op when the text is already short enough, so no
    # length check is needed.
    truncated_text = resume_text[:max_input_length]

    candidate_summary = summarizer(
        truncated_text,
        max_length=150,
        min_length=30,
        do_sample=False
    )[0]['summary_text']

    execution_time = time.time() - start_time

    return candidate_summary, execution_time
128
 
129
  #####################################
130
+ # Function: Generate Suitability Assessment - Optimized
131
  #####################################
132
  def generate_suitability_assessment(candidate_summary, company_prompt, models):
133
  """
134
+ Generate a suitability assessment using text generation - optimized.
 
135
  """
136
  start_time = time.time()
137
 
138
  text_generator = models['text_generator']
139
 
140
+ # Create a shorter, more focused prompt
141
+ prompt = f"""Resume: {candidate_summary[:300]}...
 
142
 
143
+ Company: {company_prompt[:300]}...
144
 
145
+ Suitability Assessment: This candidate"""
 
146
 
147
+ # Generate shorter text for faster completion
148
+ max_length = 50 + len(prompt.split())
149
  generated_text = text_generator(
150
  prompt,
151
  max_length=max_length,
 
155
  do_sample=True
156
  )[0]['generated_text']
157
 
158
+ # Extract only the assessment part
159
  assessment = generated_text[len(prompt):].strip()
160
 
161
+ # Determine a numerical score (simplified for better performance)
162
+ positive_words = ['excellent', 'perfect', 'great', 'good', 'strong', 'ideal', 'qualified', 'aligns', 'matches', 'suitable']
163
+ negative_words = ['poor', 'weak', 'bad', 'insufficient', 'inadequate', 'not a good fit', 'misaligned', 'lacks']
 
164
 
165
  assessment_lower = assessment.lower()
166
 
167
+ # Calculate score
168
+ positive_count = sum(1 for word in positive_words if word in assessment_lower)
169
+ negative_count = sum(1 for word in negative_words if word in assessment_lower)
170
 
171
+ total = positive_count + negative_count
172
+ if total > 0:
173
+ score = 0.5 + 0.4 * (positive_count - negative_count) / total
174
+ else:
175
+ score = 0.5
 
 
176
 
177
+ # Clamp the score
178
  score = max(0.1, min(0.9, score))
179
 
180
  execution_time = time.time() - start_time
 
207
  # Process button
208
  if uploaded_file is not None and company_prompt and st.button("Analyze Resume"):
209
  with st.spinner("Processing..."):
210
+ # Extract text from resume with caching
211
+ resume_text = extract_text_from_file(uploaded_file.getvalue(), uploaded_file.name)
212
 
213
  if resume_text.startswith("Error") or resume_text == "Unsupported file type. Please upload a .doc or .docx file.":
214
  st.error(resume_text)
215
  else:
216
+ # Add a progress bar
217
+ progress_bar = st.progress(0)
218
+
219
  # Generate summary
220
  summary, summarization_time = summarize_resume_text(resume_text, models)
221
+ progress_bar.progress(50)
222
 
223
  # Display summary
224
  st.subheader("Candidate Summary")
225
  st.write(summary)
226
  st.info(f"Summarization completed in {summarization_time:.2f} seconds")
227
 
228
+ # Generate suitability assessment
229
  assessment, estimated_score, generation_time = generate_suitability_assessment(summary, company_prompt, models)
230
+ progress_bar.progress(100)
231
 
232
  # Display assessment
233
  st.subheader("Suitability Assessment")