Commit 53cdf96 · 1 Parent(s): 9879a34
Committed by root
Files changed (3):
  1. app.py +318 -13
  2. explanation_generator.py +3 -2
  3. requirements.txt +1 -0
app.py CHANGED
@@ -63,6 +63,19 @@ with st.sidebar:
     use_explanation = st.checkbox("Generate Explanations", value=True)
     use_faiss = st.checkbox("Use FAISS for fast search", value=True)

+    # Memory optimization options
+    st.subheader("Memory Optimization")
+    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
+    clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
+    gc_collect_interval = st.number_input(
+        "Garbage collection interval (files)",
+        min_value=10,
+        max_value=1000,
+        value=100,
+        step=10,
+        help="Run garbage collection after processing this many files"
+    )
+
     st.markdown("---")
     st.markdown("### About")
     st.markdown("This app uses a hybrid ranking system combining semantic similarity with keyword matching to find the most suitable resumes for a job position.")
@@ -102,8 +115,8 @@ class ResumeScreener:
         if "sentence-transformers" in self.embedding_model_name:
             self.model = SentenceTransformer(self.embedding_model_name)
         else:
-            self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name)
-            self.model = AutoModel.from_pretrained(self.embedding_model_name)
+            self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name, trust_remote_code=True)
+            self.model = AutoModel.from_pretrained(self.embedding_model_name, trust_remote_code=True)

         st.session_state.embedding_model = self.model
         st.session_state.tokenizer = self.tokenizer
@@ -430,6 +443,35 @@ def get_csv_download_link(df, filename="results.csv"):
     href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
     return href

+# Add this new function after the get_csv_download_link function
+def get_huggingface_spaces_datasets():
+    """Check for datasets in Hugging Face Spaces environment"""
+    datasets = []
+
+    # Common dataset paths in Hugging Face Spaces
+    potential_paths = [
+        "/data",  # Common mount point
+        "data",  # Relative path
+        os.path.expanduser("~/data"),  # Home directory
+    ]
+
+    for path in potential_paths:
+        if os.path.exists(path) and os.path.isdir(path):
+            # Look for CSV files
+            csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
+            for csv_file in csv_files:
+                datasets.append(os.path.join(path, csv_file))
+
+            # Look for directories that might contain PDFs
+            for subdir in os.listdir(path):
+                subdir_path = os.path.join(path, subdir)
+                if os.path.isdir(subdir_path):
+                    pdf_count = len([f for f in os.listdir(subdir_path) if f.lower().endswith('.pdf')])
+                    if pdf_count > 0:
+                        datasets.append((subdir_path, f"PDF Directory ({pdf_count} files)"))
+
+    return datasets
+
 # Main app UI
 st.title("Resume Screener & Skill Extractor")
 st.markdown("---")
@@ -449,7 +491,7 @@ job_description = st.text_area(
 st.header("2. Upload Resumes")
 upload_option = st.radio(
     "Choose upload method:",
-    ["Upload Files", "Upload from Dataset"]
+    ["Upload Files", "Upload from Dataset", "Process Directory"]
 )

 uploaded_files = []
@@ -483,16 +525,279 @@ if upload_option == "Upload Files":

         st.session_state.resumes_uploaded = True
         st.success(f"Successfully processed {len(resume_texts)} resumes.")
-else:
-    st.write("Upload from dataset feature will be implemented soon.")
-    # Here you would implement the connection to Hugging Face datasets
-    # Example pseudocode:
-    # dataset_name = st.text_input("Enter Hugging Face dataset name:")
-    # if st.button("Load Dataset"):
-    #     with st.spinner("Loading dataset..."):
-    #         dataset = load_dataset(dataset_name)
-    #         resume_texts = [item["text"] for item in dataset]
-    #         file_names = [f"resume_{i}.txt" for i in range(len(resume_texts))]
+elif upload_option == "Process Directory":
+    st.write("Process resume files from a directory on the server.")
+
+    # Input for directory path
+    resume_dir = st.text_input(
+        "Enter the path to the directory containing resume files:",
+        help="For Hugging Face Spaces, this could be a mounted directory or dataset."
+    )
+
+    # Limit batch size
+    batch_size = st.number_input(
+        "Number of files to process per batch (lower for less memory usage):",
+        min_value=10,
+        max_value=1000,
+        value=100,
+        step=10
+    )
+
+    # File types to process
+    file_types = st.multiselect(
+        "Select file types to process:",
+        ["pdf", "docx", "txt", "csv"],
+        default=["pdf"]
+    )
+
+    if resume_dir and st.button("Process Directory"):
+        if os.path.isdir(resume_dir):
+            # Get all files matching the selected types
+            all_files = []
+            for file_type in file_types:
+                all_files.extend([
+                    os.path.join(resume_dir, f)
+                    for f in os.listdir(resume_dir)
+                    if f.lower().endswith(f'.{file_type}')
+                ])
+
+            if all_files:
+                total_files = len(all_files)
+                st.write(f"Found {total_files} files. Processing in batches of {batch_size}...")
+
+                # Process in batches
+                processed_count = 0
+                progress_bar = st.progress(0)
+                status_text = st.empty()
+
+                for i in range(0, total_files, batch_size):
+                    batch_files = all_files[i:i+batch_size]
+
+                    for j, file_path in enumerate(batch_files):
+                        try:
+                            file_type = file_path.split('.')[-1].lower()
+                            text = screener.extract_text_from_file(file_path, file_type)
+                            if text:
+                                resume_texts.append(text)
+                                file_names.append(os.path.basename(file_path))
+                                processed_count += 1
+
+                            # Apply memory optimization if enabled
+                            if memory_optimization and j % gc_collect_interval == 0 and j > 0:
+                                import gc
+                                gc.collect()
+                                status_text.text(f"Processed {processed_count}/{total_files} files... (ran GC)")
+                        except Exception as e:
+                            st.warning(f"Error processing {file_path}: {str(e)}")
+
+                    # Update progress
+                    progress = min(1.0, (i + len(batch_files)) / total_files)
+                    progress_bar.progress(progress)
+                    status_text.text(f"Processed {processed_count}/{total_files} files...")
+
+                    # Run garbage collection between batches if memory optimization is enabled
+                    if memory_optimization:
+                        import gc
+                        gc.collect()
+
+                # Final garbage collection if memory optimization is enabled
+                if memory_optimization:
+                    import gc
+                    gc.collect()
+
+                st.session_state.resumes_uploaded = True
+                st.success(f"Successfully processed {processed_count} out of {total_files} resume files.")
+            else:
+                st.error(f"No matching files found in {resume_dir}")
+        else:
+            st.error(f"Directory {resume_dir} does not exist or is not accessible.")
+elif upload_option == "Upload from Dataset":
+    # Upload from Dataset implementation
+    st.write("Upload a CSV file containing resume data or load from available datasets.")
+
+    # Check for available datasets in Hugging Face Spaces
+    hf_datasets = get_huggingface_spaces_datasets()
+
+    if hf_datasets:
+        st.subheader("Available Datasets in Hugging Face Spaces")
+        dataset_options = ["None"] + [os.path.basename(ds) if isinstance(ds, str) else f"{os.path.basename(ds[0])} ({ds[1]})" for ds in hf_datasets]
+        selected_dataset = st.selectbox("Select a dataset:", dataset_options)
+
+        if selected_dataset != "None":
+            selected_index = dataset_options.index(selected_dataset) - 1  # Adjust for "None"
+            dataset_path = hf_datasets[selected_index]
+
+            if isinstance(dataset_path, tuple):
+                # It's a PDF directory
+                pdf_dir = dataset_path[0]
+                st.write(f"Selected PDF directory: {pdf_dir}")
+
+                batch_size = st.number_input(
+                    "Number of files to process per batch:",
+                    min_value=10,
+                    max_value=1000,
+                    value=100,
+                    step=10
+                )
+
+                if st.button("Process PDF Directory"):
+                    # Use the same processing logic as in the "Process Directory" option
+                    if os.path.isdir(pdf_dir):
+                        all_files = [
+                            os.path.join(pdf_dir, f)
+                            for f in os.listdir(pdf_dir)
+                            if f.lower().endswith('.pdf')
+                        ]
+
+                        if all_files:
+                            total_files = len(all_files)
+                            st.write(f"Found {total_files} PDF files. Processing in batches of {batch_size}...")
+
+                            # Process in batches
+                            processed_count = 0
+                            progress_bar = st.progress(0)
+                            status_text = st.empty()
+
+                            for i in range(0, total_files, batch_size):
+                                batch_files = all_files[i:i+batch_size]
+
+                                for j, file_path in enumerate(batch_files):
+                                    try:
+                                        text = screener.extract_text_from_file(file_path, "pdf")
+                                        if text:
+                                            resume_texts.append(text)
+                                            file_names.append(os.path.basename(file_path))
+                                            processed_count += 1
+
+                                        # Apply memory optimization if enabled
+                                        if memory_optimization and j % gc_collect_interval == 0 and j > 0:
+                                            import gc
+                                            gc.collect()
+                                    except Exception as e:
+                                        st.warning(f"Error processing {file_path}: {str(e)}")
+
+                                # Update progress
+                                progress = min(1.0, (i + len(batch_files)) / total_files)
+                                progress_bar.progress(progress)
+                                status_text.text(f"Processed {processed_count}/{total_files} files...")
+
+                                # Memory optimization
+                                if memory_optimization:
+                                    import gc
+                                    gc.collect()
+
+                            st.session_state.resumes_uploaded = True
+                            st.success(f"Successfully processed {processed_count} out of {total_files} PDF files.")
+            else:
+                # It's a CSV file
+                st.write(f"Selected CSV dataset: {dataset_path}")
+
+                try:
+                    # Read the CSV file
+                    df = pd.read_csv(dataset_path)
+
+                    # Let user select which column contains the resume text
+                    text_column = st.selectbox(
+                        "Select column containing resume text:",
+                        df.columns.tolist()
+                    )
+
+                    if st.button("Process Selected CSV"):
+                        # Extract text from the selected column
+                        for i, row in df.iterrows():
+                            text = str(row[text_column])
+                            if text and not pd.isna(text):
+                                resume_texts.append(text)
+                                # Use index as filename if no filename column
+                                file_name = f"resume_{i}.txt"
+                                if 'filename' in df.columns:
+                                    file_name = row['filename']
+                                file_names.append(file_name)
+
+                        st.session_state.resumes_uploaded = True
+                        st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
+                except Exception as e:
+                    st.error(f"Error processing CSV: {str(e)}")
+
+    # Rest of the existing Upload from Dataset code
+    dataset_option = st.radio(
+        "Dataset source:",
+        ["Upload CSV", "Use Hugging Face Dataset"]
+    )
+
+    if dataset_option == "Upload CSV":
+        csv_file = st.file_uploader(
+            "Upload CSV file containing resume data",
+            type=["csv"],
+            help="CSV should contain at least a column with resume text."
+        )
+
+        if csv_file:
+            with st.spinner("Processing CSV data..."):
+                # Read the CSV file
+                df = pd.read_csv(csv_file)
+
+                # Let user select which column contains the resume text
+                text_column = st.selectbox(
+                    "Select column containing resume text:",
+                    df.columns.tolist()
+                )
+
+                if st.button("Process Dataset"):
+                    # Extract text from the selected column
+                    for i, row in df.iterrows():
+                        text = str(row[text_column])
+                        if text and not pd.isna(text):
+                            resume_texts.append(text)
+                            # Use index as filename if no filename column
+                            file_name = f"resume_{i}.txt"
+                            if 'filename' in df.columns:
+                                file_name = row['filename']
+                            file_names.append(file_name)
+
+                    st.session_state.resumes_uploaded = True
+                    st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
+    else:
+        # Hugging Face Dataset option
+        dataset_name = st.text_input("Enter Hugging Face dataset name (e.g., 'user/resume_dataset'):")
+        split = st.text_input("Enter dataset split (e.g., 'train'):", "train")
+
+        if dataset_name and st.button("Load Dataset"):
+            with st.spinner(f"Loading dataset {dataset_name}..."):
+                try:
+                    from datasets import load_dataset
+
+                    # Load the dataset
+                    dataset = load_dataset(dataset_name, split=split)
+
+                    # Display dataset info
+                    st.write(f"Dataset loaded with {len(dataset)} entries.")
+
+                    # Let user select which column contains the resume text
+                    if len(dataset.column_names) > 0:
+                        text_column = st.selectbox(
+                            "Select column containing resume text:",
+                            dataset.column_names
+                        )
+
+                        if st.button("Process Hugging Face Dataset"):
+                            # Extract text from the selected column
+                            for i, item in enumerate(dataset):
+                                if text_column in item:
+                                    text = str(item[text_column])
+                                    if text:
+                                        resume_texts.append(text)
+                                        # Use index or id field as filename
+                                        file_name = f"resume_{i}.txt"
+                                        if 'id' in item:
+                                            file_name = f"resume_{item['id']}.txt"
+                                        file_names.append(file_name)
+
+                            st.session_state.resumes_uploaded = True
+                            st.success(f"Successfully processed {len(resume_texts)} resumes from Hugging Face dataset.")
+                except Exception as e:
+                    st.error(f"Error loading dataset: {str(e)}")
+                    st.info("Make sure you have the 'datasets' library installed: pip install datasets")

 # Process button
 if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
explanation_generator.py CHANGED
@@ -34,14 +34,15 @@ class ExplanationGenerator:
            device = "cpu"

        # Load tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)

        # Load model based on available resources
        if device == "cuda":
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.bfloat16,
-                device_map="auto"
+                device_map="auto",
+                trust_remote_code=True
            )
        else:
            # Fall back to a simpler template-based solution if we can't load the model
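Note on the explanation_generator.py change: both load calls now pass trust_remote_code=True, which lets transformers download and execute the custom modeling and tokenization code bundled in a model repository (some architectures require this; it should only be enabled for repositories you trust). A minimal sketch of the pattern, using a placeholder model id rather than whatever model_name this app actually configures:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "some-org/some-model"  # placeholder id, not the app's configured model

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # half precision on GPU, as in the diff above
    device_map="auto",           # let accelerate place layers on available devices
    trust_remote_code=True,
)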
requirements.txt CHANGED
@@ -15,3 +15,4 @@ pandas==2.1.3
 numpy==1.24.3
 tqdm==4.66.1
 huggingface-hub==0.25.0
+einops
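Note on the requirements.txt change: unlike the other entries, einops is added without a version pin. It is presumably an import-time dependency of the remote modeling code enabled by the trust_remote_code=True changes above. A quick, hypothetical sanity check that the dependency resolves:

import torch
from einops import rearrange

x = torch.randn(2, 3, 4)            # (batch, seq, dim)
y = rearrange(x, "b s d -> s b d")  # reorder axes by name
assert y.shape == (3, 2, 4)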