root committed · Commit 53cdf96 · Parent(s): 9879a34 · "ss"

Browse files:
- app.py +318 -13
- explanation_generator.py +3 -2
- requirements.txt +1 -0
app.py
CHANGED
@@ -63,6 +63,19 @@ with st.sidebar:
     use_explanation = st.checkbox("Generate Explanations", value=True)
     use_faiss = st.checkbox("Use FAISS for fast search", value=True)
 
+    # Memory optimization options
+    st.subheader("Memory Optimization")
+    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
+    clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
+    gc_collect_interval = st.number_input(
+        "Garbage collection interval (files)",
+        min_value=10,
+        max_value=1000,
+        value=100,
+        step=10,
+        help="Run garbage collection after processing this many files"
+    )
+
     st.markdown("---")
     st.markdown("### About")
     st.markdown("This app uses a hybrid ranking system combining semantic similarity with keyword matching to find the most suitable resumes for a job position.")
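These sidebar controls only record the user's preferences; the garbage-collection interval is applied later in the directory-processing loops of this commit. As a rough illustration of that pattern (a minimal sketch with a hypothetical `extract_text` stand-in, not the app's actual code), collecting every N files and again between batches keeps peak memory bounded when extracting text from large resume sets:

```python
import gc
from pathlib import Path


def extract_text(path: str) -> str:
    # Stand-in for the app's real PDF/DOCX extraction.
    return Path(path).read_text(errors="ignore")


def process_paths(paths, batch_size=100, gc_interval=100, optimize_memory=True):
    """Sketch: batch processing with periodic garbage collection."""
    texts = []
    for start in range(0, len(paths), batch_size):
        batch = paths[start:start + batch_size]
        for j, path in enumerate(batch):
            texts.append(extract_text(path))
            if optimize_memory and j > 0 and j % gc_interval == 0:
                gc.collect()  # free intermediate objects within a batch
        if optimize_memory:
            gc.collect()  # and again between batches
    return texts
```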
@@ -102,8 +115,8 @@ class ResumeScreener:
         if "sentence-transformers" in self.embedding_model_name:
             self.model = SentenceTransformer(self.embedding_model_name)
         else:
-            self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name)
-            self.model = AutoModel.from_pretrained(self.embedding_model_name)
+            self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name, trust_remote_code=True)
+            self.model = AutoModel.from_pretrained(self.embedding_model_name, trust_remote_code=True)
 
         st.session_state.embedding_model = self.model
         st.session_state.tokenizer = self.tokenizer
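`trust_remote_code=True` lets transformers execute the custom modeling code that some embedding models ship inside their repositories instead of in the library itself; that custom code is presumably also what pulls in `einops` (see the requirements.txt change below). A minimal sketch of the idea, with a placeholder repo id rather than whatever the app actually configures:

```python
from transformers import AutoModel, AutoTokenizer

# Placeholder repo id: any model whose implementation lives in the repository
# (rather than in the transformers library) must be loaded with trust_remote_code=True.
model_name = "some-org/custom-embedding-model"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
```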
@@ -430,6 +443,35 @@ def get_csv_download_link(df, filename="results.csv"):
     href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
     return href
 
+# Add this new function after the get_csv_download_link function
+def get_huggingface_spaces_datasets():
+    """Check for datasets in Hugging Face Spaces environment"""
+    datasets = []
+
+    # Common dataset paths in Hugging Face Spaces
+    potential_paths = [
+        "/data",  # Common mount point
+        "data",  # Relative path
+        os.path.expanduser("~/data"),  # Home directory
+    ]
+
+    for path in potential_paths:
+        if os.path.exists(path) and os.path.isdir(path):
+            # Look for CSV files
+            csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
+            for csv_file in csv_files:
+                datasets.append(os.path.join(path, csv_file))
+
+            # Look for directories that might contain PDFs
+            for subdir in os.listdir(path):
+                subdir_path = os.path.join(path, subdir)
+                if os.path.isdir(subdir_path):
+                    pdf_count = len([f for f in os.listdir(subdir_path) if f.lower().endswith('.pdf')])
+                    if pdf_count > 0:
+                        datasets.append((subdir_path, f"PDF Directory ({pdf_count} files)"))
+
+    return datasets
+
 # Main app UI
 st.title("Resume Screener & Skill Extractor")
 st.markdown("---")
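The helper returns a mixed list: plain CSV file paths and `(directory, label)` tuples for folders of PDFs, which is why the dataset picker added further down branches on `isinstance(..., tuple)`. A small usage sketch (assuming the function above is in scope):

```python
# Sketch: consuming the mixed return value of get_huggingface_spaces_datasets().
for entry in get_huggingface_spaces_datasets():
    if isinstance(entry, tuple):
        pdf_dir, label = entry
        print(f"PDF directory: {pdf_dir} ({label})")
    else:
        print(f"CSV dataset: {entry}")
```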
@@ -449,7 +491,7 @@ job_description = st.text_area(
 st.header("2. Upload Resumes")
 upload_option = st.radio(
     "Choose upload method:",
-    ["Upload Files", "Upload from Dataset"]
+    ["Upload Files", "Upload from Dataset", "Process Directory"]
 )
 
 uploaded_files = []
@@ -483,16 +525,279 @@ if upload_option == "Upload Files":
 
         st.session_state.resumes_uploaded = True
         st.success(f"Successfully processed {len(resume_texts)} resumes.")
-
-        st.write("
-
-        #
-
-
-
-
-
-        #
+elif upload_option == "Process Directory":
+    st.write("Process resume files from a directory on the server.")
+
+    # Input for directory path
+    resume_dir = st.text_input(
+        "Enter the path to the directory containing resume files:",
+        help="For Hugging Face Spaces, this could be a mounted directory or dataset."
+    )
+
+    # Limit batch size
+    batch_size = st.number_input(
+        "Number of files to process per batch (lower for less memory usage):",
+        min_value=10,
+        max_value=1000,
+        value=100,
+        step=10
+    )
+
+    # File types to process
+    file_types = st.multiselect(
+        "Select file types to process:",
+        ["pdf", "docx", "txt", "csv"],
+        default=["pdf"]
+    )
+
+    if resume_dir and st.button("Process Directory"):
+        if os.path.isdir(resume_dir):
+            # Get all files matching the selected types
+            all_files = []
+            for file_type in file_types:
+                all_files.extend([
+                    os.path.join(resume_dir, f)
+                    for f in os.listdir(resume_dir)
+                    if f.lower().endswith(f'.{file_type}')
+                ])
+
+            if all_files:
+                total_files = len(all_files)
+                st.write(f"Found {total_files} files. Processing in batches of {batch_size}...")
+
+                # Process in batches
+                processed_count = 0
+                progress_bar = st.progress(0)
+                status_text = st.empty()
+
+                for i in range(0, total_files, batch_size):
+                    batch_files = all_files[i:i+batch_size]
+
+                    for j, file_path in enumerate(batch_files):
+                        try:
+                            file_type = file_path.split('.')[-1].lower()
+                            text = screener.extract_text_from_file(file_path, file_type)
+                            if text:
+                                resume_texts.append(text)
+                                file_names.append(os.path.basename(file_path))
+                                processed_count += 1
+
+                            # Apply memory optimization if enabled
+                            if memory_optimization and j % gc_collect_interval == 0 and j > 0:
+                                import gc
+                                gc.collect()
+                                status_text.text(f"Processed {processed_count}/{total_files} files... (ran GC)")
+                        except Exception as e:
+                            st.warning(f"Error processing {file_path}: {str(e)}")
+
+                    # Update progress
+                    progress = min(1.0, (i + len(batch_files)) / total_files)
+                    progress_bar.progress(progress)
+                    status_text.text(f"Processed {processed_count}/{total_files} files...")
+
+                    # Run garbage collection between batches if memory optimization is enabled
+                    if memory_optimization:
+                        import gc
+                        gc.collect()
+
+                # Final garbage collection if memory optimization is enabled
+                if memory_optimization:
+                    import gc
+                    gc.collect()
+
+                st.session_state.resumes_uploaded = True
+                st.success(f"Successfully processed {processed_count} out of {total_files} resume files.")
+            else:
+                st.error(f"No matching files found in {resume_dir}")
+        else:
+            st.error(f"Directory {resume_dir} does not exist or is not accessible.")
+elif upload_option == "Upload from Dataset":
+    # Upload from Dataset implementation
+    st.write("Upload a CSV file containing resume data or load from available datasets.")
+
+    # Check for available datasets in Hugging Face Spaces
+    hf_datasets = get_huggingface_spaces_datasets()
+
+    if hf_datasets:
+        st.subheader("Available Datasets in Hugging Face Spaces")
+        dataset_options = ["None"] + [os.path.basename(ds) if isinstance(ds, str) else f"{os.path.basename(ds[0])} ({ds[1]})" for ds in hf_datasets]
+        selected_dataset = st.selectbox("Select a dataset:", dataset_options)
+
+        if selected_dataset != "None":
+            selected_index = dataset_options.index(selected_dataset) - 1  # Adjust for "None"
+            dataset_path = hf_datasets[selected_index]
+
+            if isinstance(dataset_path, tuple):
+                # It's a PDF directory
+                pdf_dir = dataset_path[0]
+                st.write(f"Selected PDF directory: {pdf_dir}")
+
+                batch_size = st.number_input(
+                    "Number of files to process per batch:",
+                    min_value=10,
+                    max_value=1000,
+                    value=100,
+                    step=10
+                )
+
+                if st.button("Process PDF Directory"):
+                    # Use the same processing logic as in the "Process Directory" option
+                    if os.path.isdir(pdf_dir):
+                        all_files = [
+                            os.path.join(pdf_dir, f)
+                            for f in os.listdir(pdf_dir)
+                            if f.lower().endswith('.pdf')
+                        ]
+
+                        if all_files:
+                            total_files = len(all_files)
+                            st.write(f"Found {total_files} PDF files. Processing in batches of {batch_size}...")
+
+                            # Process in batches
+                            processed_count = 0
+                            progress_bar = st.progress(0)
+                            status_text = st.empty()
+
+                            for i in range(0, total_files, batch_size):
+                                batch_files = all_files[i:i+batch_size]
+
+                                for j, file_path in enumerate(batch_files):
+                                    try:
+                                        text = screener.extract_text_from_file(file_path, "pdf")
+                                        if text:
+                                            resume_texts.append(text)
+                                            file_names.append(os.path.basename(file_path))
+                                            processed_count += 1
+
+                                        # Apply memory optimization if enabled
+                                        if memory_optimization and j % gc_collect_interval == 0 and j > 0:
+                                            import gc
+                                            gc.collect()
+                                    except Exception as e:
+                                        st.warning(f"Error processing {file_path}: {str(e)}")
+
+                                # Update progress
+                                progress = min(1.0, (i + len(batch_files)) / total_files)
+                                progress_bar.progress(progress)
+                                status_text.text(f"Processed {processed_count}/{total_files} files...")
+
+                                # Memory optimization
+                                if memory_optimization:
+                                    import gc
+                                    gc.collect()
+
+                            st.session_state.resumes_uploaded = True
+                            st.success(f"Successfully processed {processed_count} out of {total_files} PDF files.")
+            else:
+                # It's a CSV file
+                st.write(f"Selected CSV dataset: {dataset_path}")
+
+                try:
+                    # Read the CSV file
+                    df = pd.read_csv(dataset_path)
+
+                    # Let user select which column contains the resume text
+                    text_column = st.selectbox(
+                        "Select column containing resume text:",
+                        df.columns.tolist()
+                    )
+
+                    if st.button("Process Selected CSV"):
+                        # Extract text from the selected column
+                        for i, row in df.iterrows():
+                            text = str(row[text_column])
+                            if text and not pd.isna(text):
+                                resume_texts.append(text)
+                                # Use index as filename if no filename column
+                                file_name = f"resume_{i}.txt"
+                                if 'filename' in df.columns:
+                                    file_name = row['filename']
+                                file_names.append(file_name)
+
+                        st.session_state.resumes_uploaded = True
+                        st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
+                except Exception as e:
+                    st.error(f"Error processing CSV: {str(e)}")
+
+    # Rest of the existing Upload from Dataset code
+    dataset_option = st.radio(
+        "Dataset source:",
+        ["Upload CSV", "Use Hugging Face Dataset"]
+    )
+
+    if dataset_option == "Upload CSV":
+        csv_file = st.file_uploader(
+            "Upload CSV file containing resume data",
+            type=["csv"],
+            help="CSV should contain at least a column with resume text."
+        )
+
+        if csv_file:
+            with st.spinner("Processing CSV data..."):
+                # Read the CSV file
+                df = pd.read_csv(csv_file)
+
+                # Let user select which column contains the resume text
+                text_column = st.selectbox(
+                    "Select column containing resume text:",
+                    df.columns.tolist()
+                )
+
+                if st.button("Process Dataset"):
+                    # Extract text from the selected column
+                    for i, row in df.iterrows():
+                        text = str(row[text_column])
+                        if text and not pd.isna(text):
+                            resume_texts.append(text)
+                            # Use index as filename if no filename column
+                            file_name = f"resume_{i}.txt"
+                            if 'filename' in df.columns:
+                                file_name = row['filename']
+                            file_names.append(file_name)
+
+                    st.session_state.resumes_uploaded = True
+                    st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
+    else:
+        # Hugging Face Dataset option
+        dataset_name = st.text_input("Enter Hugging Face dataset name (e.g., 'user/resume_dataset'):")
+        split = st.text_input("Enter dataset split (e.g., 'train'):", "train")
+
+        if dataset_name and st.button("Load Dataset"):
+            with st.spinner(f"Loading dataset {dataset_name}..."):
+                try:
+                    from datasets import load_dataset
+
+                    # Load the dataset
+                    dataset = load_dataset(dataset_name, split=split)
+
+                    # Display dataset info
+                    st.write(f"Dataset loaded with {len(dataset)} entries.")
+
+                    # Let user select which column contains the resume text
+                    if len(dataset.column_names) > 0:
+                        text_column = st.selectbox(
+                            "Select column containing resume text:",
+                            dataset.column_names
+                        )
+
+                        if st.button("Process Hugging Face Dataset"):
+                            # Extract text from the selected column
+                            for i, item in enumerate(dataset):
+                                if text_column in item:
+                                    text = str(item[text_column])
+                                    if text:
+                                        resume_texts.append(text)
+                                        # Use index or id field as filename
+                                        file_name = f"resume_{i}.txt"
+                                        if 'id' in item:
+                                            file_name = f"resume_{item['id']}.txt"
+                                        file_names.append(file_name)
+
+                            st.session_state.resumes_uploaded = True
+                            st.success(f"Successfully processed {len(resume_texts)} resumes from Hugging Face dataset.")
+                except Exception as e:
+                    st.error(f"Error loading dataset: {str(e)}")
+                    st.info("Make sure you have the 'datasets' library installed: pip install datasets")
 
 # Process button
 if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
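The pandas ingestion loop (iterate rows, keep non-empty text, fall back to a synthetic filename) appears twice in this hunk: once for the Spaces CSV dataset and once for the uploaded CSV. A small helper like the following sketch (hypothetical, not part of the commit) would let both branches share it:

```python
import pandas as pd


def extract_resumes_from_dataframe(df: pd.DataFrame, text_column: str):
    """Hypothetical helper: pull resume texts and filenames out of a DataFrame."""
    texts, names = [], []
    for i, row in df.iterrows():
        text = str(row[text_column])
        if text and not pd.isna(text):
            texts.append(text)
            # Fall back to a synthetic name when there is no 'filename' column.
            name = row['filename'] if 'filename' in df.columns else f"resume_{i}.txt"
            names.append(name)
    return texts, names
```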
explanation_generator.py
CHANGED
@@ -34,14 +34,15 @@ class ExplanationGenerator:
             device = "cpu"
 
         # Load tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
 
         # Load model based on available resources
         if device == "cuda":
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
                 torch_dtype=torch.bfloat16,
-                device_map="auto"
+                device_map="auto",
+                trust_remote_code=True
             )
         else:
             # Fall back to a simpler template-based solution if we can't load the model
requirements.txt
CHANGED
@@ -15,3 +15,4 @@ pandas==2.1.3
 numpy==1.24.3
 tqdm==4.66.1
 huggingface-hub==0.25.0
+einops
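`einops` is added unpinned; presumably it is an indirect requirement of the models now loaded with `trust_remote_code=True`, whose repository code commonly imports it. A tiny startup check like this sketch (not part of the commit) makes the failure mode obvious if the dependency is ever dropped:

```python
# Sketch: fail fast with a clear message if the optional dependency is missing.
try:
    import einops  # noqa: F401  (needed by some trust_remote_code model implementations)
except ImportError as exc:
    raise SystemExit("einops is required; add it to requirements.txt") from exc
```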