Commit 53cdf96 · 1 Parent(s): 9879a34
Committed by root
Files changed (3):
  1. app.py +318 -13
  2. explanation_generator.py +3 -2
  3. requirements.txt +1 -0
app.py CHANGED
@@ -63,6 +63,19 @@ with st.sidebar:
     use_explanation = st.checkbox("Generate Explanations", value=True)
     use_faiss = st.checkbox("Use FAISS for fast search", value=True)

+    # Memory optimization options
+    st.subheader("Memory Optimization")
+    memory_optimization = st.checkbox("Enable memory optimization (for large datasets)", value=False)
+    clear_embeddings = st.checkbox("Clear embeddings after processing", value=False)
+    gc_collect_interval = st.number_input(
+        "Garbage collection interval (files)",
+        min_value=10,
+        max_value=1000,
+        value=100,
+        step=10,
+        help="Run garbage collection after processing this many files"
+    )
+
     st.markdown("---")
     st.markdown("### About")
     st.markdown("This app uses a hybrid ranking system combining semantic similarity with keyword matching to find the most suitable resumes for a job position.")
@@ -102,8 +115,8 @@ class ResumeScreener:
         if "sentence-transformers" in self.embedding_model_name:
             self.model = SentenceTransformer(self.embedding_model_name)
         else:
-            self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name)
-            self.model = AutoModel.from_pretrained(self.embedding_model_name)
+            self.tokenizer = AutoTokenizer.from_pretrained(self.embedding_model_name, trust_remote_code=True)
+            self.model = AutoModel.from_pretrained(self.embedding_model_name, trust_remote_code=True)

         st.session_state.embedding_model = self.model
         st.session_state.tokenizer = self.tokenizer
@@ -430,6 +443,35 @@ def get_csv_download_link(df, filename="results.csv"):
     href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
     return href

+# Add this new function after the get_csv_download_link function
+def get_huggingface_spaces_datasets():
+    """Check for datasets in Hugging Face Spaces environment"""
+    datasets = []
+
+    # Common dataset paths in Hugging Face Spaces
+    potential_paths = [
+        "/data",  # Common mount point
+        "data",  # Relative path
+        os.path.expanduser("~/data"),  # Home directory
+    ]
+
+    for path in potential_paths:
+        if os.path.exists(path) and os.path.isdir(path):
+            # Look for CSV files
+            csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
+            for csv_file in csv_files:
+                datasets.append(os.path.join(path, csv_file))
+
+            # Look for directories that might contain PDFs
+            for subdir in os.listdir(path):
+                subdir_path = os.path.join(path, subdir)
+                if os.path.isdir(subdir_path):
+                    pdf_count = len([f for f in os.listdir(subdir_path) if f.lower().endswith('.pdf')])
+                    if pdf_count > 0:
+                        datasets.append((subdir_path, f"PDF Directory ({pdf_count} files)"))
+
+    return datasets
+
 # Main app UI
 st.title("Resume Screener & Skill Extractor")
 st.markdown("---")
@@ -449,7 +491,7 @@ job_description = st.text_area(
 st.header("2. Upload Resumes")
 upload_option = st.radio(
     "Choose upload method:",
-    ["Upload Files", "Upload from Dataset"]
+    ["Upload Files", "Upload from Dataset", "Process Directory"]
 )

 uploaded_files = []
@@ -483,16 +525,279 @@ if upload_option == "Upload Files":

         st.session_state.resumes_uploaded = True
         st.success(f"Successfully processed {len(resume_texts)} resumes.")
-else:
-    st.write("Upload from dataset feature will be implemented soon.")
-    # Here you would implement the connection to Hugging Face datasets
-    # Example pseudocode:
-    # dataset_name = st.text_input("Enter Hugging Face dataset name:")
-    # if st.button("Load Dataset"):
-    #     with st.spinner("Loading dataset..."):
-    #         dataset = load_dataset(dataset_name)
-    #         resume_texts = [item["text"] for item in dataset]
-    #         file_names = [f"resume_{i}.txt" for i in range(len(resume_texts))]
+elif upload_option == "Process Directory":
+    st.write("Process resume files from a directory on the server.")
+
+    # Input for directory path
+    resume_dir = st.text_input(
+        "Enter the path to the directory containing resume files:",
+        help="For Hugging Face Spaces, this could be a mounted directory or dataset."
+    )
+
+    # Limit batch size
+    batch_size = st.number_input(
+        "Number of files to process per batch (lower for less memory usage):",
+        min_value=10,
+        max_value=1000,
+        value=100,
+        step=10
+    )
+
+    # File types to process
+    file_types = st.multiselect(
+        "Select file types to process:",
+        ["pdf", "docx", "txt", "csv"],
+        default=["pdf"]
+    )
+
+    if resume_dir and st.button("Process Directory"):
+        if os.path.isdir(resume_dir):
+            # Get all files matching the selected types
+            all_files = []
+            for file_type in file_types:
+                all_files.extend([
+                    os.path.join(resume_dir, f)
+                    for f in os.listdir(resume_dir)
+                    if f.lower().endswith(f'.{file_type}')
+                ])
+
+            if all_files:
+                total_files = len(all_files)
+                st.write(f"Found {total_files} files. Processing in batches of {batch_size}...")
+
+                # Process in batches
+                processed_count = 0
+                progress_bar = st.progress(0)
+                status_text = st.empty()
+
+                for i in range(0, total_files, batch_size):
+                    batch_files = all_files[i:i+batch_size]
+
+                    for j, file_path in enumerate(batch_files):
+                        try:
+                            file_type = file_path.split('.')[-1].lower()
+                            text = screener.extract_text_from_file(file_path, file_type)
+                            if text:
+                                resume_texts.append(text)
+                                file_names.append(os.path.basename(file_path))
+                                processed_count += 1
+
+                            # Apply memory optimization if enabled
+                            if memory_optimization and j % gc_collect_interval == 0 and j > 0:
+                                import gc
+                                gc.collect()
+                                status_text.text(f"Processed {processed_count}/{total_files} files... (ran GC)")
+                        except Exception as e:
+                            st.warning(f"Error processing {file_path}: {str(e)}")
+
+                    # Update progress
+                    progress = min(1.0, (i + len(batch_files)) / total_files)
+                    progress_bar.progress(progress)
+                    status_text.text(f"Processed {processed_count}/{total_files} files...")
+
+                    # Run garbage collection between batches if memory optimization is enabled
+                    if memory_optimization:
+                        import gc
+                        gc.collect()
+
+                # Final garbage collection if memory optimization is enabled
+                if memory_optimization:
+                    import gc
+                    gc.collect()
+
+                st.session_state.resumes_uploaded = True
+                st.success(f"Successfully processed {processed_count} out of {total_files} resume files.")
+            else:
+                st.error(f"No matching files found in {resume_dir}")
+        else:
+            st.error(f"Directory {resume_dir} does not exist or is not accessible.")
+elif upload_option == "Upload from Dataset":
+    # Upload from Dataset implementation
+    st.write("Upload a CSV file containing resume data or load from available datasets.")
+
+    # Check for available datasets in Hugging Face Spaces
+    hf_datasets = get_huggingface_spaces_datasets()
+
+    if hf_datasets:
+        st.subheader("Available Datasets in Hugging Face Spaces")
+        dataset_options = ["None"] + [os.path.basename(ds) if isinstance(ds, str) else f"{os.path.basename(ds[0])} ({ds[1]})" for ds in hf_datasets]
+        selected_dataset = st.selectbox("Select a dataset:", dataset_options)
+
+        if selected_dataset != "None":
+            selected_index = dataset_options.index(selected_dataset) - 1  # Adjust for "None"
+            dataset_path = hf_datasets[selected_index]
+
+            if isinstance(dataset_path, tuple):
+                # It's a PDF directory
+                pdf_dir = dataset_path[0]
+                st.write(f"Selected PDF directory: {pdf_dir}")
+
+                batch_size = st.number_input(
+                    "Number of files to process per batch:",
+                    min_value=10,
+                    max_value=1000,
+                    value=100,
+                    step=10
+                )
+
+                if st.button("Process PDF Directory"):
+                    # Use the same processing logic as in the "Process Directory" option
+                    if os.path.isdir(pdf_dir):
+                        all_files = [
+                            os.path.join(pdf_dir, f)
+                            for f in os.listdir(pdf_dir)
+                            if f.lower().endswith('.pdf')
+                        ]
+
+                        if all_files:
+                            total_files = len(all_files)
+                            st.write(f"Found {total_files} PDF files. Processing in batches of {batch_size}...")
+
+                            # Process in batches
+                            processed_count = 0
+                            progress_bar = st.progress(0)
+                            status_text = st.empty()
+
+                            for i in range(0, total_files, batch_size):
+                                batch_files = all_files[i:i+batch_size]
+
+                                for j, file_path in enumerate(batch_files):
+                                    try:
+                                        text = screener.extract_text_from_file(file_path, "pdf")
+                                        if text:
+                                            resume_texts.append(text)
+                                            file_names.append(os.path.basename(file_path))
+                                            processed_count += 1
+
+                                        # Apply memory optimization if enabled
+                                        if memory_optimization and j % gc_collect_interval == 0 and j > 0:
+                                            import gc
+                                            gc.collect()
+                                    except Exception as e:
+                                        st.warning(f"Error processing {file_path}: {str(e)}")
+
+                                # Update progress
+                                progress = min(1.0, (i + len(batch_files)) / total_files)
+                                progress_bar.progress(progress)
+                                status_text.text(f"Processed {processed_count}/{total_files} files...")
+
+                                # Memory optimization
+                                if memory_optimization:
+                                    import gc
+                                    gc.collect()
+
+                            st.session_state.resumes_uploaded = True
+                            st.success(f"Successfully processed {processed_count} out of {total_files} PDF files.")
+            else:
+                # It's a CSV file
+                st.write(f"Selected CSV dataset: {dataset_path}")
+
+                try:
+                    # Read the CSV file
+                    df = pd.read_csv(dataset_path)
+
+                    # Let user select which column contains the resume text
+                    text_column = st.selectbox(
+                        "Select column containing resume text:",
+                        df.columns.tolist()
+                    )
+
+                    if st.button("Process Selected CSV"):
+                        # Extract text from the selected column
+                        for i, row in df.iterrows():
+                            text = str(row[text_column])
+                            if text and not pd.isna(text):
+                                resume_texts.append(text)
+                                # Use index as filename if no filename column
+                                file_name = f"resume_{i}.txt"
+                                if 'filename' in df.columns:
+                                    file_name = row['filename']
+                                file_names.append(file_name)
+
+                        st.session_state.resumes_uploaded = True
+                        st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
+                except Exception as e:
+                    st.error(f"Error processing CSV: {str(e)}")
+
+    # Rest of the existing Upload from Dataset code
+    dataset_option = st.radio(
+        "Dataset source:",
+        ["Upload CSV", "Use Hugging Face Dataset"]
+    )
+
+    if dataset_option == "Upload CSV":
+        csv_file = st.file_uploader(
+            "Upload CSV file containing resume data",
+            type=["csv"],
+            help="CSV should contain at least a column with resume text."
+        )
+
+        if csv_file:
+            with st.spinner("Processing CSV data..."):
+                # Read the CSV file
+                df = pd.read_csv(csv_file)
+
+                # Let user select which column contains the resume text
+                text_column = st.selectbox(
+                    "Select column containing resume text:",
+                    df.columns.tolist()
+                )
+
+                if st.button("Process Dataset"):
+                    # Extract text from the selected column
+                    for i, row in df.iterrows():
+                        text = str(row[text_column])
+                        if text and not pd.isna(text):
+                            resume_texts.append(text)
+                            # Use index as filename if no filename column
+                            file_name = f"resume_{i}.txt"
+                            if 'filename' in df.columns:
+                                file_name = row['filename']
+                            file_names.append(file_name)
+
+                    st.session_state.resumes_uploaded = True
+                    st.success(f"Successfully processed {len(resume_texts)} resumes from CSV.")
+    else:
+        # Hugging Face Dataset option
+        dataset_name = st.text_input("Enter Hugging Face dataset name (e.g., 'user/resume_dataset'):")
+        split = st.text_input("Enter dataset split (e.g., 'train'):", "train")
+
+        if dataset_name and st.button("Load Dataset"):
+            with st.spinner(f"Loading dataset {dataset_name}..."):
+                try:
+                    from datasets import load_dataset
+
+                    # Load the dataset
+                    dataset = load_dataset(dataset_name, split=split)
+
+                    # Display dataset info
+                    st.write(f"Dataset loaded with {len(dataset)} entries.")
+
+                    # Let user select which column contains the resume text
+                    if len(dataset.column_names) > 0:
+                        text_column = st.selectbox(
+                            "Select column containing resume text:",
+                            dataset.column_names
+                        )
+
+                        if st.button("Process Hugging Face Dataset"):
+                            # Extract text from the selected column
+                            for i, item in enumerate(dataset):
+                                if text_column in item:
+                                    text = str(item[text_column])
+                                    if text:
+                                        resume_texts.append(text)
+                                        # Use index or id field as filename
+                                        file_name = f"resume_{i}.txt"
+                                        if 'id' in item:
+                                            file_name = f"resume_{item['id']}.txt"
+                                        file_names.append(file_name)
+
+                            st.session_state.resumes_uploaded = True
+                            st.success(f"Successfully processed {len(resume_texts)} resumes from Hugging Face dataset.")
+                except Exception as e:
+                    st.error(f"Error loading dataset: {str(e)}")
+                    st.info("Make sure you have the 'datasets' library installed: pip install datasets")

 # Process button
 if st.button("Find Top Candidates", disabled=not (job_description and resume_texts)):
explanation_generator.py CHANGED
@@ -34,14 +34,15 @@ class ExplanationGenerator:
            device = "cpu"

        # Load tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)

        # Load model based on available resources
        if device == "cuda":
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.bfloat16,
-                device_map="auto"
+                device_map="auto",
+                trust_remote_code=True
            )
        else:
            # Fall back to a simpler template-based solution if we can't load the model
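Note on the explanation_generator.py change: both load calls now pass trust_remote_code=True, which lets transformers download and execute the custom modeling and tokenization code bundled in a model repository (some architectures require this; it should only be enabled for repositories you trust). A minimal sketch of the pattern, using a placeholder model id rather than whatever model_name this app actually configures:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "some-org/some-model"  # placeholder id, not the app's configured model

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # half precision on GPU, as in the diff above
    device_map="auto",           # let accelerate place layers on available devices
    trust_remote_code=True,
)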
requirements.txt CHANGED
@@ -15,3 +15,4 @@ pandas==2.1.3
 numpy==1.24.3
 tqdm==4.66.1
 huggingface-hub==0.25.0
+einops
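Note on the requirements.txt change: unlike the other entries, einops is added without a version pin. It is presumably an import-time dependency of the remote modeling code enabled by the trust_remote_code=True changes above. A quick, hypothetical sanity check that the dependency resolves:

import torch
from einops import rearrange

x = torch.randn(2, 3, 4)            # (batch, seq, dim)
y = rearrange(x, "b s d -> s b d")  # reorder axes by name
assert y.shape == (3, 2, 4)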