import streamlit as st
import pandas as pd
from functions import *  # helpers: job_desc_pdf, resume_pdf, preprocess_text, drop_duplicates, etc.

# Placeholder for custom page-background CSS (currently empty).
backgroundPattern = """ """
st.markdown(backgroundPattern, unsafe_allow_html=True)

st.write("""
# Resume Screening & Classification
""")

st.header('Input')
jobs_data = job_desc_pdf()    # job-description PDFs uploaded by the user
resume_data = resume_pdf()    # resume PDFs uploaded by the user

# setup_nltk_resources()

# # Unzip wordnet (Kaggle-specific setup, kept for reference)
# corpora_path = "/kaggle/working/nltk_data/corpora"
# wordnet_zip = os.path.join(corpora_path, "wordnet.zip")
# unzip_nltk_resource(wordnet_zip, corpora_path)

# Apply preprocessing and drop duplicate rows
jobs_data['processed_description'] = jobs_data['description'].apply(preprocess_text)
jobs_data_cleaned = drop_duplicates(jobs_data, column_name='description')

resume_data['processed_resume'] = resume_data['Resume'].apply(preprocess_text)
resume_data_cleaned = drop_duplicates(resume_data, column_name='Resume')

# Add a per-document token count
jobs_data_cleaned_with_tokens = add_token_count_column(jobs_data_cleaned, column_name='processed_description')
resume_data_cleaned_with_tokens = add_token_count_column(resume_data_cleaned, column_name='processed_resume')

# Keep only the columns needed downstream
jobs_data_final = jobs_data_cleaned_with_tokens[['processed_description', 'token_count']]
resume_data_final = resume_data_cleaned_with_tokens[['processed_resume', 'token_count']]

summarizer = TextSummarizer("geekradius/bart-large-cnn-fintetuned-samsum-repo")

# Summarize the top 100 'processed_description' rows of jobs_data_final
top_jobs_data = jobs_data_final.head(100)
jobs_data_summarized = batch_summarize(top_jobs_data, 'processed_description', summarizer,
                                       batch_size=10, output_col='summarized_description')

# Summarize all 'processed_resume' rows in resume_data_final
resume_data_summarized = batch_summarize(resume_data_final, 'processed_resume', summarizer,
                                         batch_size=10, output_col='summarized_resume')
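
# --- Output (hypothetical display step, not part of the original script) ---
# The pipeline above computes `jobs_data_summarized` and `resume_data_summarized`
# but never renders them. A minimal sketch of how the results could be surfaced,
# assuming both DataFrames exist as produced above; st.header, st.subheader, and
# st.dataframe are standard Streamlit calls.
st.header('Output')

st.subheader('Summarized Job Descriptions')
st.dataframe(jobs_data_summarized)

st.subheader('Summarized Resumes')
st.dataframe(resume_data_summarized)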