Spaces:

Prernas19
/

resume_analysis

Sleeping

App Files Files Community

Prernas19 committed on Jul 26, 2024

Commit

9e79309

verified ·

1 Parent(s): adcf648

Create app.py

Browse files

Files changed (1) hide show

app.py +200 -0

app.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import os
+import re
+import fitz  # Importing PyMuPDF for PDF text extraction
+import nltk
+from transformers import BertTokenizer, BertModel
+import torch
+from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import pandas as pd
+import gradio as gr
# Fetch the NLTK 'punkt' and 'stopwords' data packages at import time.
# NOTE(review): nothing in the code visible here reads them - confirm they are still needed.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
# Load the BERT tokenizer and encoder once at import time; get_bert_embeddings()
# reads these two module-level objects on every call.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)
+# Function to preprocess text
def preprocess_text(text):
    """Return *text* lower-cased with every run of non-word characters collapsed.

    Each maximal run of characters outside the regex word class (letters,
    digits and underscore) is replaced by a single space, so punctuation and
    all whitespace become plain space separators. Leading/trailing separators
    are not stripped.
    """
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)
+# Function to extract keywords using TF-IDF
def extract_keywords_tfidf(text, max_features=50):
    """Return the TF-IDF vocabulary terms of *text*, highest score first.

    The vectorizer is fitted on *text* alone (a one-document corpus), with
    English stop words removed and the vocabulary capped at *max_features*.

    Args:
        text: Document to extract keywords from.
        max_features: Upper bound on the number of terms kept.

    Returns:
        A list of terms sorted by descending score; terms with equal scores
        are ordered by descending term. An empty list is returned when *text*
        is empty/blank or contains no usable term (for example a scanned PDF
        that yielded no text), instead of raising.
    """
    # Nothing to vectorize: skip scikit-learn entirely.
    if not text or not text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # Raised by scikit-learn when the vocabulary ends up empty, e.g. the
        # text consists solely of stop words or single-character tokens.
        return []

    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray().ravel()
    ranked = sorted(zip(scores, feature_names), reverse=True)
    return [keyword for _, keyword in ranked]
+# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order, with no extra separator.
    """
    # The context manager closes the document (and its file handle) even if
    # extracting a page fails.
    with fitz.open(pdf_path) as document:
        return "".join(page.get_text() for page in document)
+# Function to give feedback on resume
def give_feedback(resume_text, job_description):
    """Run heuristic checks on a resume and return improvement suggestions.

    Args:
        resume_text: Raw text of the resume.
        job_description: Text of the job description the resume is compared to.

    Returns:
        A list of suggestion strings, one per failed check. When every check
        passes, the list holds a single positive message instead.
    """
    feedback = []
    lowered_resume = resume_text.lower()

    # Formatting: mixed bullet styles. A dash only counts as a bullet when it
    # opens a line; hyphens inside words or date ranges ("full-time",
    # "2019-2021") are not bullets.
    has_dot_bullets = '•' in resume_text
    has_dash_bullets = re.search(r'^\s*-\s', resume_text, re.MULTILINE) is not None
    if has_dot_bullets and has_dash_bullets:
        feedback.append("Consider using a consistent bullet point style throughout your resume.")

    # Structure: at least one of the standard section headings must appear.
    if not re.search(r'\bexperience\b|\beducation\b|\bskills\b', lowered_resume):
        feedback.append("Make sure your resume includes sections like Experience, Education, and Skills.")

    # Relevance: overlap between the job-description and resume keywords.
    jd_keywords = extract_keywords_tfidf(preprocess_text(job_description))
    resume_keywords = extract_keywords_tfidf(preprocess_text(resume_text))
    common_keywords = set(jd_keywords) & set(resume_keywords)
    if len(common_keywords) < 8:
        feedback.append(f"Your resume could better match the job description. Consider adding keywords such as: {', '.join(jd_keywords[:5])}.")

    # Action verbs: match whole words only, otherwise "led" is found inside
    # "scheduled" or "skilled" and the check can practically never fail.
    action_verbs = ["managed", "led", "developed", "designed", "implemented", "created"]
    action_verb_pattern = r'\b(?:' + '|'.join(map(re.escape, action_verbs)) + r')\b'
    if not re.search(action_verb_pattern, lowered_resume):
        feedback.append("Consider using strong action verbs to describe your achievements and responsibilities.")

    # Summary / objective statement.
    if not re.search(r'\bsummary\b|\bobjective\b', resume_text, re.IGNORECASE):
        feedback.append("Consider adding a professional summary or objective statement to provide a quick overview of your qualifications.")

    # Quantifiable achievements: any digit at all counts.
    if not re.search(r'\d', resume_text):
        feedback.append("Include quantifiable achievements in your experience section (e.g., increased sales by 20%).")

    # Nothing to criticise: return a positive note so the result is never empty.
    if not feedback:
        feedback.append("Your resume is well-aligned with the job description. Ensure to keep it updated with relevant keywords and achievements.")
    return feedback
+# Function to compute BERT embeddings
def get_bert_embeddings(text):
    """Embed *text* with the module-level BERT tokenizer and model.

    The text is tokenized with truncation and padding enabled and run through
    the model with gradient tracking disabled. The final hidden states are
    averaged over dimension 1, squeezed, and returned as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()
+# Function to calculate BERT similarity score
def bert_similarity(resume, job_description):
    """Return the cosine similarity between the BERT embeddings of two texts.

    Both texts are embedded with get_bert_embeddings() (resume first, then the
    job description) and the single entry of the resulting 1x1 similarity
    matrix is returned.
    """
    resume_vec, jd_vec = (get_bert_embeddings(doc) for doc in (resume, job_description))
    similarity_matrix = cosine_similarity([resume_vec], [jd_vec])
    return similarity_matrix[0][0]
+# Function to calculate TF-IDF cosine similarity score
def tfidf_cosine_similarity(resume, jd):
    """Return the cosine similarity of two texts in a shared TF-IDF space.

    A default TfidfVectorizer is fitted on the two-document corpus
    ``[resume, jd]``; row 0 (resume) is then compared with row 1 (job
    description).
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([resume, jd])
    resume_row = tfidf_matrix[0:1]
    jd_row = tfidf_matrix[1:2]
    return cosine_similarity(resume_row, jd_row)[0][0]
+# Function to calculate Doc2Vec cosine similarity score
def doc2vec_cosine_similarity(resume, jd, model):
    """Return the cosine similarity of two texts using inferred Doc2Vec vectors.

    Each text is split on whitespace and passed to ``model.infer_vector``
    (resume first, then the job description); the two vectors are then
    compared. Note that the *model* parameter shadows the module-level BERT
    ``model`` inside this function.
    """
    resume_vector, jd_vector = (model.infer_vector(doc.split()) for doc in (resume, jd))
    return cosine_similarity([resume_vector], [jd_vector])[0][0]
+# Function to extract years of experience from resume
def extract_years_of_experience(text):
    """Return the total of all "<N> years" figures mentioned in *text*.

    Recognised forms (case-insensitive): "5 years", "1 year", "5+ years",
    "5years" and the abbreviations "yr"/"yrs". For a decimal figure such as
    "2.5 years" only the whole-number part (2) is counted, keeping the result
    an integer.

    Every mention is added up, so a resume stating "3 years" in one role and
    "2 years" in another yields 5.

    Args:
        text: Resume text to scan.

    Returns:
        The summed number of years as an int, or 0 when none is found.
    """
    # (?<![\d.])  - never start inside a number, so "2.5 years" is not read as 5
    # (?:\.\d+)?  - swallow a fractional part without counting it
    # \+?         - allow "5+ years"
    # \b          - require the unit to end there, so "yearly" is not a match
    pattern = r'(?<![\d.])(\d+)(?:\.\d+)?\s*\+?\s*(?:years?|yrs?)\b'
    return sum(int(amount) for amount in re.findall(pattern, text, re.IGNORECASE))
+# Function to extract information from resumes in a folder
def extract_info_from_resumes(resume_files, job_description):
    """Score every uploaded resume against *job_description*.

    Args:
        resume_files: Uploaded resumes, either as path strings / path-like
            objects or as file-like objects exposing the path in ``.name``.
        job_description: Text of the job description.

    Returns:
        A list with one dict per resume, holding the keys ``Name`` (file name
        without extension), ``Keyword_Match_Score``, ``TFIDF_Score``,
        ``Doc2Vec_Score``, ``BERT_Score``, ``Years_of_Experience`` and
        ``Feedback`` (all suggestions joined with "; ").
    """
    def _path_of(upload):
        # Uploads may be plain paths or temp-file wrappers carrying the path in .name.
        if isinstance(upload, (str, os.PathLike)):
            return upload
        return upload.name

    # Read and clean each PDF exactly once; the results feed both the Doc2Vec
    # training corpus and the per-resume scoring below.
    paths = [_path_of(upload) for upload in resume_files]
    raw_texts = [extract_text_from_pdf(path) for path in paths]
    clean_texts = [preprocess_text(raw) for raw in raw_texts]

    # Job-description features are identical for every resume: compute once.
    clean_jd = preprocess_text(job_description)
    jd_keywords = set(extract_keywords_tfidf(clean_jd))

    # Train Doc2Vec on all resumes plus the job description.
    corpus = clean_texts + [clean_jd]
    tagged_docs = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(corpus)]
    model_doc2vec = Doc2Vec(tagged_docs, vector_size=50, window=2, min_count=1, workers=4)

    data = []
    for path, raw_text, clean_text in zip(paths, raw_texts, clean_texts):
        resume_keywords = set(extract_keywords_tfidf(clean_text))
        feedback = give_feedback(raw_text, job_description)
        data.append({
            'Name': os.path.splitext(os.path.basename(path))[0],
            # Number of keywords shared with the job description (whole number).
            'Keyword_Match_Score': len(jd_keywords & resume_keywords),
            'TFIDF_Score': tfidf_cosine_similarity(raw_text, job_description),
            'Doc2Vec_Score': doc2vec_cosine_similarity(clean_text, clean_jd, model_doc2vec),
            'BERT_Score': bert_similarity(clean_text, clean_jd),
            'Years_of_Experience': extract_years_of_experience(raw_text),
            'Feedback': '; '.join(feedback),
        })
    return data
+# Function to save data to an Excel file
def save_to_excel(data, output_file):
    """Write *data* to an Excel workbook at *output_file*.

    *data* is converted to a DataFrame and written without the index column.

    Returns:
        *output_file* on success. If writing raises, the exception is not
        propagated; the string ``"Error saving file: <exception>"`` is
        returned instead, so callers must compare the result with the path
        they passed in to detect failure.
    """
    frame = pd.DataFrame(data)
    try:
        frame.to_excel(output_file, index=False)
    except Exception as exc:
        return f"Error saving file: {exc}"
    else:
        return output_file
+# Gradio interface function
def gradio_interface(resume_files, job_description):
    """Gradio callback: analyse the uploaded resumes and return an Excel report.

    Args:
        resume_files: Uploaded resume files as delivered by the upload
            component; may be empty or None.
        job_description: Job description text entered by the user.

    Returns:
        Path of the generated ``Resume_Analysis.xlsx`` workbook.

    Raises:
        gr.Error: When no resume was uploaded or the workbook could not be
            written. The output component expects a file path, so problems
            are reported to the user through Gradio's error mechanism rather
            than returned as a message string.
    """
    import tempfile

    if not resume_files:
        raise gr.Error("No resumes to process.")

    # A fresh temporary directory per request works on any host (no
    # dependency on a Colab-style /content folder) and keeps concurrent
    # requests from overwriting each other's report.
    output_file = os.path.join(tempfile.mkdtemp(), 'Resume_Analysis.xlsx')
    resumes = extract_info_from_resumes(resume_files, job_description)
    result = save_to_excel(resumes, output_file)

    # save_to_excel signals failure by returning a message instead of the path.
    if result != output_file:
        raise gr.Error(result)
    return result
# Gradio UI: a multi-file resume upload and a job-description box as inputs,
# one downloadable file as output, all wired to gradio_interface().
resume_upload = gr.Files(label="Upload multiple Resumes", type="filepath")
job_description_box = gr.Textbox(
    label="Job Description",
    lines=5,
    placeholder="Enter the job description here...",
)
results_download = gr.File(label="Download Results")

iface = gr.Interface(
    fn=gradio_interface,
    inputs=[resume_upload, job_description_box],
    outputs=results_download,
    description="Upload multiple resume PDFs and provide a job description to analyze the resumes and get an Excel file with the results.",
)

# Start the app as soon as this module is executed.
iface.launch(inline=False)