Spaces:

resumebuild
/

Anushka

Sleeping

App Files Files Community

Anushkabhat9 committed on Dec 6, 2024

Commit

ce00033

verified ·

1 Parent(s): 33d6177

Update similarity_score_refined.py

Browse files

Files changed (1) hide show

similarity_score_refined.py +114 -138

similarity_score_refined.py CHANGED Viewed

@@ -1,146 +1,122 @@
-# -*- coding: utf-8 -*-
-"""Similarity_score_refined (2).ipynb
-Automatically generated by Colab.
-Original file is located at
-    https://colab.research.google.com/drive/1c8mlCBnLbduLsI8rUGFEOYDuyBqdz2JJ
-"""
-# !pip install sentence_transformers
-# !pip install openai==0.28
-# !pip install docx2txt PyPDF2 transformers
-# from google.colab import drive,userdata
-# drive.mount("/content/drive")
-# print("Google Drive mounted.")
-import re
-from sklearn.feature_extraction.text import TfidfVectorizer
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-import os
-# Ensure you have downloaded stopwords and wordnet
-import nltk
-nltk.download('stopwords')
-nltk.download('wordnet')
-def extract_text(file_path):
-    import docx2txt
-    import PyPDF2
-    if file_path.endswith(".docx"):
-        # Extract text from DOCX file
-        return docx2txt.process(file_path)
-    elif file_path.endswith(".pdf"):
-        # Extract text from PDF file
-        text = ""
-        with open(file_path, 'rb') as file:
             reader = PyPDF2.PdfReader(file)
             for page_num in range(len(reader.pages)):
-                text += reader.pages[page_num].extract_text()
-        return text
-    else:
-        raise ValueError("Unsupported file type")
-def preprocess(text):
-    # Lowercase the text
-    text = text.lower()
-    # Remove special characters and numbers
-    text = re.sub(r'[^a-z\s]', '', text)
-    # Tokenize the text by splitting on whitespace
-    words = text.split()
-    # Remove stop words
-    stop_words = set(stopwords.words('english'))
-    words = [word for word in words if word not in stop_words]
-    # Lemmatize the words (to get root form)
-    lemmatizer = WordNetLemmatizer()
-    words = [lemmatizer.lemmatize(word) for word in words]
-    # Join words back into a single string
-    return ' '.join(words)
-def calculate_tfidf(doc):
-    vectorizer = TfidfVectorizer()
-    tfidf_matrix = vectorizer.fit_transform([doc])  # Only fit on the individual document
-    feature_names = vectorizer.get_feature_names_out()
-    dense_tfidf_matrix = tfidf_matrix.todense()
-    # Extract important terms from the document with a threshold
-    important_terms = [feature_names[i] for i in range(len(feature_names)) if dense_tfidf_matrix[0, i] > 0.2]
-    return ' '.join(important_terms)
-def call_chatgpt_api(prompt, api_key,model="gpt-3.5-turbo"):
-    import openai
-    openai.api_key = api_key
-    response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": prompt}
-        ],
-        max_tokens=500,
-        temperature= 0,
-        top_p=1,
-        frequency_penalty= 0,
-        presence_penalty= 0
-    )
-    return response['choices'][0]['message']['content'].strip()
-def calculate_similarity(resume, job_desc, model_name="sentence-transformers/all-MiniLM-L6-v2"):
-    from sentence_transformers import SentenceTransformer, util
-    model = SentenceTransformer(model_name)
-    # Convert texts to embeddings
-    embeddings1 = model.encode(resume, convert_to_tensor=True)
-    embeddings2 = model.encode(job_desc, convert_to_tensor=True)
-    # Calculate cosine similarity
-    similarity_score = util.pytorch_cos_sim(embeddings1, embeddings2)
-    return similarity_score.item()  # return as a scalar
-def similarity_main(resume_path,job_description_path):
-    # Extract text from files (replace with actual file paths)
-    Resume_text = extract_text(resume_path)
-    job_des = extract_text(job_description_path)
-    api_key=os.environ.get('OPENAI_KEY')
-    prompt=f"Extract the skills or competencies section from the resume. Avoid using name of the candidate:\n\n{Resume_text}"
-    resume_skills = call_chatgpt_api(prompt,api_key)
-    experience_prompt = f"Extract the experience of the candidate from the resume. Avoid using name of the candidate:\n\n{Resume_text}"
-    resume_experience = call_chatgpt_api(experience_prompt,api_key)
-    # Extract sections from job description (JD)
-    jd_skills_prompt = f"Extract the skills section from the job description:\n\n{job_des}"
-    jd_skills = call_chatgpt_api(jd_skills_prompt,api_key)
-    jd_experience_prompt = f"Extract the experience section from the job description:\n\n{job_des}"
-    jd_experience = call_chatgpt_api(jd_experience_prompt,api_key)
-    resume_skills_clean = preprocess(resume_skills)
-    jd_skills_clean = preprocess(jd_skills)
-    resume_experience_clean = preprocess(resume_experience)
-    jd_experience_clean = preprocess(jd_experience)
-    filtered_resume = calculate_tfidf(resume_skills_clean)
-    filtered_jd = calculate_tfidf(jd_skills_clean)
-    similarity_skills=calculate_similarity(filtered_resume,filtered_jd)
-    filtered_resume_ex = calculate_tfidf(resume_experience_clean)
-    filtered_jd_ex = calculate_tfidf(jd_experience_clean)
-    similarity_ex=calculate_similarity(filtered_resume_ex,filtered_jd_ex)
-    Average_Score=(similarity_skills+similarity_ex)/2
-    percentage= f"{Average_Score * 100:.2f}%"
-    return percentage

+from docx.opc.exceptions import PackageNotFoundError
def read_file(file_path, encodings=("utf-8", "cp1252", "latin-1")):
    """
    Return the textual content of a PDF, DOCX, or plain-text file.

    PDFs are read with PyPDF2, DOCX files with python-docx, and every other
    file is opened as text, trying each encoding in ``encodings`` in order.

    Args:
    - file_path (str): Path of the file to read.
    - encodings (tuple of str): Encodings tried, in order, for files that are
      neither PDF nor DOCX.

    Returns:
    - str: The extracted text. DOCX paragraphs are each followed by a newline.

    Raises:
    - FileNotFoundError: If ``file_path`` does not exist.
    - PackageNotFoundError: If a ``.docx`` file cannot be opened as a DOCX package.
    - ValueError: If a text file cannot be decoded with any of ``encodings``.
    """
    # Fail early with a clear message instead of a library-specific error.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    lowered_path = file_path.lower()

    if lowered_path.endswith('.pdf'):
        # PDFs are binary; the handle must stay open while pages are extracted.
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Guard with `or ""` so a page that yields no text cannot break
            # the concatenation.
            return "".join(page.extract_text() or "" for page in reader.pages)

    if lowered_path.endswith('.docx'):
        try:
            doc = Document(file_path)
        except PackageNotFoundError as err:
            # Re-raise with a more informative message, keeping the original cause.
            raise PackageNotFoundError(
                f"The file {file_path} is not a valid docx file. It may be corrupted or of a different format."
            ) from err
        # Newline after every paragraph keeps paragraph boundaries in the output.
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    # Any other extension: treat as plain text, falling back through encodings.
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
    raise ValueError(
        f"Could not decode {file_path} with any of these encodings: {', '.join(encodings)}"
    )
# Notebook-style setup: runs at import time and only works inside Google Colab.
import os
# Path to the service-account key stored on the user's Drive. It is only a
# string here, so setting it before Drive is mounted (below) is harmless.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/drive/MyDrive/Resume/firm-capsule-436804-b5-5f553d9f1043.json"
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores.faiss import FAISS
# `userdata` gives access to Colab secrets; it must be imported for the
# `userdata.get(...)` call below to work.
from google.colab import drive, userdata
from docx import Document
import google.generativeai as genai
from datetime import datetime
import PyPDF2
# Read the Gemini API key from the Colab secret named 'google_cloud'.
api_key_google = userdata.get('google_cloud')
genai.configure(api_key=api_key_google)
# Mount Google Drive
drive.mount('/content/drive')
# Shared model handle used by check_relevance_gemini.
model = genai.GenerativeModel('gemini-pro')
def _parse_relevance_response(content_text):
    """Extract the score and reason from a 'Score: ... / Reason: ...' reply."""
    score = None
    reason_parts = []
    collecting_reason = False

    for raw_line in content_text.splitlines():
        # Tolerate indentation and Markdown decoration such as "**Score:** 0.8".
        line = raw_line.strip().lstrip("*#>-_ ").strip()
        label, separator, value = line.partition(":")
        key = label.strip("*_ ").lower()

        if separator and key == "score":
            collecting_reason = False
            cleaned = value.strip().strip("*_[]() ")
            # Accept forms like "0.8/1" or "0.8 (good)" by keeping the first token.
            head = cleaned.split("/")[0].split()
            token = head[0].strip("*_[]()").rstrip(".,;") if head else ""
            try:
                score = float(token)
            except ValueError:
                raise ValueError(f"Invalid score format: {raw_line}") from None
        elif separator and key == "reason":
            collecting_reason = True
            cleaned = value.strip().strip("*_ ")
            reason_parts = [cleaned] if cleaned else []
        elif collecting_reason and line:
            # A reason wrapped over several lines is kept whole.
            reason_parts.append(line)

    if score is None:
        raise ValueError("Failed to extract score from the response.")
    # The prompt asks for a value in [0, 1]; this comparison also rejects NaN.
    if not 0 <= score <= 1:
        raise ValueError(f"Score out of range (expected 0 to 1): {score}")

    reason = " ".join(reason_parts) or "No reason provided."
    return {"score": score, "reason": reason}


def check_relevance_gemini(tailored_resume, job_description, llm=None):
    """
    Use Gemini Pro to evaluate the relevance score between a tailored resume and job description.
    Args:
    - tailored_resume (str): Tailored resume content.
    - job_description (str): Job description content.
    - llm: Optional object exposing ``generate_content(prompt)``; defaults to
      the module-level ``model``.
    Returns:
    - dict | None: A dictionary containing the 'score' (float in [0, 1]) and
      'reason' (str), or None if the request or the parsing failed (the error
      is printed).
    """
    prompt = f"""
You are a recruitment expert evaluating how well a tailored resume aligns with a job description. Provide a realistic and concise evaluation based on the following criteria:
1. Relevance of skills and experience: Do the candidate’s skills, accomplishments, and experience meet the job's core requirements?
2. Domain Match: Are the candidate's experiences and achievements relevant to the industry or role?
3. Clarity and Conciseness: Is the resume well-structured and focused on the job requirements?
4. Highlight any gaps or mismatched qualifications realistically.
Provide your response in this exact format:
Score: [Score between 0 and 1]
Reason: [One or two sentences explaining the score]
Here is the tailored resume:
[Resume Start]
{tailored_resume}
[Resume End]
And the job description below:
[Job Description Start]
{job_description}
[Job Description End]
"""
    try:
        # Resolve the client inside the try so a missing global is reported
        # the same way as any other failure.
        client = model if llm is None else llm
        response = client.generate_content(prompt)
        candidates = response.candidates
        if not candidates:
            raise ValueError("No candidates found in the response.")
        parts = candidates[0].content.parts
        if not parts:
            raise ValueError("The first candidate contains no content parts.")
        content_text = parts[0].text
        # Echo the raw reply to help diagnose parsing problems.
        print(content_text)
        return _parse_relevance_response(content_text)
    except Exception as e:
        # Best-effort API: report the problem and signal failure with None.
        print(f"Error in relevance checking: {e}")
        return None