File size: 5,305 Bytes
de16466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c431c8
 
 
de16466
 
8c431c8
 
 
de16466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c431c8
de16466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd800d2
de16466
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# -*- coding: utf-8 -*-
"""Similarity_score_refined (2).ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1c8mlCBnLbduLsI8rUGFEOYDuyBqdz2JJ
"""

# !pip install sentence_transformers
# !pip install openai==0.28
# !pip install docx2txt PyPDF2 transformers

# from google.colab import drive,userdata
# drive.mount("/content/drive")
# print("Google Drive mounted.")

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# The stopword list and WordNetLemmatizer used by preprocess() read NLTK
# corpora from local disk, so request both corpora here.
# NOTE: these calls are module-level statements, so they execute every time
# this module is imported.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

def extract_text(file_path):
    """Return the plain text contained in a DOCX or PDF file.

    The file type is decided from the file name extension (case-insensitive).

    Args:
        file_path: Path to a ``.docx`` or ``.pdf`` file (``str`` or
            ``pathlib.Path``).

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the extension is neither ``.docx`` nor ``.pdf``.
    """
    path = str(file_path)
    lowered = path.lower()

    if lowered.endswith(".docx"):
        # Imported lazily so that PDF-only use does not require docx2txt.
        import docx2txt
        return docx2txt.process(path)

    if lowered.endswith(".pdf"):
        # The original code referenced PyPDF2 without importing it anywhere,
        # which raised NameError on every PDF.
        import PyPDF2
        with open(path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # extract_text() may yield nothing for image-only pages; treat
            # that as an empty string instead of failing on concatenation.
            return "".join(page.extract_text() or "" for page in reader.pages)

    raise ValueError("Unsupported file type")

def preprocess(text):
    """Normalise free text into a space-separated string of lemmatised words.

    Steps: lowercase, replace every character that is not ``a-z`` or
    whitespace with a space, split on whitespace, drop English stop words,
    and lemmatise each remaining word with WordNet.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned words joined by single spaces (may be empty).
    """
    text = text.lower()

    # Substitute a space rather than deleting the character: deleting fused
    # neighbouring words ("html/css" -> "htmlcss", "python,java" ->
    # "pythonjava") and turned contractions into non-stop-words
    # ("don't" -> "dont").
    text = re.sub(r'[^a-z\s]', ' ', text)

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

def calculate_tfidf(doc, threshold=0.2):
    """Return the terms of *doc* whose TF-IDF weight exceeds *threshold*.

    The vectorizer is fitted on this single document only, so the weights
    reflect term frequency within the document (L2-normalised by the
    vectorizer's defaults), not rarity across a corpus.

    Args:
        doc: Pre-processed text.
        threshold: Minimum weight (exclusive) for a term to be kept.

    Returns:
        The selected terms joined by single spaces, in the vectorizer's
        feature order. Returns ``''`` when *doc* has no usable tokens.
    """
    if not doc or not doc.strip():
        return ''

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([doc])  # Only fit on the individual document
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, e.g. when
        # the text contains only tokens the default tokenizer discards.
        return ''

    feature_names = vectorizer.get_feature_names_out()
    weights = tfidf_matrix.toarray()[0]

    important_terms = [
        term for term, weight in zip(feature_names, weights) if weight > threshold
    ]

    # With many equally frequent terms every normalised weight drops below
    # the threshold and the filter would discard the whole document; keep
    # all terms in that case so downstream similarity has something to use.
    if not important_terms:
        important_terms = list(feature_names)

    return ' '.join(important_terms)

def call_chatgpt_api(prompt, api_key, model="gpt-3.5-turbo"):
    """Send *prompt* to the OpenAI chat completion endpoint and return the reply.

    Uses the ``openai.ChatCompletion`` interface of the pre-1.0 ``openai``
    package (the file's install notes pin ``openai==0.28``).

    Args:
        prompt: User message to send.
        api_key: OpenAI API key. When empty or ``None`` the
            ``OPENAI_API_KEY`` environment variable is used instead.
        model: Chat model name.

    Returns:
        The text of the first choice, stripped of surrounding whitespace.

    Raises:
        ValueError: If no API key is supplied and none is set in the
            environment.
    """
    import os

    # SECURITY: a secret key used to be hard-coded here, which also meant the
    # api_key argument was ignored. Keys must come from the caller or from
    # the environment, never from source control.
    resolved_key = api_key or os.environ.get("OPENAI_API_KEY")
    if not resolved_key:
        raise ValueError(
            "No OpenAI API key provided; pass api_key or set OPENAI_API_KEY"
        )

    import openai
    openai.api_key = resolved_key
    response = openai.ChatCompletion.create(
        model=model,  # previously hard-coded, so the parameter had no effect
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response['choices'][0]['message']['content'].strip()

# Loaded SentenceTransformer instances keyed by model name, so repeated
# calls do not reload the model each time.
_SENTENCE_MODEL_CACHE = {}


def calculate_similarity(resume, job_desc, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        resume: First text.
        job_desc: Second text.
        model_name: Name of the SentenceTransformer model to embed with.

    Returns:
        The cosine similarity as a Python float. Returns ``0.0`` when either
        text is empty or whitespace-only, since an embedding of empty text
        carries no information to compare.
    """
    if not resume or not resume.strip() or not job_desc or not job_desc.strip():
        return 0.0

    from sentence_transformers import SentenceTransformer, util

    model = _SENTENCE_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_MODEL_CACHE[model_name] = model

    # Convert texts to embeddings
    resume_embedding = model.encode(resume, convert_to_tensor=True)
    job_embedding = model.encode(job_desc, convert_to_tensor=True)

    # Calculate cosine similarity
    similarity_score = util.pytorch_cos_sim(resume_embedding, job_embedding)
    return similarity_score.item()  # return as a scalar

def similarity_main(resume_path, job_description_path, api_key=None):
    """Score how well a resume matches a job description.

    Both files are read with ``extract_text``. The skills and experience
    sections of each are extracted through ``call_chatgpt_api``, cleaned
    with ``preprocess``, reduced with ``calculate_tfidf`` and compared with
    ``calculate_similarity``. The two similarity scores are averaged.

    Args:
        resume_path: Path to the resume file.
        job_description_path: Path to the job description file.
        api_key: OpenAI API key. When ``None`` the ``OPENAI_API_KEY``
            environment variable is used.

    Returns:
        The averaged similarity formatted as a percentage string with two
        decimals, e.g. ``"73.25%"``.
    """
    import os

    # SECURITY: a secret key used to be hard-coded here. It is now taken
    # from the caller or from the environment.
    if api_key is None:
        api_key = os.environ.get("OPENAI_API_KEY")

    def _section_similarity(resume_section, jd_section):
        # Clean both texts, keep their high-weight terms, then compare.
        resume_terms = calculate_tfidf(preprocess(resume_section))
        jd_terms = calculate_tfidf(preprocess(jd_section))
        return calculate_similarity(resume_terms, jd_terms)

    # Extract text from files
    resume_text = extract_text(resume_path)
    job_des = extract_text(job_description_path)

    # Extract sections from the resume
    prompt = f"Extract the skills or competencies section from the resume. Avoid using name of the candidate:\n\n{resume_text}"
    resume_skills = call_chatgpt_api(prompt, api_key)
    experience_prompt = f"Extract the experience of the candidate from the resume. Avoid using name of the candidate:\n\n{resume_text}"
    resume_experience = call_chatgpt_api(experience_prompt, api_key)

    # Extract sections from job description (JD)
    jd_skills_prompt = f"Extract the skills section from the job description:\n\n{job_des}"
    jd_skills = call_chatgpt_api(jd_skills_prompt, api_key)
    jd_experience_prompt = f"Extract the experience section from the job description:\n\n{job_des}"
    jd_experience = call_chatgpt_api(jd_experience_prompt, api_key)

    similarity_skills = _section_similarity(resume_skills, jd_skills)
    similarity_ex = _section_similarity(resume_experience, jd_experience)

    average_score = (similarity_skills + similarity_ex) / 2
    return f"{average_score * 100:.2f}%"