# -*- coding: utf-8 -*-
"""Similarity_score_refined (2).ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1c8mlCBnLbduLsI8rUGFEOYDuyBqdz2JJ
"""

# !pip install sentence_transformers
# !pip install openai==0.28
# !pip install docx2txt PyPDF2 transformers

# from google.colab import drive, userdata
# drive.mount("/content/drive")
# print("Google Drive mounted.")

import os
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure the required NLTK corpora (stopwords, wordnet) are downloaded.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


def extract_text(file_path):
    """Extract raw text from a .docx or .pdf file."""
    import docx2txt
    import PyPDF2

    if file_path.endswith(".docx"):
        # Extract text from a DOCX file.
        return docx2txt.process(file_path)
    elif file_path.endswith(".pdf"):
        # Extract text from a PDF file, page by page.
        text = ""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # extract_text() can return None for image-only pages.
                text += page.extract_text() or ""
        return text
    else:
        raise ValueError("Unsupported file type")


def preprocess(text):
    """Lowercase, strip non-letters, drop stop words, and lemmatize."""
    text = text.lower()
    # Remove special characters and numbers.
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize by splitting on whitespace.
    words = text.split()
    # Remove stop words.
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatize the words (to get the root form).
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Join words back into a single string.
    return ' '.join(words)


def calculate_tfidf(doc):
    """Return the terms whose TF-IDF weight in the document exceeds 0.2."""
    vectorizer = TfidfVectorizer()
    # Fit only on the individual document; weights reflect within-document frequency.
    tfidf_matrix = vectorizer.fit_transform([doc])
    feature_names = vectorizer.get_feature_names_out()
    dense_tfidf_matrix = tfidf_matrix.todense()
    # Keep only the terms above the importance threshold.
    important_terms = [
        feature_names[i]
        for i in range(len(feature_names))
        if dense_tfidf_matrix[0, i] > 0.2
    ]
    return ' '.join(important_terms)


def call_chatgpt_api(prompt, api_key, model="gpt-3.5-turbo"):
    """Send a single-turn prompt to the OpenAI chat API (openai==0.28 interface)."""
    import openai

    openai.api_key = api_key
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response['choices'][0]['message']['content'].strip()


def calculate_similarity(resume, job_desc, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Return the cosine similarity between the embeddings of two texts."""
    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer(model_name)
    # Convert texts to embeddings.
    embeddings1 = model.encode(resume, convert_to_tensor=True)
    embeddings2 = model.encode(job_desc, convert_to_tensor=True)
    # Calculate cosine similarity and return it as a scalar.
    similarity_score = util.pytorch_cos_sim(embeddings1, embeddings2)
    return similarity_score.item()


def similarity_main(resume_path, job_description_path):
    """Score how well a resume matches a job description, as a percentage string.

    Skills and experience sections are pulled from both documents via the
    OpenAI API, cleaned, reduced to their high-weight TF-IDF terms, and
    compared with sentence-transformer embeddings; the two cosine
    similarities are then averaged.
    """
    # Extract raw text from the two files.
    resume_text = extract_text(resume_path)
    job_des = extract_text(job_description_path)

    api_key = os.environ.get('OPENAI_KEY')

    # Extract sections from the resume.
    skills_prompt = f"Extract the skills or competencies section from the resume. Avoid using name of the candidate:\n\n{resume_text}"
    resume_skills = call_chatgpt_api(skills_prompt, api_key)
    experience_prompt = f"Extract the experience of the candidate from the resume. Avoid using name of the candidate:\n\n{resume_text}"
    resume_experience = call_chatgpt_api(experience_prompt, api_key)

    # Extract sections from the job description (JD).
    jd_skills_prompt = f"Extract the skills section from the job description:\n\n{job_des}"
    jd_skills = call_chatgpt_api(jd_skills_prompt, api_key)
    jd_experience_prompt = f"Extract the experience section from the job description:\n\n{job_des}"
    jd_experience = call_chatgpt_api(jd_experience_prompt, api_key)

    # Clean all four sections.
    resume_skills_clean = preprocess(resume_skills)
    jd_skills_clean = preprocess(jd_skills)
    resume_experience_clean = preprocess(resume_experience)
    jd_experience_clean = preprocess(jd_experience)

    # Compare skills sections.
    filtered_resume = calculate_tfidf(resume_skills_clean)
    filtered_jd = calculate_tfidf(jd_skills_clean)
    similarity_skills = calculate_similarity(filtered_resume, filtered_jd)

    # Compare experience sections.
    filtered_resume_ex = calculate_tfidf(resume_experience_clean)
    filtered_jd_ex = calculate_tfidf(jd_experience_clean)
    similarity_ex = calculate_similarity(filtered_resume_ex, filtered_jd_ex)

    # Average the two scores and format as a percentage.
    average_score = (similarity_skills + similarity_ex) / 2
    percentage = f"{average_score * 100:.2f}%"
    return percentage
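
# Example usage: a minimal sketch, assuming the OPENAI_KEY environment variable
# is set and that "resume.docx" and "job_description.pdf" exist locally; both
# paths are hypothetical placeholders, not files from the original notebook.
if __name__ == "__main__":
    score = similarity_main("resume.docx", "job_description.pdf")
    print(f"Resume-to-JD match: {score}")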