import os |
import tempfile |
import fitz |
from sklearn.metrics.pairwise import cosine_similarity |
import numpy as np |
import pandas as pd |
from sklearn.feature_extraction.text import TfidfVectorizer |
from dotenv import load_dotenv |
import google.generativeai as genai |
# Load variables from a local .env file (if any) into the environment, then
# configure the Gemini client with the key stored under GOOGLE_API_KEY.
# The key is None when the variable is not set.
load_dotenv()
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
def extract_text_from_pdf(pdf_content):
    """
    Extracts text content from a PDF file.

    The PDF is opened directly from memory, so nothing is written to disk and
    no temporary file can be left behind when parsing fails.

    Parameters:
    - pdf_content (bytes): Bytes-like object containing the content of the PDF file.

    Returns:
    - str: Text of all pages concatenated in page order, with non-breaking
      spaces (U+00A0) removed.
    """
    # bytes() is a no-op for bytes input and normalises other bytes-like
    # objects (bytearray, memoryview) to what the stream argument expects.
    pdf_bytes = bytes(pdf_content)
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
        page_texts = [page.get_text() for page in pdf_document]
    return "".join(page_texts).replace("\xa0", "")
def generate_gemini_content(transcript_text):
    """
    Asks the "gemini-pro" model for a summary of the given CV text.

    A fixed instruction block (with an example summary) is placed in front of
    the CV text and the combined prompt is sent in a single request.

    Parameters:
    - transcript_text (str): CV text to be summarized.

    Returns:
    - str: The text of the model's response.
    """
    instructions = """
Instructions:
Please provide a concise summary of your relevant experience, skills,
and qualifications in the field of programming and technology.
Highlight your practical experience, technological proficiencies, technical skills, soft skills,
proficiency in programming languages and frameworks, as well as any other skills relevant to programming fields.
Additionally, include your location of residence and any other relevant details related to the programming industry
to facilitate accurate matching with job descriptions.
Example summary:
"Experienced software engineer with proficiency in Python, JavaScript, and Java.
Skilled in developing web applications using React.js and Django frameworks.
Strong problem-solving and communication skills. Located in New York City,
seeking opportunities in full-stack development to leverage my skills and contribute to innovative projects."
CV is :
"""
    gemini = genai.GenerativeModel("gemini-pro")
    full_prompt = instructions + transcript_text
    return gemini.generate_content(full_prompt).text
def git_indices(data, cv_vect, df_vect):
    """
    Ranks job descriptions by cosine similarity to the CV vector.

    Parameters:
    - data (str): Input data. Not used in the computation; kept so existing
      callers that pass it keep working.
    - cv_vect: Vector representation of the input data. Only its first row
      (cv_vect[0]) is compared against the job descriptions.
    - df_vect (scipy.sparse.csr_matrix): Vector representations of job descriptions.

    Returns:
    - numpy.ndarray: Indices of job descriptions sorted in descending order of similarity.
    """
    # Exactly one CV is ranked per call, so row 0 is read directly instead of
    # looping over a one-element range.
    similarities = cosine_similarity(cv_vect[0], df_vect).flatten()
    # argsort is ascending; reversing puts the most similar job first.
    return np.argsort(similarities)[::-1]
def fit_data(csv_path: str):
    """
    Reads job description data from a CSV file and creates TF-IDF vectors.

    The text is taken from the "concatenated_column" column, which is removed
    from the returned DataFrame. No other column is required; any other
    columns (e.g. "label") are returned untouched.

    Parameters:
    - csv_path (str): Path to the CSV file containing job descriptions.

    Returns:
    - pandas.DataFrame: DataFrame containing job descriptions (without "concatenated_column").
    - sklearn.feature_extraction.text.TfidfVectorizer: Fitted TF-IDF vectorizer object.
    - scipy.sparse.csr_matrix: TF-IDF vectors of job descriptions, one row per DataFrame row.

    Raises:
    - KeyError: If the CSV has no "concatenated_column" column.
    """
    df = pd.read_csv(csv_path)
    # pop() drops the column from df in place and returns it as a Series.
    # Empty cells are read as NaN, which TfidfVectorizer rejects, so they are
    # treated as empty documents; astype(str) guards against non-text cells.
    corpus = df.pop("concatenated_column").fillna("").astype(str)
    vectorizer = TfidfVectorizer(stop_words='english')
    # fit_transform is equivalent to fit() followed by transform() on the same data.
    df_vect = vectorizer.fit_transform(corpus)
    return df, vectorizer, df_vect
# Build the job corpus once at import time. "all.csv" is looked up in the
# current working directory of the process, not next to this module.
df, vectorizer, df_vect = fit_data(
    csv_path=os.path.join(os.getcwd(), "all.csv")
)
def git_most_similar_job(cv_summarize: str, number_of_jobs: int):
    """
    Looks up the job descriptions closest to a CV summary.

    Uses the module-level vectorizer, TF-IDF matrix and DataFrame built at
    import time, and prints a completion message before returning.

    Parameters:
    - cv_summarize (str): Summary of the CV.
    - number_of_jobs (int): Number of similar job descriptions to return.

    Returns:
    - pandas.DataFrame: Rows of the job DataFrame for the best matches, most similar first.
    """
    # Project the summary into the same TF-IDF space as the job descriptions.
    summary_vector = vectorizer.transform([cv_summarize])
    ranked = git_indices(cv_summarize, summary_vector, df_vect)
    top_rows = ranked[:number_of_jobs]
    prediction_data = df.iloc[top_rows]
    print("ALL Done \n\n")
    return prediction_data