import os
import tempfile

import fitz  # PyMuPDF
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import google.generativeai as genai

load_dotenv()  # Load environment variables (including GOOGLE_API_KEY) from a .env file
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

def extract_text_from_pdf(pdf_content):
    """
    Extracts text content from a PDF file.
    Parameters:
    - pdf_content (bytes): Bytes-like object containing the content of the PDF file.
    Returns:
    - str: Extracted text content from the PDF file.
    """
    text = ""
    # Write the bytes to a temporary file and close it before handing it to PyMuPDF,
    # so the file is not still locked by the open handle (matters on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(pdf_content)
        temp_path = temp_file.name

    try:
        pdf_document = fitz.open(temp_path)
        for page_number in range(pdf_document.page_count):
            text += pdf_document[page_number].get_text()
        pdf_document.close()
    finally:
        os.remove(temp_path)  # Always clean up the temporary file

    return text.replace("\xa0", "")
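
# Usage sketch (illustrative only; "resume.pdf" is a hypothetical local file, not part
# of this project): read the raw bytes and pass them to the extractor.
#
#     with open("resume.pdf", "rb") as pdf_file:
#         cv_text = extract_text_from_pdf(pdf_file.read())
#     print(cv_text[:300])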


def generate_gemini_content(transcript_text):
    """
    Generates a summary based on the input text using Google's Gemini Pro model.
    Parameters:
    - transcript_text (str): Text to be summarized.
    Returns:
    - str: Generated summary.
    """
    prompt = """
    Instructions: 
        Please provide a concise summary of your relevant experience, skills, 
        and qualifications in the field of programming and technology. 
        Highlight your practical experience, technological proficiencies, technical skills, soft skills, 
        proficiency in programming languages and frameworks, as well as any other skills relevant to programming fields. 
        Additionally, include your location of residence and any other relevant details related to the programming industry 
        to facilitate accurate matching with job descriptions.
    Example summary:
        "Experienced software engineer with proficiency in Python, JavaScript, and Java. 
        Skilled in developing web applications using React.js and Django frameworks. 
        Strong problem-solving and communication skills. Located in New York City, 
        seeking opportunities in full-stack development to leverage my skills and contribute to innovative projects."
    The CV is:
    """
    model = genai.GenerativeModel("gemini-pro")
    response = model.generate_content(prompt + transcript_text)
    return response.text
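
# Usage sketch (illustrative only; this makes a real API call and assumes a valid
# GOOGLE_API_KEY is available in the environment):
#
#     summary = generate_gemini_content(cv_text)
#     print(summary)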


def git_indices(data, cv_vect, df_vect):
    """
    Computes cosine similarity between the vector representation of the input data and the vector representations of job descriptions.
    Parameters:
    - data (str): Input data (kept for interface compatibility; similarity is computed from cv_vect).
    - cv_vect (scipy.sparse.csr_matrix): Vector representation of the input data.
    - df_vect (scipy.sparse.csr_matrix): Vector representations of job descriptions.
    Returns:
    - numpy.ndarray: Indices of job descriptions sorted in descending order of similarity.
    """
    # cv_vect contains a single row (the CV), so one similarity computation suffices.
    distances = cosine_similarity(cv_vect, df_vect).flatten()
    indices = np.argsort(distances)[::-1]
    return indices
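
# Worked illustration of the ranking step (a minimal sketch with toy data, not project
# data): cosine_similarity yields one score per job description, and the reversed
# argsort puts the indices of the highest-scoring descriptions first.
#
#     toy = TfidfVectorizer().fit(["python backend", "react frontend", "data analyst"])
#     jobs_vect = toy.transform(["python backend", "react frontend", "data analyst"])
#     cv_toy = toy.transform(["python developer"])
#     order = git_indices(data="python developer", cv_vect=cv_toy, df_vect=jobs_vect)
#     # order[0] == 0, i.e. the "python backend" posting ranks first.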


def fit_data(csv_path: str):
    """
    Reads and preprocesses job description data from a CSV file and creates TF-IDF vectors.
    Parameters:
    - csv_path (str): Path to the CSV file containing job descriptions.
    Returns:
    - pandas.DataFrame: DataFrame containing the job descriptions (without the raw text column).
    - sklearn.feature_extraction.text.TfidfVectorizer: Fitted TF-IDF vectorizer.
    - scipy.sparse.csr_matrix: TF-IDF vectors of the job descriptions.
    """
    df = pd.read_csv(csv_path)
    # "concatenated_column" holds the full job-description text used for matching;
    # it is dropped from the returned DataFrame once the vectors are built.
    x = df["concatenated_column"]
    df.drop("concatenated_column", axis=1, inplace=True)

    vectorizer = TfidfVectorizer(stop_words='english')
    vectorizer.fit(x)
    df_vect = vectorizer.transform(x)

    return df, vectorizer, df_vect


# Build the job-description vectors once at module load time.
df, vectorizer, df_vect = fit_data(os.path.join(os.getcwd(), "all.csv"))
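
# Data sketch (illustrative only): all.csv is expected to contain a "concatenated_column"
# holding the full job-description text; any other columns (such as a title or label)
# are returned unchanged in the DataFrame. A minimal file for local testing could be
# written like this:
#
#     pd.DataFrame({
#         "label": ["backend", "frontend"],
#         "concatenated_column": [
#             "Backend engineer experienced with Python, Django and PostgreSQL.",
#             "Frontend developer skilled in React and TypeScript.",
#         ],
#     }).to_csv("all.csv", index=False)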



def git_most_similar_job(cv_summarize: str, number_of_jobs: int):
    """
    Finds the most similar job descriptions to the input CV summary.
    Parameters:
    - cv_summarize (str): Summary of the CV.
    - number_of_jobs (int): Number of similar job descriptions to return.
    Returns:
    - pandas.DataFrame: DataFrame containing the most similar job descriptions.
    """
    cv_vect = vectorizer.transform([cv_summarize])
    indices = git_indices(data=cv_summarize, cv_vect=cv_vect, df_vect=df_vect)
    
    # Select the top-N most similar job descriptions.
    prediction_data = df.iloc[indices[:number_of_jobs]]

    return prediction_data
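

if __name__ == "__main__":
    # End-to-end sketch (illustrative only): "sample_cv.pdf" is a hypothetical local
    # file, this block performs a real Gemini API call, and it assumes both
    # GOOGLE_API_KEY and all.csv are available as configured above.
    with open("sample_cv.pdf", "rb") as pdf_file:
        cv_text = extract_text_from_pdf(pdf_file.read())
    cv_summary = generate_gemini_content(cv_text)
    matches = git_most_similar_job(cv_summary, number_of_jobs=5)
    print(matches)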