reddgr commited on
Commit
bf54cdb
·
1 Parent(s): 7ddb0cb

English version

Browse files
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import json
4
+ import gradio as gr
5
+ sys.path.append('src')
6
+ from procesador_de_cvs_con_llm import ProcesadorCV
7
+
8
use_dotenv = False  # Toggle to load the API key from a local .env file instead of the process environment
if use_dotenv:
    from dotenv import load_dotenv
    load_dotenv("../../../../../../../apis/.env")
    api_key = os.getenv("OPENAI_API_KEY")
else:
    api_key = os.getenv("OPENAI_API_KEY")

unmasked_chars = 8  # Number of characters left visible at each end of the masked key

def mask_api_key(key, visible=unmasked_chars):
    """Return *key* with its middle replaced by '*', keeping the first and last
    *visible* characters.

    Fixes two defects of the original inline expression: it no longer raises
    TypeError when the key is missing (os.getenv returned None), and keys of
    2 * visible characters or fewer are fully masked instead of being echoed
    verbatim by the slicing arithmetic.
    """
    if not key:
        return "<missing>"
    if len(key) <= visible * 2:
        return "*" * len(key)
    return key[:visible] + "*" * (len(key) - visible * 2) + key[-visible:]

masked_key = mask_api_key(api_key)
print(f"API key: {masked_key}")
20
+
21
def process_cv(job_text, cv_text, req_experience, req_experience_unit, positions_cap, dist_threshold_low, dist_threshold_high):
    """Validate the UI inputs and run the full CV-scoring pipeline.

    Returns the result dictionary produced by ProcesadorCV, or a
    ``{"error": ...}`` dictionary when validation or processing fails.
    """
    # Guard clauses: the thresholds must define a non-empty interval,
    # and a non-blank CV string must be supplied.
    if dist_threshold_low >= dist_threshold_high:
        return {"error": "dist_threshold_low must be lower than dist_threshold_high."}

    cv_missing = not isinstance(cv_text, str) or not cv_text.strip()
    if cv_missing:
        return {"error": "Please provide the CV or upload a file."}

    # Normalize the required experience to months when it is expressed in years.
    months_required = req_experience * 12 if req_experience_unit == "years" else req_experience

    try:
        processor = ProcesadorCV(api_key, cv_text, job_text, ner_pre_prompt,
                                 system_prompt, user_prompt, ner_schema, response_schema)
        return processor.procesar_cv_completo(
            req_experience=months_required,
            positions_cap=positions_cap,
            dist_threshold_low=dist_threshold_low,
            dist_threshold_high=dist_threshold_high,
        )
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
44
+
45
# Execution parameters:
job_text = "Generative AI engineer"
cv_sample_path = 'cv_examples/reddgr_cv.txt' # Path to a text file with a sample CV
with open(cv_sample_path, 'r', encoding='utf-8') as file:
    cv_text = file.read()
# Prompts:
with open('prompts/ner_pre_prompt.txt', 'r', encoding='utf-8') as f:
    ner_pre_prompt = f.read()
with open('prompts/system_prompt.txt', 'r', encoding='utf-8') as f:
    system_prompt = f.read()
with open('prompts/user_prompt.txt', 'r', encoding='utf-8') as f:
    user_prompt = f.read()
# JSON schemas:
with open('json/ner_schema.json', 'r', encoding='utf-8') as f:
    ner_schema = json.load(f)
with open('json/response_schema.json', 'r', encoding='utf-8') as f:
    response_schema = json.load(f)

# Sample file used to pre-fill the interface (the "Examples" block at the bottom of the UI):
# NOTE(review): this re-reads the same file already loaded into cv_text above — looks redundant; confirm before consolidating.
with open('cv_examples/reddgr_cv.txt', 'r', encoding='utf-8') as file:
    cv_example = file.read()

default_parameters = [4, "years", 10, 0.5, 0.7] # Default parameters for resetting the interface and for the predefined examples

# CSS to truncate the example text shown in the interface ("Examples" block at the bottom):
css = """
table tbody tr {
    height: 2.5em; /* Set a fixed height for the rows */
    overflow: hidden; /* Hide overflow content */
}

table tbody tr td {
    overflow: hidden; /* Ensure content within cells doesn't overflow */
    text-overflow: ellipsis; /* Add ellipsis for overflowing text */
    white-space: nowrap; /* Prevent text from wrapping */
    vertical-align: middle; /* Align text vertically within the fixed height */
}
"""
83
+
84
# Gradio interface:
with gr.Blocks(css=css) as interface:
    # Inputs
    job_text_input = gr.Textbox(label="Vacancy Title", lines=1, placeholder="Enter the vacancy title")
    gr.Markdown("Required Experience")
    with gr.Row():
        req_experience_input = gr.Number(label="Required Experience", value=default_parameters[0], precision=0, elem_id="req_exp", show_label=False)
        req_experience_unit = gr.Dropdown(label="Period", choices=["months", "years"], value=default_parameters[1], elem_id="req_exp_unit", show_label=False)
    cv_text_input = gr.Textbox(label="CV in Text Format", lines=5, max_lines=5, placeholder="Enter the CV text")

    # Advanced options hidden inside an "Accordion" widget
    with gr.Accordion("Advanced options", open=False):
        positions_cap_input = gr.Number(label="Maximum number of positions to extract", value=default_parameters[2], precision=0)
        dist_threshold_low_slider = gr.Slider(
            label="Minimum embedding distance threshold (equivalent position)",
            minimum=0, maximum=1, value=default_parameters[3], step=0.05
        )
        dist_threshold_high_slider = gr.Slider(
            label="Maximum embedding distance threshold (irrelevant position)",
            minimum=0, maximum=1, value=default_parameters[4], step=0.05
        )

    submit_button = gr.Button("Process")
    clear_button = gr.Button("Clear")

    output_json = gr.JSON(label="Result")

    # Examples (each row supplies values for all seven inputs listed below):
    examples = gr.Examples(
        examples=[
            ["Supermarket cashier", "Deli worker since 2021. Previously worked 2 months as a waiter in a tapas bar."] + default_parameters,
            ["Generative AI Engineer", cv_example] + default_parameters
        ],
        inputs=[job_text_input, cv_text_input, req_experience_input, req_experience_unit, positions_cap_input, dist_threshold_low_slider, dist_threshold_high_slider]
    )

    # "Process" button: runs the scoring pipeline and shows the JSON result
    submit_button.click(
        fn=process_cv,
        inputs=[
            job_text_input,
            cv_text_input,
            req_experience_input,
            req_experience_unit,
            positions_cap_input,
            dist_threshold_low_slider,
            dist_threshold_high_slider
        ],
        outputs=output_json
    )

    # "Clear" button: blanks the text inputs and restores every parameter to its default
    clear_button.click(
        fn=lambda: ("","",*default_parameters),
        inputs=[],
        outputs=[
            job_text_input,
            cv_text_input,
            req_experience_input,
            req_experience_unit,
            positions_cap_input,
            dist_threshold_low_slider,
            dist_threshold_high_slider
        ]
    )

    # Footer
    gr.Markdown("""
    <footer>
        <p>You can view the complete code for this app and the explanatory notebooks on
        <a href='https://github.com/reddgr/procesador-de-curriculos-cv' target='_blank'>GitHub</a></p>
        <p>© 2024 <a href='https://talkingtochatbots.com' target='_blank'>talkingtochatbots.com</a></p>
    </footer>
    """)

# Launch the application:
if __name__ == "__main__":
    interface.launch()
cv_examples/reddgr_cv.txt ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ www.linkedin.com/in/davidgonzalezromero (LinkedIn)
2
+ talkingtochatbots.com (Personal)
3
+ Top Skills
4
+ Information and CommunicationsTechnology (ICT)
5
+ Data Science
6
+ Artificial Intelligence (AI)
7
+ Languages
8
+ English (Full Professional)
9
+ French (Limited Working)
10
+ Spanish (Native or Bilingual)
11
+ Certifications
12
+ Watson Analytics - Level 1
13
+ Retail Industry Jumpstart
14
+ Data Science Foundations - Level 1
15
+ Generative AI Imaging: WhatCreative Pros Need to Know
16
+ Prompt Engineering: How to Talk tothe AIs
17
+ Honors-Awards
18
+ Desafío Entorno Pre Mercado 2023
19
+ David González Romero
20
+ ICT Engineer | Business Consultant | Licensed Financial ServicesProfessional | Web Publisher | Profile not suggested by AI | 499connections | reddgr
21
+ Greater Madrid Metropolitan Area
22
+ Summary
23
+ I am an Information and Communications Technology Engineer andBusiness Consultant with over 15 years of experience in enterprisesoftware solutions, consulting, business analytics, and data science,across multiple countries and cross-functional teams.Over the last two decades, I have enjoyed the privilege of travelingaround the world, teaming up with outstanding professionals,leading teams, developing business opportunities, and buildingand managing longstanding client relationships. Throughout myconsulting and client relationship management career, I haveprimarily served clients in the retail industry, financial services,telecommunications, and the public sector, developing skills andknowledge across diverse domains such as marketing, finance, riskmanagement, software engineering, and data science.In academia, I completed MSc studies in telecommunications,electrical, and computer engineering, with research in ubiquitouscomputing, the Internet of Things, and computer security. Currently,I'm pursuing a master's degree in Artificial Intelligence Applied toFinancial Markets, and managing Talking to Chatbots, a websitededicated to generative AI projects, popular culture, and education,available at https://talkingtochatbots.com.
24
+ Experience
25
+ Talking to Chatbots, by Reddgr
26
+ Web Publisher and Generative AI Researcher
27
+ October 2006 - Present (18 years 3 months)
28
+ Spain
29
+ Developed and managed personal projects on the Internet since 2006.Currently managing the internet domains https://talkingtochatbots.com(website) and https://reddgr.com (search engine and social media keyword:
30
+ Page 1 of 6
31
+ “reddgr” stands for “David González Romero network”). Talking to Chatbotsis a knowledge hub that compiles LLM prompts and curated conversations,serving as an entertainment and educational platform for AI hobbyists,learners, and professionals.Since 2023, active developer and contributor in open-source and proprietarygenerative AI platforms and communities (reddgr.com/gpts huggingface.com/reddgr)
32
+ Acoustic
33
+ Principal Consultant | Martech SaaS
34
+ June 2020 - May 2023 (3 years)
35
+ Spain
36
+ Advised retail companies on implementing profitable, competitive pricingstrategies and promotions backed by DemandTec software as a service(SaaS) solutions and data science. Primary focus on developing and leadingsuccessful client relationships. Dedicated to continuously improving Acousticproducts and ensuring that clients and prospects receive excellent servicefrom Acoustic’s team and business partners. This involved delivering andmanaging: consultative selling, employee recruitment, training and mentoring,solution implementations, SaaS managed services (data integration andmodeling), technical support, customer relationship management (CRM), andanalytics consulting services.Companies served by the team of consultants and account managers I ledinclude: leading Spanish retailer; Italian supermarket cooperative; multinationalretail company operating in the Middle East, Eastern Europe and Africa;leading Italian retail group; Swedish grocery retailer; online retailer operating inthe UK and Ireland; British retailer with multinational presence; Finland-basedmultinational retail company; leading Norwegian retailer; retail cooperativein the Nordic countries; major multinational retail group operating in SouthAmerica.Participated in or led pre-sales activities (RFI, RFP, POC, RFQ) for variousnational and multinational retailers based in Southern Europe, Central Europe,Nordics, the Middle East and Australia.
37
+ IBM
38
+ 7 years 3 months
39
+ Engagement Manager, in support of Acoustic | B2B SaaS Retail Analytics
40
+ Page 2 of 6
41
+ July 2019 - May 2020 (11 months)
42
+ Madrid, Community of Madrid, Spain
43
+ Employed by IBM exclusively in support of Acoustic, new company founded in2019 by a team of IBM Watson Marketing & Commerce software specialists,led by former IBM executives and funded by private equity. Specialist inAcoustic Pricing and Promotion solutions (DemandTec), acting as AcousticSoftware Services team leader in Spain, and as software delivery EngagementManager and Subject Matter Expert for pre-sales and services projectsworldwide. Acoustic clients I worked with include: leading Spanish retailer, supermarketcooperative based in central and southern Italy, multinational retail companyoperating in Middle East, Eastern Europe and Africa, leading Italian retailgroup.
44
+ Engagement Manager | B2B SaaS Retail Analytics
45
+ September 2018 - June 2019 (10 months)
46
+ Madrid, Community of Madrid, Spain
47
+ Managing services projects and SaaS engagements for IBM WatsonCommerce solutions. As cognitive solutions specialist and SME in retail pricingand business analytics, I helped IBM clients succeed by coordinating allcomponents of the IBM Omni-Channel Pricing (DemandTec) cloud-basedsolution, including: solution design and PoC's, solution implementation anddelivery, data science services, data integration services, SaaS operations,technical support, product management, benefits assessments, and analyticalconsulting services.IBM clients I worked with as Engagement Manager or SME include: leadingSpanish retailer, multinational retail company operating in Middle East, EasternEurope and Africa, leading Italian retail group, Italian supermarket cooperative.
48
+ Relationship Manager | Cognitive Solutions SaaS
49
+ January 2015 - August 2018 (3 years 8 months)
50
+ Madrid, Community of Madrid, Spain
51
+ Specialist in the IBM Omni-channel Merchandising (DemandTec) solutionfor the retail industry, including Price Optimization, Promotion Planning andDynamic Pricing software.Managed the day to day relationship with assigned clients (€ 2 million ARR),prospecting and coordinating the delivery of SaaS platform enablementservices (data integration and data science), technical support, project
52
+ Page 3 of 6
53
+ management, and end-user enablement. Collaborated in other internationalprojects as DemandTec and pricing SME, delivering training and projectguidance to client end-users and business partners.IBM clients I worked with include: multiple Merchandising divisions of leadingSpanish retailer, multinational supermarket chain based in Spain, supermarketco-operative based in Denmark, multinational retail company operating inMiddle East, Eastern Europe and Africa, Russian supermarket chain, Finland-based retail company, British consumer co-operative.
54
+ Business Analyst | B2B SaaS Retail Analytics
55
+ March 2013 - December 2014 (1 year 10 months)
56
+ Madrid, Community of Madrid, Spain
57
+ Delivery of IBM Enterprise Marketing Management implementation projects,including DemandTec Price Optimization, Markdown Optimization andAssortment Optimization SaaS solutions. Delivered business and technicalguidance to pricing managers, category managers, buyers and businessconsultants in solution architecture, problem management resolution andchange management. Specialist in performing data analysis on the datascience, optimization and business analytics tools and services included in thesolution.IBM clients I worked with include: leading Spanish retailer, multinationalfashion retailer based in Spain, US-based sports retailer, supermarket co-operative based in Denmark.
58
+ KPMG España
59
+ Senior Consultant | Financial Risk Management
60
+ December 2010 - March 2013 (2 years 4 months)
61
+ Madrid, Community of Madrid, Spain
62
+ Senior Consultant in Financial Risk Management. Main projects:• Corporate and Investment Banking financial reporting: data mining andanalytics for Finance and Business Performance & Analytics department atleading multinational banking and financial services company. Developed andmaintained financial reports and insights for CFO, senior management andfront office.• Retail and Business Banking credit risk modeling: led user acceptancetesting and test case development for credit risk models and EBA-compliantreporting (COREP) of capital requirements. Led UAT development team and
63
+ Page 4 of 6
64
+ acted as a link between IT teams and Risk department. Supported credit riskmodeling team on early implementation of internal ratings-based (IRB) creditrisk models in compliance with Basel Framework on banking supervision.
65
+ MBD Analytics
66
+ Business Intelligence Consultant
67
+ February 2010 - December 2010 (11 months)
68
+ Alcobendas, Community of Madrid, Spain
69
+ Marketing Business Intelligence consulting services. Client-facing consultantfor Competitive Intelligence department at a multinational telecommunicationscompany. Responsible for the development of custom BI reporting solutionsand presenting monthly business reports. The reports included insights,analysis and forecasting of KPIs measuring customer activity and value inconsumer and enterprise telecommunication services.
70
+ Grupo Eneas
71
+ Cost Analyst
72
+ November 2009 - December 2009 (2 months)
73
+ Madrid, Community of Madrid, Spain
74
+ Telecommunications cost optimization project for a regional governmentagency of Spain. Gathered and analyzed invoice and contract data in supportof a Request for Quotation (RFQ) to a selection of telecommunication serviceproviders.
75
+ Deloitte España
76
+ IT Strategy Consultant
77
+ September 2008 - January 2009 (5 months)
78
+ Madrid, Community of Madrid, Spain
79
+ Management consulting intern, collaborating on IT management projects forinsurance, banking and public sector companies and institutions based inSpain. Collaborated on research, documentation, elaboration of proposals andtechnical support for IT management consulting projects, including: IT strategicplanning and market research, IT service management, IT integration, and ITcost optimization.
80
+ Education
81
+ Illinois Institute of Technology
82
+ Research Scholar, Electrical & Computer Engineering · (2009 - 2009)
83
+ Page 5 of 6
84
+ Universidad Politécnica de Madrid
85
+ Master of Science (MSc), Telecommunications Engineer · (2003 - 2009)
86
+ Instituto BME
87
+ Master's degree, Artificial Intelligence Applied to Financial Markets(MIAX) · (October 2023 - May 2025)
88
+ Page 6 of 6
json/ner_schema.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "object",
3
+ "properties": {
4
+ "experience": {
5
+ "type": "array",
6
+ "items": {
7
+ "type": "object",
8
+ "properties": {
9
+ "company": {"type": "string"},
10
+ "role": {"type": "string"},
11
+ "period": {
12
+ "type": "string",
13
+ "description": "'YYYYMM-YYYYMM' format or simply 'YYYYMM' if no end date is given."
14
+ }
15
+ },
16
+ "required": ["company", "role", "period"]
17
+ }
18
+ }
19
+ },
20
+ "required": ["experience"]
21
+ }
json/response_schema.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "object",
3
+ "properties": {
4
+ "score": {
5
+ "type": "number"
6
+ },
7
+ "relevant experience": {
8
+ "type": "array",
9
+ "items": {
10
+ "type": "object",
11
+ "properties": {
12
+ "company": {
13
+ "type": "string"
14
+ },
15
+ "role": {
16
+ "type": "string"
17
+ },
18
+ "duration": {
19
+ "type": "integer"
20
+ }
21
+ },
22
+ "required": [
23
+ "company",
24
+ "role",
25
+ "duration"
26
+ ]
27
+ }
28
+ },
29
+ "experience summary": {
30
+ "type": "string"
31
+ }
32
+ },
33
+ "required": [
34
+ "score",
35
+ "relevant experience",
36
+ "experience summary"
37
+ ]
38
+ }
pkl/df_ejemplos_con_distancia.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41724c6196d4862a3b08d417024549c4f4f644fdc031d7eda324dc715fa14824
3
+ size 1623
pkl/df_experiencia.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:726aabfa90eddf805b685ab14231dc18fbdabf1033d7c4649dddcaa0f2936454
3
+ size 1501
prompts/ner_pre_prompt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ You are a resume processor that extracts job titles, company names, and their corresponding periods. Use JSON format in the output with the keys "company", "role", and "period". For the period, consider any date format or range of dates included in the text. An example of a date format in the input is "October 2023 / March 2024". The value for the "period" key should be a string with two elements in YYYYMM format separated by a hyphen, for example "202310-202403", or only one element if no end date is identified.
prompts/system_prompt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ You are a curriculum vitae processor that receives a job offer, a filtered curriculum vitae, relevant previous experience, a precalculated score for the curriculum between 0 and 100, and a required experience parameter in months. The score has been calculated using an algorithm that utilizes embedding distances between each role and the job offer's definition, as well as the duration of each role and its relationship with the required experience parameter. It returns an object with a predefined schema, including exactly the provided score, the given list of experiences, and additionally a brief explanatory text about the candidate's experience and why they have obtained the given score. It is important that the explanatory text is coherent with the score. For example, if the score is greater than 80, the explanatory text should emphasize the candidate's past experiences and the duration of those experiences that have led to that score. When mentioning any experience duration exceeding 12 months, include in the text only an approximate conversion to years, as the exact data is provided in the attached experience list.
prompts/user_prompt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ The vacancy title is: {job}. The required experience in months is {req_experience}. The score is {puntuacion}. The relevant experience is: {exp}. Explain why the score was obtained.
src/__pycache__/procesador_de_cvs_con_llm.cpython-311.pyc ADDED
Binary file (17 kB). View file
 
src/procesador_de_cvs_con_llm.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import pandas as pd
4
+ import json
5
+ import textwrap
6
+ from scipy import spatial
7
+ from datetime import datetime
8
+ from openai import OpenAI
9
+
10
class ProcesadorCV:
    """End-to-end CV processor: extracts structured experience from a CV with an
    LLM, scores it against a job title via embedding distances, and produces a
    final structured response."""

    def __init__(self, api_key, cv_text, job_text, ner_pre_prompt, system_prompt, user_prompt, ner_schema, response_schema,
                 inference_model="gpt-4o-mini", embeddings_model="text-embedding-3-small"):
        """
        Initializes an instance of the class with the provided parameters.

        Args:
            api_key (str): The API key to authenticate with the OpenAI client.
            cv_text (str): CV content in text format.
            job_text (str): title of the job offer to evaluate.
            ner_pre_prompt (str): "Named Entity Recognition" (NER) instruction for the natural language model.
            system_prompt (str): natural language instruction for the final structured output.
            user_prompt (str): instruction with parameters and data calculated in preprocessing.
            ner_schema (dict): schema for the "structured outputs" call to the OpenAI model for NER.
            response_schema (dict): schema for the final application response.
            inference_model (str, optional): The inference model to use. Default is "gpt-4o-mini".
            embeddings_model (str, optional): The embeddings model to use. Default is "text-embedding-3-small".

        Attributes:
            inference_model (str): Stores the selected inference model.
            embeddings_model (str): Stores the selected embeddings model.
            client (OpenAI): Instance of the OpenAI client initialized with the provided API key.
            cv (str): Stores the provided curriculum vitae text.
            job_text (str): Stores the job offer title used for embedding comparisons.
        """
        self.inference_model = inference_model
        self.embeddings_model = embeddings_model
        self.ner_pre_prompt = ner_pre_prompt
        self.user_prompt = user_prompt
        self.system_prompt = system_prompt
        self.ner_schema = ner_schema
        self.response_schema = response_schema
        self.client = OpenAI(api_key=api_key)
        self.cv = cv_text
        self.job_text = job_text
        print("Cliente inicializado como",self.client)

    def extraer_datos_cv(self, temperature=0.5):
        """
        Extracts structured data from a CV using OpenAI API.

        Args:
            temperature (float, optional): temperature value for the language model. Default is 0.5.
        Returns:
            pd.DataFrame: DataFrame with structured data extracted from the CV
                (one row per position; columns follow the NER schema: company, role, period).
        Raises:
            ValueError: if structured data cannot be extracted from the CV.
        """
        # Function-calling request: the model is forced to emit JSON matching ner_schema.
        response = self.client.chat.completions.create(
            model=self.inference_model,
            temperature=temperature,
            messages=[
                {"role": "system", "content": self.ner_pre_prompt},
                {"role": "user", "content": self.cv}
            ],
            functions=[
                {
                    "name": "extraer_datos_cv",
                    "description": "Extracts table with job titles, company names and periods from a CV.",
                    "parameters": self.ner_schema
                }
            ],
            function_call="auto"
        )

        # Only a function_call response with a non-empty "experience" array is usable.
        if response.choices[0].message.function_call:
            function_call = response.choices[0].message.function_call
            structured_output = json.loads(function_call.arguments)
            if structured_output.get("experience"):
                df_cv = pd.DataFrame(structured_output["experience"])
                return df_cv
            else:
                raise ValueError(f"Unable to extract structured data: {response.choices[0].message.content}")
        else:
            raise ValueError(f"Unable to extract structured data: {response.choices[0].message.content}")


    def procesar_periodos(self, df):
        """
        Process periods in a DataFrame and adds columns with start dates, end dates, and duration in months.
        If there is no end date, the current date is considered.

        Args:
            df (pandas.DataFrame): DataFrame containing a 'period' column with periods in 'YYYYMM-YYYYMM' or 'YYYYMM' format.
        Returns:
            pandas.DataFrame: DataFrame with additional columns 'fec_inicio', 'fec_final', and 'duracion'.
                - 'fec_inicio' (datetime.date): Start date of the period.
                - 'fec_final' (datetime.date): End date of the period.
                - 'duracion' (int): Duration of the period in months.
        """
        # Helper: split 'YYYYMM-YYYYMM' (or open-ended 'YYYYMM') into start/end datetimes.
        def split_period(period):
            dates = period.split('-')
            start_date = datetime.strptime(dates[0], "%Y%m")
            if len(dates) > 1:
                end_date = datetime.strptime(dates[1], "%Y%m")
            else:
                end_date = datetime.now()  # open-ended position: count up to today
            return start_date, end_date

        df[['fec_inicio', 'fec_final']] = df['period'].apply(lambda x: pd.Series(split_period(x)))

        # Keep only the date part (month/year granularity; the day is irrelevant and usually unspecified)
        df['fec_inicio'] = df['fec_inicio'].dt.date
        df['fec_final'] = df['fec_final'].dt.date

        # Add a column with the duration in months
        df['duracion'] = df.apply(
            lambda row: (row['fec_final'].year - row['fec_inicio'].year) * 12 +
                        row['fec_final'].month - row['fec_inicio'].month,
            axis=1
        )

        return df


    def calcular_embeddings(self, df, column='role', model_name='text-embedding-3-small'):
        """
        Calculates the embeddings for a column in a dataframe using the OpenAI API.

        Args:
            df (pandas.DataFrame): DataFrame containing the CV data.
            column (str, optional): Name of the column containing the text to be converted to embeddings. Default is 'role'.
            model_name (str, optional): Name of the embeddings model. Default is 'text-embedding-3-small'.

        Returns:
            pandas.DataFrame: DataFrame with an additional 'embeddings' column containing the generated embeddings.
        """
        # NOTE(review): one API call per row; fine for short CVs, batching would be needed at scale.
        df['embeddings'] = df[column].apply(
            lambda puesto: self.client.embeddings.create(
                input=puesto,
                model=model_name
            ).data[0].embedding
        )
        return df


    def calcular_distancias(self, df, column='embeddings', model_name='text-embedding-3-small'):
        """
        Calculates the cosine distance between the job-offer embedding and those included in a DataFrame column.

        Parameters:
        -----------
        df : pandas.DataFrame
            DataFrame containing the embeddings.
        column : str, optional
            Name of the DataFrame column containing the embeddings. Default is 'embeddings'.
        model_name : str, optional
            OpenAI API embedding model. Default is "text-embedding-3-small".

        Returns:
        --------
        pandas.DataFrame
            DataFrame sorted by distance in ascending order, with distances added as a new
            'distancia' column (the embeddings column itself is dropped in place).
        """
        # Embed the job title once, then compare every position against it.
        response = self.client.embeddings.create(
            input=self.job_text,
            model=model_name
        )
        emb_compare = response.data[0].embedding

        df['distancia'] = df[column].apply(lambda emb: spatial.distance.cosine(emb, emb_compare))
        df.drop(columns=[column], inplace=True)
        df.sort_values(by='distancia', ascending=True, inplace=True)
        return df


    def calcular_puntuacion(self, df, req_experience, positions_cap=4, dist_threshold_low=0.6, dist_threshold_high=0.7):
        """
        Calculates the score of a CV based on its distance table (relative to a given position) and durations.

        Parameters:
        ----------
        df : pandas.DataFrame
            CV data including different experiences with durations and distances previously calculated based on the embeddings of a job position.
        req_experience : float
            Required experience in months for the job position (reference value to calculate a score between 0 and 100 based on different experiences).
        positions_cap : int, optional
            Maximum number of positions to consider for scoring. Defaults to 4.
        dist_threshold_low : float, optional
            Distance between embeddings below which the CV position is considered "equivalent" to the job offer. Defaults to 0.6.
        dist_threshold_high : float, optional
            Distance between embeddings above which the CV position does not score. Defaults to 0.7.

        Returns:
        -------
        pandas.DataFrame
            Original DataFrame with an additional column containing individual scores contributed by each position.
        float
            Total score between 0 and 100.
        """
        # For scoring purposes, cap each position's months at the required experience total
        df['duration_capped'] = df['duracion'].apply(lambda x: min(x, req_experience))
        # Normalize the distance to [0, 1]: 0 at/below the low threshold, 1 at/above the high threshold
        df['adjusted_distance'] = df['distancia'].apply(
            lambda x: 0 if x <= dist_threshold_low else (
                1 if x >= dist_threshold_high else (x - dist_threshold_low) / (dist_threshold_high - dist_threshold_low)
            )
        )
        # Each position scores by its capped duration weighted by the inverse of the distance (closer = higher score)
        df['position_score'] = round(((1 - df['adjusted_distance']) * (df['duration_capped']/req_experience) * 100), 2)
        # Discard positions beyond the high distance threshold (score 0), then sort by score
        df.loc[df['distancia'] >= dist_threshold_high, 'position_score'] = 0
        df = df.sort_values(by='position_score', ascending=False)
        # Keep only the top-scoring positions (positions_cap); zero out the rest
        df.iloc[positions_cap:, df.columns.get_loc('position_score')] = 0
        # Total the scores (should never exceed 100, but cap it to be safe) and round to two decimals
        total_score = round(min(df['position_score'].sum(), 100), 2)
        return df, total_score

    def filtra_experiencia_relevante(self, df):
        """
        Filters the relevant experiences from the dataframe and returns them in dictionary format.

        Args:
            df (pandas.DataFrame): DataFrame with complete experience information.
        Returns:
            dict: Dictionary (column -> list of values) with only the experiences that scored above zero;
                scoring-internal columns are dropped.
        """
        df_experiencia = df[df['position_score'] > 0].copy()
        df_experiencia.drop(columns=['period', 'fec_inicio', 'fec_final',
                                     'distancia', 'duration_capped', 'adjusted_distance'], inplace=True)
        experiencia_dict = df_experiencia.to_dict(orient='list')
        return experiencia_dict

    def llamada_final(self, req_experience, puntuacion, dict_experiencia):
        """
        Makes the final call to the language model to generate the final response.

        Args:
            req_experience (int): Required experience in months for the job position.
            puntuacion (float): Total score of the CV.
            dict_experiencia (dict): Dictionary with relevant experiences.
        Returns:
            dict: Dictionary with the final response, matching response_schema.
        Raises:
            ValueError: If no response is generated by the language model.
        """
        messages = [
            {
                "role": "system",
                "content": self.system_prompt
            },
            {
                "role": "user",
                "content": self.user_prompt.format(job=self.job_text, req_experience=req_experience,puntuacion=puntuacion, exp=dict_experiencia)
            }
        ]

        # Force a structured reply through function calling with the response schema.
        functions = [
            {
                "name": "respuesta_formateada",
                "description": "Returns an object with score, experience and description of the experience",
                "parameters": self.response_schema
            }
        ]

        response = self.client.chat.completions.create(
            model=self.inference_model,
            temperature=0.5,
            messages=messages,
            functions=functions,
            function_call={"name": "respuesta_formateada"}
        )

        if response.choices[0].message.function_call:
            function_call = response.choices[0].message.function_call
            structured_output = json.loads(function_call.arguments)
            print("Response:\n", json.dumps(structured_output, indent=4, ensure_ascii=False))
            wrapped_description = textwrap.fill(structured_output['experience summary'], width=120)
            print(f"Experience summary:\n{wrapped_description}")
            return structured_output
        else:
            raise ValueError(f"Error. No response was generated:\n {response.choices[0].message.content}")

    def procesar_cv_completo(self, req_experience, positions_cap, dist_threshold_low, dist_threshold_high):
        '''
        Processes a CV end to end and calculates the final score.

        Pipeline: NER extraction -> period parsing -> embeddings -> distances
        -> scoring -> relevant-experience filter -> final LLM call.

        Args:
            req_experience (int): Required experience in months for the job position.
            positions_cap (int): Maximum number of positions to consider for scoring.
            dist_threshold_low (float): Distance limit to consider a position equivalent.
            dist_threshold_high (float): Distance limit to consider a position not relevant.
        Returns:
            dict: Dictionary with the final answer.
        '''
        df_datos_estructurados_cv = self.extraer_datos_cv()
        df_datos_estructurados_cv = self.procesar_periodos(df_datos_estructurados_cv)
        df_con_embeddings = self.calcular_embeddings(df_datos_estructurados_cv)
        df_con_distancias = self.calcular_distancias(df_con_embeddings)
        df_puntuaciones, puntuacion = self.calcular_puntuacion(df_con_distancias,
                                                               req_experience=req_experience,
                                                               positions_cap=positions_cap,
                                                               dist_threshold_low=dist_threshold_low,
                                                               dist_threshold_high=dist_threshold_high)
        dict_experiencia = self.filtra_experiencia_relevante(df_puntuaciones)
        dict_respuesta = self.llamada_final(req_experience, puntuacion, dict_experiencia)
        return dict_respuesta