English version
Browse files- app.py +161 -0
- cv_examples/reddgr_cv.txt +88 -0
- json/ner_schema.json +21 -0
- json/response_schema.json +38 -0
- pkl/df_ejemplos_con_distancia.pkl +3 -0
- pkl/df_experiencia.pkl +3 -0
- prompts/ner_pre_prompt.txt +1 -0
- prompts/system_prompt.txt +1 -0
- prompts/user_prompt.txt +1 -0
- src/__pycache__/procesador_de_cvs_con_llm.cpython-311.pyc +0 -0
- src/procesador_de_cvs_con_llm.py +292 -0
app.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
import gradio as gr
|
5 |
+
sys.path.append('src')
|
6 |
+
from procesador_de_cvs_con_llm import ProcesadorCV
|
7 |
+
|
8 |
+
use_dotenv = False
|
9 |
+
if use_dotenv:
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
load_dotenv("../../../../../../../apis/.env")
|
12 |
+
api_key = os.getenv("OPENAI_API_KEY")
|
13 |
+
|
14 |
+
else:
|
15 |
+
api_key = os.getenv("OPENAI_API_KEY")
|
16 |
+
|
17 |
+
unmasked_chars = 8
|
18 |
+
masked_key = api_key[:unmasked_chars] + '*' * (len(api_key) - unmasked_chars*2) + api_key[-unmasked_chars:]
|
19 |
+
print(f"API key: {masked_key}")
|
20 |
+
|
def process_cv(job_text, cv_text, req_experience, req_experience_unit, positions_cap, dist_threshold_low, dist_threshold_high):
    """
    Validate the UI inputs and run the full CV-processing pipeline.

    Args:
        job_text (str): title of the vacancy to evaluate the CV against.
        cv_text (str): CV content in plain text.
        req_experience (int): required experience, in months or years.
        req_experience_unit (str): "months" or "years".
        positions_cap (int): maximum number of positions to extract.
        dist_threshold_low (float): embedding distance below which a position
            counts as equivalent to the vacancy.
        dist_threshold_high (float): embedding distance above which a position
            is considered irrelevant.

    Returns:
        dict: the pipeline response, or {"error": message} when validation or
        processing fails (Gradio renders either in the JSON output widget).
    """
    if dist_threshold_low >= dist_threshold_high:
        return {"error": "dist_threshold_low must be lower than dist_threshold_high."}

    if not isinstance(cv_text, str) or not cv_text.strip():
        return {"error": "Please provide the CV or upload a file."}

    # Symmetric guard: an empty vacancy title would make the embedding
    # comparison meaningless, so reject it up front like the empty CV.
    if not isinstance(job_text, str) or not job_text.strip():
        return {"error": "Please provide the vacancy title."}

    # Convert the required experience to months if it was entered in years
    if req_experience_unit == "years":
        req_experience = req_experience * 12

    try:
        procesador = ProcesadorCV(api_key, cv_text, job_text, ner_pre_prompt,
                                  system_prompt, user_prompt, ner_schema, response_schema)
        dict_respuesta = procesador.procesar_cv_completo(
            req_experience=req_experience,
            positions_cap=positions_cap,
            dist_threshold_low=dist_threshold_low,
            dist_threshold_high=dist_threshold_high
        )
        return dict_respuesta
    except Exception as e:
        # Surface any pipeline failure to the UI as a JSON error payload
        # instead of crashing the Gradio callback.
        return {"error": f"Processing error: {str(e)}"}
# Execution parameters:
job_text = "Generative AI engineer"


def _leer_fichero(ruta):
    # Helper: read a UTF-8 text file and return its full contents.
    with open(ruta, 'r', encoding='utf-8') as fichero:
        return fichero.read()


cv_sample_path = 'cv_examples/reddgr_cv.txt'  # Path to a sample CV text file
cv_text = _leer_fichero(cv_sample_path)

# Prompts:
ner_pre_prompt = _leer_fichero('prompts/ner_pre_prompt.txt')
system_prompt = _leer_fichero('prompts/system_prompt.txt')
user_prompt = _leer_fichero('prompts/user_prompt.txt')

# JSON schemas:
ner_schema = json.loads(_leer_fichero('json/ner_schema.json'))
response_schema = json.loads(_leer_fichero('json/response_schema.json'))

# Sample file used to pre-fill the "Examples" block at the bottom of the UI:
cv_example = _leer_fichero('cv_examples/reddgr_cv.txt')

# Default parameters used to reset the interface and for the predefined examples
default_parameters = [4, "years", 10, 0.5, 0.7]

# CSS to truncate the example text shown in the UI ("Examples" block at the bottom):
css = """
table tbody tr {
height: 2.5em; /* Set a fixed height for the rows */
overflow: hidden; /* Hide overflow content */
}

table tbody tr td {
overflow: hidden; /* Ensure content within cells doesn't overflow */
text-overflow: ellipsis; /* Add ellipsis for overflowing text */
white-space: nowrap; /* Prevent text from wrapping */
vertical-align: middle; /* Align text vertically within the fixed height */
}
"""
# Gradio interface:
with gr.Blocks(css=css) as interface:
    # Input widgets
    job_text_input = gr.Textbox(label="Vacancy Title", lines=1, placeholder="Enter the vacancy title")
    gr.Markdown("Required Experience")
    with gr.Row():
        # Amount + unit pair; labels hidden because the Markdown line above acts as the label
        req_experience_input = gr.Number(label="Required Experience", value=default_parameters[0], precision=0, elem_id="req_exp", show_label=False)
        req_experience_unit = gr.Dropdown(label="Period", choices=["months", "years"], value=default_parameters[1], elem_id="req_exp_unit", show_label=False)
    cv_text_input = gr.Textbox(label="CV in Text Format", lines=5, max_lines=5, placeholder="Enter the CV text")

    # Advanced options hidden inside an "Accordion" widget
    with gr.Accordion("Advanced options", open=False):
        positions_cap_input = gr.Number(label="Maximum number of positions to extract", value=default_parameters[2], precision=0)
        dist_threshold_low_slider = gr.Slider(
            label="Minimum embedding distance threshold (equivalent position)",
            minimum=0, maximum=1, value=default_parameters[3], step=0.05
        )
        dist_threshold_high_slider = gr.Slider(
            label="Maximum embedding distance threshold (irrelevant position)",
            minimum=0, maximum=1, value=default_parameters[4], step=0.05
        )

    submit_button = gr.Button("Process")
    clear_button = gr.Button("Clear")

    output_json = gr.JSON(label="Result")

    # Predefined examples (rendered at the bottom of the UI); each row supplies
    # all seven inputs: job title, CV text, then the five default parameters.
    examples = gr.Examples(
        examples=[
            ["Supermarket cashier", "Deli worker since 2021. Previously worked 2 months as a waiter in a tapas bar."] + default_parameters,
            ["Generative AI Engineer", cv_example] + default_parameters
        ],
        inputs=[job_text_input, cv_text_input, req_experience_input, req_experience_unit, positions_cap_input, dist_threshold_low_slider, dist_threshold_high_slider]
    )

    # "Process" button: run the full pipeline and show the JSON result
    submit_button.click(
        fn=process_cv,
        inputs=[
            job_text_input,
            cv_text_input,
            req_experience_input,
            req_experience_unit,
            positions_cap_input,
            dist_threshold_low_slider,
            dist_threshold_high_slider
        ],
        outputs=output_json
    )

    # "Clear" button: reset both text boxes and restore the five defaults
    clear_button.click(
        fn=lambda: ("","",*default_parameters),
        inputs=[],
        outputs=[
            job_text_input,
            cv_text_input,
            req_experience_input,
            req_experience_unit,
            positions_cap_input,
            dist_threshold_low_slider,
            dist_threshold_high_slider
        ]
    )

    # Footer
    gr.Markdown("""
    <footer>
    <p>You can view the complete code for this app and the explanatory notebooks on
    <a href='https://github.com/reddgr/procesador-de-curriculos-cv' target='_blank'>GitHub</a></p>
    <p>© 2024 <a href='https://talkingtochatbots.com' target='_blank'>talkingtochatbots.com</a></p>
    </footer>
    """)

# Launch the application:
if __name__ == "__main__":
    interface.launch()
|
cv_examples/reddgr_cv.txt
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
www.linkedin.com/in/davidgonzalezromero (LinkedIn)
|
2 |
+
talkingtochatbots.com (Personal)
|
3 |
+
Top Skills
|
4 |
+
Information and Communications Technology (ICT)
|
5 |
+
Data Science
|
6 |
+
Artificial Intelligence (AI)
|
7 |
+
Languages
|
8 |
+
English (Full Professional)
|
9 |
+
French (Limited Working)
|
10 |
+
Spanish (Native or Bilingual)
|
11 |
+
Certifications
|
12 |
+
Watson Analytics - Level 1
|
13 |
+
Retail Industry Jumpstart
|
14 |
+
Data Science Foundations - Level 1
|
15 |
+
Generative AI Imaging: What Creative Pros Need to Know
|
16 |
+
Prompt Engineering: How to Talk to the AIs
|
17 |
+
Honors-Awards
|
18 |
+
Desafío Entorno Pre Mercado 2023
|
19 |
+
David González Romero
|
20 |
+
ICT Engineer | Business Consultant | Licensed Financial ServicesProfessional | Web Publisher | Profile not suggested by AI | 499connections | reddgr
|
21 |
+
Greater Madrid Metropolitan Area
|
22 |
+
Summary
|
23 |
+
I am an Information and Communications Technology Engineer andBusiness Consultant with over 15 years of experience in enterprisesoftware solutions, consulting, business analytics, and data science,across multiple countries and cross-functional teams.Over the last two decades, I have enjoyed the privilege of travelingaround the world, teaming up with outstanding professionals,leading teams, developing business opportunities, and buildingand managing longstanding client relationships. Throughout myconsulting and client relationship management career, I haveprimarily served clients in the retail industry, financial services,telecommunications, and the public sector, developing skills andknowledge across diverse domains such as marketing, finance, riskmanagement, software engineering, and data science.In academia, I completed MSc studies in telecommunications,electrical, and computer engineering, with research in ubiquitouscomputing, the Internet of Things, and computer security. Currently,I'm pursuing a master's degree in Artificial Intelligence Applied toFinancial Markets, and managing Talking to Chatbots, a websitededicated to generative AI projects, popular culture, and education,available at https://talkingtochatbots.com.
|
24 |
+
Experience
|
25 |
+
Talking to Chatbots, by Reddgr
|
26 |
+
Web Publisher and Generative AI Researcher
|
27 |
+
October 2006 - Present (18 years 3 months)
|
28 |
+
Spain
|
29 |
+
Developed and managed personal projects on the Internet since 2006.Currently managing the internet domains https://talkingtochatbots.com(website) and https://reddgr.com (search engine and social media keyword:
|
30 |
+
Page 1 of 6
|
31 |
+
“reddgr” stands for “David González Romero network”). Talking to Chatbotsis a knowledge hub that compiles LLM prompts and curated conversations,serving as an entertainment and educational platform for AI hobbyists,learners, and professionals.Since 2023, active developer and contributor in open-source and proprietarygenerative AI platforms and communities (reddgr.com/gpts huggingface.com/reddgr)
|
32 |
+
Acoustic
|
33 |
+
Principal Consultant | Martech SaaS
|
34 |
+
June 2020 - May 2023 (3 years)
|
35 |
+
Spain
|
36 |
+
Advised retail companies on implementing profitable, competitive pricingstrategies and promotions backed by DemandTec software as a service(SaaS) solutions and data science. Primary focus on developing and leadingsuccessful client relationships. Dedicated to continuously improving Acousticproducts and ensuring that clients and prospects receive excellent servicefrom Acoustic’s team and business partners. This involved delivering andmanaging: consultative selling, employee recruitment, training and mentoring,solution implementations, SaaS managed services (data integration andmodeling), technical support, customer relationship management (CRM), andanalytics consulting services.Companies served by the team of consultants and account managers I ledinclude: leading Spanish retailer; Italian supermarket cooperative; multinationalretail company operating in the Middle East, Eastern Europe and Africa;leading Italian retail group; Swedish grocery retailer; online retailer operating inthe UK and Ireland; British retailer with multinational presence; Finland-basedmultinational retail company; leading Norwegian retailer; retail cooperativein the Nordic countries; major multinational retail group operating in SouthAmerica.Participated in or led pre-sales activities (RFI, RFP, POC, RFQ) for variousnational and multinational retailers based in Southern Europe, Central Europe,Nordics, the Middle East and Australia.
|
37 |
+
IBM
|
38 |
+
7 years 3 months
|
39 |
+
Engagement Manager, in support of Acoustic | B2B SaaS Retail Analytics
|
40 |
+
Page 2 of 6
|
41 |
+
July 2019 - May 2020 (11 months)
|
42 |
+
Madrid, Community of Madrid, Spain
|
43 |
+
Employed by IBM exclusively in support of Acoustic, new company founded in2019 by a team of IBM Watson Marketing & Commerce software specialists,led by former IBM executives and funded by private equity. Specialist inAcoustic Pricing and Promotion solutions (DemandTec), acting as AcousticSoftware Services team leader in Spain, and as software delivery EngagementManager and Subject Matter Expert for pre-sales and services projectsworldwide. Acoustic clients I worked with include: leading Spanish retailer, supermarketcooperative based in central and southern Italy, multinational retail companyoperating in Middle East, Eastern Europe and Africa, leading Italian retailgroup.
|
44 |
+
Engagement Manager | B2B SaaS Retail Analytics
|
45 |
+
September 2018 - June 2019 (10 months)
|
46 |
+
Madrid, Community of Madrid, Spain
|
47 |
+
Managing services projects and SaaS engagements for IBM WatsonCommerce solutions. As cognitive solutions specialist and SME in retail pricingand business analytics, I helped IBM clients succeed by coordinating allcomponents of the IBM Omni-Channel Pricing (DemandTec) cloud-basedsolution, including: solution design and PoC's, solution implementation anddelivery, data science services, data integration services, SaaS operations,technical support, product management, benefits assessments, and analyticalconsulting services.IBM clients I worked with as Engagement Manager or SME include: leadingSpanish retailer, multinational retail company operating in Middle East, EasternEurope and Africa, leading Italian retail group, Italian supermarket cooperative.
|
48 |
+
Relationship Manager | Cognitive Solutions SaaS
|
49 |
+
January 2015 - August 2018 (3 years 8 months)
|
50 |
+
Madrid, Community of Madrid, Spain
|
51 |
+
Specialist in the IBM Omni-channel Merchandising (DemandTec) solutionfor the retail industry, including Price Optimization, Promotion Planning andDynamic Pricing software.Managed the day to day relationship with assigned clients (€ 2 million ARR),prospecting and coordinating the delivery of SaaS platform enablementservices (data integration and data science), technical support, project
|
52 |
+
Page 3 of 6
|
53 |
+
management, and end-user enablement. Collaborated in other internationalprojects as DemandTec and pricing SME, delivering training and projectguidance to client end-users and business partners.IBM clients I worked with include: multiple Merchandising divisions of leadingSpanish retailer, multinational supermarket chain based in Spain, supermarketco-operative based in Denmark, multinational retail company operating inMiddle East, Eastern Europe and Africa, Russian supermarket chain, Finland-based retail company, British consumer co-operative.
|
54 |
+
Business Analyst | B2B SaaS Retail Analytics
|
55 |
+
March 2013 - December 2014 (1 year 10 months)
|
56 |
+
Madrid, Community of Madrid, Spain
|
57 |
+
Delivery of IBM Enterprise Marketing Management implementation projects,including DemandTec Price Optimization, Markdown Optimization andAssortment Optimization SaaS solutions. Delivered business and technicalguidance to pricing managers, category managers, buyers and businessconsultants in solution architecture, problem management resolution andchange management. Specialist in performing data analysis on the datascience, optimization and business analytics tools and services included in thesolution.IBM clients I worked with include: leading Spanish retailer, multinationalfashion retailer based in Spain, US-based sports retailer, supermarket co-operative based in Denmark.
|
58 |
+
KPMG España
|
59 |
+
Senior Consultant | Financial Risk Management
|
60 |
+
December 2010 - March 2013 (2 years 4 months)
|
61 |
+
Madrid, Community of Madrid, Spain
|
62 |
+
Senior Consultant in Financial Risk Management. Main projects:• Corporate and Investment Banking financial reporting: data mining andanalytics for Finance and Business Performance & Analytics department atleading multinational banking and financial services company. Developed andmaintained financial reports and insights for CFO, senior management andfront office.• Retail and Business Banking credit risk modeling: led user acceptancetesting and test case development for credit risk models and EBA-compliantreporting (COREP) of capital requirements. Led UAT development team and
|
63 |
+
Page 4 of 6
|
64 |
+
acted as a link between IT teams and Risk department. Supported credit riskmodeling team on early implementation of internal ratings-based (IRB) creditrisk models in compliance with Basel Framework on banking supervision.
|
65 |
+
MBD Analytics
|
66 |
+
Business Intelligence Consultant
|
67 |
+
February 2010 - December 2010 (11 months)
|
68 |
+
Alcobendas, Community of Madrid, Spain
|
69 |
+
Marketing Business Intelligence consulting services. Client-facing consultantfor Competitive Intelligence department at a multinational telecommunicationscompany. Responsible for the development of custom BI reporting solutionsand presenting monthly business reports. The reports included insights,analysis and forecasting of KPIs measuring customer activity and value inconsumer and enterprise telecommunication services.
|
70 |
+
Grupo Eneas
|
71 |
+
Cost Analyst
|
72 |
+
November 2009 - December 2009 (2 months)
|
73 |
+
Madrid, Community of Madrid, Spain
|
74 |
+
Telecommunications cost optimization project for a regional governmentagency of Spain. Gathered and analyzed invoice and contract data in supportof a Request for Quotation (RFQ) to a selection of telecommunication serviceproviders.
|
75 |
+
Deloitte España
|
76 |
+
IT Strategy Consultant
|
77 |
+
September 2008 - January 2009 (5 months)
|
78 |
+
Madrid, Community of Madrid, Spain
|
79 |
+
Management consulting intern, collaborating on IT management projects forinsurance, banking and public sector companies and institutions based inSpain. Collaborated on research, documentation, elaboration of proposals andtechnical support for IT management consulting projects, including: IT strategicplanning and market research, IT service management, IT integration, and ITcost optimization.
|
80 |
+
Education
|
81 |
+
Illinois Institute of Technology
|
82 |
+
Research Scholar, Electrical & Computer Engineering · (2009 - 2009)
|
83 |
+
Page 5 of 6
|
84 |
+
Universidad Politécnica de Madrid
|
85 |
+
Master of Science (MSc), Telecommunications Engineer · (2003 - 2009)
|
86 |
+
Instituto BME
|
87 |
+
Master's degree, Artificial Intelligence Applied to Financial Markets(MIAX) · (October 2023 - May 2025)
|
88 |
+
Page 6 of 6
|
json/ner_schema.json
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"type": "object",
|
3 |
+
"properties": {
|
4 |
+
"experience": {
|
5 |
+
"type": "array",
|
6 |
+
"items": {
|
7 |
+
"type": "object",
|
8 |
+
"properties": {
|
9 |
+
"company": {"type": "string"},
|
10 |
+
"role": {"type": "string"},
|
11 |
+
"period": {
|
12 |
+
"type": "string",
|
13 |
+
"description": "'YYYYMM-YYYYMM' format or simply 'YYYYMM' if no end date is given."
|
14 |
+
}
|
15 |
+
},
|
16 |
+
"required": ["company", "role", "period"]
|
17 |
+
}
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"required": ["experience"]
|
21 |
+
}
|
json/response_schema.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"type": "object",
|
3 |
+
"properties": {
|
4 |
+
"score": {
|
5 |
+
"type": "number"
|
6 |
+
},
|
7 |
+
"relevant experience": {
|
8 |
+
"type": "array",
|
9 |
+
"items": {
|
10 |
+
"type": "object",
|
11 |
+
"properties": {
|
12 |
+
"company": {
|
13 |
+
"type": "string"
|
14 |
+
},
|
15 |
+
"role": {
|
16 |
+
"type": "string"
|
17 |
+
},
|
18 |
+
"duration": {
|
19 |
+
"type": "integer"
|
20 |
+
}
|
21 |
+
},
|
22 |
+
"required": [
|
23 |
+
"company",
|
24 |
+
"role",
|
25 |
+
"duration"
|
26 |
+
]
|
27 |
+
}
|
28 |
+
},
|
29 |
+
"experience summary": {
|
30 |
+
"type": "string"
|
31 |
+
}
|
32 |
+
},
|
33 |
+
"required": [
|
34 |
+
"score",
|
35 |
+
"relevant experience",
|
36 |
+
"experience summary"
|
37 |
+
]
|
38 |
+
}
|
pkl/df_ejemplos_con_distancia.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:41724c6196d4862a3b08d417024549c4f4f644fdc031d7eda324dc715fa14824
|
3 |
+
size 1623
|
pkl/df_experiencia.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:726aabfa90eddf805b685ab14231dc18fbdabf1033d7c4649dddcaa0f2936454
|
3 |
+
size 1501
|
prompts/ner_pre_prompt.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
You are a resume processor that extracts job titles, company names, and their corresponding periods. Use JSON format in the output with the keys "company", "role", and "period". For the period, consider any date format or range of dates included in the text. An example of a date format in the input is "October 2023 / March 2024". The value for the "period" key should be a string with two elements in YYYYMM format separated by a hyphen, for example "202310-202403", or only one element if no end date is identified.
|
prompts/system_prompt.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
You are a curriculum vitae processor that receives a job offer, a filtered curriculum vitae, relevant previous experience, a precalculated score for the curriculum between 0 and 100, and a required experience parameter in months. The score has been calculated using an algorithm that utilizes embedding distances between each role and the job offer's definition, as well as the duration of each role and its relationship with the required experience parameter. It returns an object with a predefined schema, including exactly the provided score, the given list of experiences, and additionally a brief explanatory text about the candidate's experience and why they have obtained the given score. It is important that the explanatory text is coherent with the score. For example, if the score is greater than 80, the explanatory text should emphasize the candidate's past experiences and the duration of those experiences that have led to that score. When mentioning any experience duration exceeding 12 months, include in the text only an approximate conversion to years, as the exact data is provided in the attached experience list.
|
prompts/user_prompt.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
The vacancy title is: {job}. The required experience in months is {req_experience}. The score is {puntuacion}. The relevant experience is: {exp}. Explain why the score was obtained.
|
src/__pycache__/procesador_de_cvs_con_llm.cpython-311.pyc
ADDED
Binary file (17 kB). View file
|
|
src/procesador_de_cvs_con_llm.py
ADDED
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import os
|
3 |
+
import pandas as pd
|
4 |
+
import json
|
5 |
+
import textwrap
|
6 |
+
from scipy import spatial
|
7 |
+
from datetime import datetime
|
8 |
+
from openai import OpenAI
|
9 |
+
|
10 |
+
class ProcesadorCV:
|
11 |
+
|
def __init__(self, api_key, cv_text, job_text, ner_pre_prompt, system_prompt, user_prompt, ner_schema, response_schema,
             inference_model="gpt-4o-mini", embeddings_model="text-embedding-3-small"):
    """
    Set up an OpenAI-backed CV processor for one CV / job-offer pair.

    Args:
        api_key (str): API key used to authenticate the OpenAI client.
        cv_text (str): CV content in plain text.
        job_text (str): title of the job offer to evaluate the CV against.
        ner_pre_prompt (str): Named Entity Recognition instruction for the model.
        system_prompt (str): instruction shaping the final structured output.
        user_prompt (str): template filled with parameters computed in preprocessing.
        ner_schema (dict): structured-outputs schema for the NER call.
        response_schema (dict): schema of the final application response.
        inference_model (str, optional): chat model to use. Defaults to "gpt-4o-mini".
        embeddings_model (str, optional): embeddings model to use. Defaults to
            "text-embedding-3-small".

    Attributes:
        inference_model, embeddings_model: stored model names.
        ner_pre_prompt, system_prompt, user_prompt: stored prompt texts.
        ner_schema, response_schema: stored JSON schemas.
        cv, job_text: stored input documents.
        client (OpenAI): client instance shared by all API calls.
    """
    # Model selection
    self.inference_model = inference_model
    self.embeddings_model = embeddings_model
    # Prompts and schemas driving the LLM calls
    self.ner_pre_prompt = ner_pre_prompt
    self.system_prompt = system_prompt
    self.user_prompt = user_prompt
    self.ner_schema = ner_schema
    self.response_schema = response_schema
    # Input documents
    self.cv = cv_text
    self.job_text = job_text
    # OpenAI client used for both inference and embeddings
    self.client = OpenAI(api_key=api_key)
    print("Cliente inicializado como", self.client)
def extraer_datos_cv(self, temperature=0.5):
    """
    Extract structured work-experience data from the CV via the OpenAI API.

    Issues a chat completion with the NER pre-prompt as the system message and
    the raw CV as the user message, forcing structure through a function-call
    schema (``self.ner_schema``).

    Args:
        temperature (float, optional): sampling temperature for the model.
            Defaults to 0.5.

    Returns:
        pd.DataFrame: one row per position extracted from the CV.

    Raises:
        ValueError: if the model response carries no usable structured data.
    """
    respuesta = self.client.chat.completions.create(
        model=self.inference_model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": self.ner_pre_prompt},
            {"role": "user", "content": self.cv}
        ],
        functions=[
            {
                "name": "extraer_datos_cv",
                "description": "Extracts table with job titles, company names and periods from a CV.",
                "parameters": self.ner_schema
            }
        ],
        function_call="auto"
    )

    mensaje = respuesta.choices[0].message
    # Both failure modes (no function call, empty "experience" array) raise
    # the same error, so they share a single exit point below.
    if mensaje.function_call:
        datos = json.loads(mensaje.function_call.arguments)
        experiencia = datos.get("experience")
        if experiencia:
            return pd.DataFrame(experiencia)
    raise ValueError(f"Unable to extract structured data: {respuesta.choices[0].message.content}")
def procesar_periodos(self, df):
    """
    Parse the 'period' column and add start-date, end-date and duration columns.

    Periods come as 'YYYYMM-YYYYMM' or bare 'YYYYMM'; when no end date is
    present, the current date is used.

    Args:
        df (pandas.DataFrame): frame with a 'period' column in the format above.

    Returns:
        pandas.DataFrame: the same frame with three extra columns:
            - 'fec_inicio' (datetime.date): period start (day pinned to the 1st).
            - 'fec_final' (datetime.date): period end (day pinned to the 1st).
            - 'duracion' (int): duration in whole months.
    """
    def _limites(periodo):
        # Split 'YYYYMM-YYYYMM' (or bare 'YYYYMM') into datetime bounds;
        # an open-ended period runs until "now".
        partes = periodo.split('-')
        inicio = datetime.strptime(partes[0], "%Y%m")
        final = datetime.strptime(partes[1], "%Y%m") if len(partes) > 1 else datetime.now()
        return inicio, final

    df[['fec_inicio', 'fec_final']] = df['period'].apply(lambda p: pd.Series(_limites(p)))

    # Keep only the date part: the day is irrelevant (strptime pins it to the 1st)
    df['fec_inicio'] = df['fec_inicio'].dt.date
    df['fec_final'] = df['fec_final'].dt.date

    # Duration in whole months, year difference folded into months
    df['duracion'] = df.apply(
        lambda fila: (fila['fec_final'].year - fila['fec_inicio'].year) * 12
        + fila['fec_final'].month - fila['fec_inicio'].month,
        axis=1
    )

    return df
def calcular_embeddings(self, df, column='role', model_name=None):
    """
    Calculate embeddings for a DataFrame column using the OpenAI API.

    Args:
        df (pandas.DataFrame): DataFrame containing the CV data.
        column (str, optional): name of the column whose text is embedded.
            Defaults to 'role'.
        model_name (str, optional): embeddings model name. Defaults to None,
            which uses ``self.embeddings_model`` (previously the default was a
            hard-coded literal that silently ignored the model chosen in
            ``__init__``).

    Returns:
        pandas.DataFrame: the same frame with an extra 'embeddings' column
        holding one embedding vector per row.
    """
    if model_name is None:
        # Honor the embeddings model selected at construction time
        model_name = self.embeddings_model
    # One API call per row; each response carries a single embedding
    df['embeddings'] = df[column].apply(
        lambda puesto: self.client.embeddings.create(
            input=puesto,
            model=model_name
        ).data[0].embedding
    )
    return df
142 |
+
def calcular_distancias(self, df, column='embeddings', model_name='text-embedding-3-small'):
    """
    Computes the cosine distance between the job-offer embedding and each
    embedding stored in a DataFrame column.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing the embeddings.
    column : str, optional
        Name of the DataFrame column containing the embeddings. Default is 'embeddings'.
    model_name : str, optional
        OpenAI API embedding model. Default is "text-embedding-3-small".

    Returns:
    --------
    pandas.DataFrame
        DataFrame sorted by distance in ascending order; the embeddings column
        is replaced by a 'distancia' column with the computed distances.
    """
    # Embed the job description once: it is the reference vector for every row.
    job_embedding = self.client.embeddings.create(
        input=self.job_text,
        model=model_name
    ).data[0].embedding

    df['distancia'] = [
        spatial.distance.cosine(vector, job_embedding) for vector in df[column]
    ]
    # The raw embeddings are no longer needed once distances are computed.
    df.drop(columns=[column], inplace=True)
    df.sort_values(by='distancia', ascending=True, inplace=True)
    return df
|
167 |
+
|
168 |
+
|
169 |
+
def calcular_puntuacion(self, df, req_experience, positions_cap=4, dist_threshold_low=0.6, dist_threshold_high=0.7):
    """
    Calculates the score of a CV based on its distance table (relative to a
    given position) and durations.

    Parameters:
    ----------
    df : pandas.DataFrame
        CV data including experiences with durations and distances previously
        calculated from the embeddings of a job position.
    req_experience : float
        Required experience in months for the job position (reference value to
        compute a 0-100 score from the individual experiences).
    positions_cap : int, optional
        Maximum number of positions to consider for scoring. Defaults to 4.
    dist_threshold_low : float, optional
        Distance below which a CV position is considered "equivalent" to the
        job offer. Defaults to 0.6.
    dist_threshold_high : float, optional
        Distance above which a CV position does not score. Defaults to 0.7.

    Returns:
    -------
    pandas.DataFrame
        Original DataFrame with an additional column of per-position scores.
    float
        Total score between 0 and 100.
    """
    # Each position can contribute at most the required number of months.
    df['duration_capped'] = df['duracion'].clip(upper=req_experience)
    # Rescale distance linearly to [0, 1] between the two thresholds
    # (0 at or below the low threshold, 1 at or above the high one).
    df['adjusted_distance'] = (
        (df['distancia'] - dist_threshold_low) / (dist_threshold_high - dist_threshold_low)
    ).clip(lower=0, upper=1)
    # Closer matches and longer (capped) tenures score higher.
    df['position_score'] = round((1 - df['adjusted_distance']) * (df['duration_capped'] / req_experience) * 100, 2)
    # Positions at or beyond the high threshold are discarded outright.
    df.loc[df['distancia'] >= dist_threshold_high, 'position_score'] = 0
    df = df.sort_values(by='position_score', ascending=False)
    # Only the best `positions_cap` positions keep their score.
    df.iloc[positions_cap:, df.columns.get_loc('position_score')] = 0
    # The sum should never exceed 100, but cap it defensively and round.
    total_score = round(min(df['position_score'].sum(), 100), 2)
    return df, total_score
|
209 |
+
|
210 |
+
def filtra_experiencia_relevante(self, df):
    """
    Filters the relevant experiences from the dataframe and returns them in
    dictionary format.

    Args:
        df (pandas.DataFrame): DataFrame with complete experience information.

    Returns:
        dict: Column-oriented dictionary with the relevant experiences.
    """
    # Internal bookkeeping columns that should not reach the final LLM prompt.
    internal_cols = ['period', 'fec_inicio', 'fec_final',
                     'distancia', 'duration_capped', 'adjusted_distance']
    # Keep only positions that actually contributed to the score.
    relevantes = df.loc[df['position_score'] > 0].drop(columns=internal_cols)
    return relevantes.to_dict(orient='list')
|
223 |
+
|
224 |
+
def llamada_final(self, req_experience, puntuacion, dict_experiencia):
    """
    Makes the final call to the language model to generate the final response.

    Args:
        req_experience (int): Required experience in months for the job position.
        puntuacion (float): Total score of the CV.
        dict_experiencia (dict): Dictionary with relevant experiences.

    Returns:
        dict: Dictionary with the final response.

    Raises:
        ValueError: If no response is generated by the language model.
    """
    user_content = self.user_prompt.format(
        job=self.job_text,
        req_experience=req_experience,
        puntuacion=puntuacion,
        exp=dict_experiencia
    )
    # Forcing the function call guarantees a JSON payload matching response_schema.
    response = self.client.chat.completions.create(
        model=self.inference_model,
        temperature=0.5,
        messages=[
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": user_content},
        ],
        functions=[
            {
                "name": "respuesta_formateada",
                "description": "Returns an object with score, experience and description of the experience",
                "parameters": self.response_schema,
            }
        ],
        function_call={"name": "respuesta_formateada"},
    )

    function_call = response.choices[0].message.function_call
    if not function_call:
        raise ValueError(f"Error. No response was generated:\n {response.choices[0].message.content}")

    structured_output = json.loads(function_call.arguments)
    print("Response:\n", json.dumps(structured_output, indent=4, ensure_ascii=False))
    wrapped_description = textwrap.fill(structured_output['experience summary'], width=120)
    print(f"Experience summary:\n{wrapped_description}")
    return structured_output
|
270 |
+
|
271 |
+
def procesar_cv_completo(self, req_experience, positions_cap, dist_threshold_low, dist_threshold_high):
    '''
    Processes a CV end to end and calculates the final score.

    Args:
        req_experience (int): Required experience in months for the job position.
        positions_cap (int): Maximum number of positions to consider for scoring.
        dist_threshold_low (float): Distance limit to consider a position equivalent.
        dist_threshold_high (float): Distance limit to consider a position not relevant.

    Returns:
        dict: Dictionary with the final answer.
    '''
    # Pipeline: structured extraction -> date parsing -> embeddings -> distances.
    df = self.extraer_datos_cv()
    df = self.procesar_periodos(df)
    df = self.calcular_embeddings(df)
    df = self.calcular_distancias(df)
    df_puntuaciones, puntuacion = self.calcular_puntuacion(
        df,
        req_experience=req_experience,
        positions_cap=positions_cap,
        dist_threshold_low=dist_threshold_low,
        dist_threshold_high=dist_threshold_high,
    )
    dict_experiencia = self.filtra_experiencia_relevante(df_puntuaciones)
    return self.llamada_final(req_experience, puntuacion, dict_experiencia)
|