Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- app.py +79 -0
- comparison_utils.py +104 -0
- job_description_extractor.py +105 -0
- model_trainer.py +37 -0
- requirements.txt +8 -0
- resume_extractor.py +147 -0
- synthetic_data.py +29 -0
app.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, HTTPException, UploadFile, File
|
2 |
+
import pandas as pd
|
3 |
+
from resume_extractor import ResumeExtractor
|
4 |
+
from job_description_extractor import JobDescriptionExtractor
|
5 |
+
from model_trainer import ModelTrainer
|
6 |
+
from comparison_utils import (
|
7 |
+
compare_with_chatgpt_job_title,
|
8 |
+
compare_with_chatgpt_education,
|
9 |
+
compare_with_chatgpt_location,
|
10 |
+
compare_age_range_with_description
|
11 |
+
)
|
12 |
+
from synthetic_data import create_synthetic_data
|
13 |
+
|
14 |
+
app = FastAPI()
|
15 |
+
|
16 |
+
def main(resume_text, job_description):
    """Score how well a resume matches a job description.

    The function extracts structured fields from both texts, asks the
    ChatGPT comparison helpers whether education, job title, location and
    age match, generates synthetic training data whose columns are the job's
    features, trains the candidate models, and predicts a score for the
    resume's feature vector.

    Args:
        resume_text: Resume contents as text.
        job_description: Job description contents as text.

    Returns:
        A dict with the extracted resume fields, the extracted job fields
        and ``predicted_target`` (the first prediction of the best model).

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.

    Side effects:
        Writes ``synthetic_data.csv`` and ``input_df.csv`` to the current
        working directory on every call.
    """
    # Local import keeps this block self-contained.
    import os

    # SECURITY: the API key used to be hard-coded here. A secret committed to
    # source must be treated as compromised and rotated; it is now supplied
    # through the environment instead.
    openai_api_key = os.environ.get("OPENAI_API_KEY")
    if not openai_api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")

    ner_model_name_or_path = "NLPclass/Named-entity-recognition"
    skill_model_name_or_path = "GalalEwida/lm-ner-skills-recognition"

    resume_extractor = ResumeExtractor(ner_model_name_or_path, openai_api_key)
    job_description_extractor = JobDescriptionExtractor(openai_api_key)

    full_name, loc, age, skills, education_resume, title_job_resume = (
        resume_extractor.extract_resume_info(resume_text, skill_model_name_or_path)
    )
    job_skills, education_job, title_job, location, age_DS = (
        job_description_extractor.extract_job_info(job_description, skill_model_name_or_path)
    )

    # Each comparison yields 1 (match) or 0 (no match).
    education_match = compare_with_chatgpt_education(education_resume, education_job, openai_api_key)
    title_job_match = compare_with_chatgpt_job_title(title_job_resume, title_job, openai_api_key)
    title_loc_match = compare_with_chatgpt_location(loc, location, openai_api_key)
    title_age_match = compare_age_range_with_description(age, age_DS, openai_api_key)

    synthetic_data = create_synthetic_data(job_skills, education_job, title_job, location, age_DS)
    synthetic_data.to_csv('synthetic_data.csv')
    model_trainer = ModelTrainer(synthetic_data)
    best_model = model_trainer.train_models()

    # Build the feature row in the same order the synthetic columns were
    # created: skills first, then education, title, location and age range.
    input_data = {skill: 1 if skill in skills else 0 for skill in job_skills}
    input_data[education_job] = education_match
    input_data[title_job] = title_job_match
    input_data[location] = title_loc_match
    input_data[age_DS] = title_age_match

    input_df = pd.DataFrame([input_data])
    input_df.to_csv('input_df.csv')
    predicted_target = best_model.predict(input_df)

    return {
        "full_name": full_name,
        "location": loc,
        "age": age,
        "age_DS": age_DS,
        "skills": skills,
        "education_resume": education_resume,
        "title_job_resume": title_job_resume,
        "job_skills": job_skills,
        "education_job": education_job,
        "title_job": title_job,
        "location_job": location,
        # Plain float so the value serializes as JSON without NumPy types.
        "predicted_target": float(predicted_target[0]),
    }
|
61 |
+
|
62 |
+
@app.post("/extract")
async def extract(resume_file: UploadFile = File(...), job_description_file: UploadFile = File(...)):
    """Handle ``POST /extract``: score an uploaded resume against a job description.

    Both uploads are read fully and decoded as UTF-8 before being passed to
    ``main``.

    Returns:
        The dict produced by ``main``.

    Raises:
        HTTPException: 400 when an upload is not valid UTF-8 text, 500 for
            any other failure (the detail carries the error message).
    """
    try:
        resume_bytes = await resume_file.read()
        job_description_bytes = await job_description_file.read()

        # A file that is not UTF-8 text is a client error, not a server fault.
        try:
            resume_text = resume_bytes.decode('utf-8')
            job_description = job_description_bytes.decode('utf-8')
        except UnicodeDecodeError as e:
            raise HTTPException(
                status_code=400,
                detail=f"Uploaded files must be UTF-8 encoded text: {e}",
            ) from e

        return main(resume_text, job_description)
    except HTTPException:
        # Keep deliberate HTTP errors intact instead of re-wrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
76 |
+
|
77 |
+
if __name__ == "__main__":
    # Only start a server when this file is executed directly.
    from uvicorn import run as serve

    # Bind to every interface on port 8000.
    serve(app, host="0.0.0.0", port=8000)
|
comparison_utils.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
import re
|
3 |
+
def compare_with_chatgpt_job_title(text1, text2, openai_api_key):
    """Ask gpt-3.5-turbo whether two texts match in job title.

    Args:
        text1: First text (e.g. the job title taken from a resume).
        text2: Second text (e.g. the job title taken from a job description).
        openai_api_key: API key; assigned to the module-wide ``openai.api_key``.

    Returns:
        1 if the model reports a match, 0 otherwise.

    Raises:
        ValueError: If the reply contains no standalone ``0`` or ``1``.
    """
    openai.api_key = openai_api_key
    prompt = f"Compare the following two texts and determine if they match in job title . Return 1 for match and 0 for no match.\n\nText 1: {text1}\n\nText 2: {text2}"

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant that helps compare texts for matching job titles "},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100
    )

    # Extract the response content
    result = response.choices[0].message['content'].strip()

    # Take the first standalone 0/1 token. A plain substring test would read
    # replies such as "0 - Text 1 differs" or "10" as a match.
    verdict = re.search(r"\b[01]\b", result)
    if verdict is None:
        raise ValueError(f"Unexpected response: {result}")
    return int(verdict.group())
|
26 |
+
|
27 |
+
|
28 |
+
|
29 |
+
|
30 |
+
def compare_with_chatgpt_education(text1, text2, openai_api_key):
    """Ask gpt-3.5-turbo whether two texts match in education.

    Args:
        text1: First text (e.g. the education taken from a resume).
        text2: Second text (e.g. the education required by a job description).
        openai_api_key: API key; assigned to the module-wide ``openai.api_key``.

    Returns:
        1 if the model reports a match, 0 otherwise.

    Raises:
        ValueError: If the reply contains no standalone ``0`` or ``1``.
    """
    openai.api_key = openai_api_key
    prompt = f"Compare the following two texts and determine if they match in education . Return 1 for match and 0 for no match.\n\nText 1: {text1}\n\nText 2: {text2}"

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant that helps compare texts for matching education "},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100
    )

    # Extract the response content
    result = response.choices[0].message['content'].strip()

    # Take the first standalone 0/1 token. A plain substring test would read
    # replies such as "0 - Text 1 differs" or "10" as a match.
    verdict = re.search(r"\b[01]\b", result)
    if verdict is None:
        raise ValueError(f"Unexpected response: {result}")
    return int(verdict.group())
|
53 |
+
|
54 |
+
|
55 |
+
def compare_with_chatgpt_location(text1, text2, openai_api_key):
    """Ask gpt-3.5-turbo whether two texts match in location.

    Args:
        text1: First text (e.g. the location taken from a resume).
        text2: Second text (e.g. the location taken from a job description).
        openai_api_key: API key; assigned to the module-wide ``openai.api_key``.

    Returns:
        1 if the model reports a match, 0 otherwise.

    Raises:
        ValueError: If the reply contains no standalone ``0`` or ``1``.
    """
    openai.api_key = openai_api_key
    prompt = f"Compare the following two texts and determine if they match in location . Return 1 for match and 0 for no match.\n\nText 1: {text1}\n\nText 2: {text2}"

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant that helps compare texts for matching location "},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100
    )

    # Extract the response content
    result = response.choices[0].message['content'].strip()

    # Take the first standalone 0/1 token. A plain substring test would read
    # replies such as "0 - Text 1 differs" or "10" as a match.
    verdict = re.search(r"\b[01]\b", result)
    if verdict is None:
        raise ValueError(f"Unexpected response: {result}")
    return int(verdict.group())
|
78 |
+
|
79 |
+
|
80 |
+
def compare_age_range_with_description(age, age_DS, openai_api_key):
    """Ask gpt-3.5-turbo whether an age lies inside a described age range.

    Args:
        age: The candidate's age.
        age_DS: The age range text taken from the job description.
        openai_api_key: API key; assigned to the module-wide ``openai.api_key``.

    Returns:
        The first standalone ``0`` or ``1`` found in the reply, as an int.

    Raises:
        ValueError: If the reply contains no standalone ``0`` or ``1``.
    """
    openai.api_key = openai_api_key

    prompt_parts = [
        f"Check if the age {age} falls within the age range '{age_DS}' ",
        f"Return '1' if it falls within the range, otherwise return '0'.\n\n",
        f"Age: {age}\n\n",
        f"Age Range: {age_DS}",
    ]
    prompt = "".join(prompt_parts)

    conversation = [
        {"role": "system", "content": "You are an assistant that helps compare age ranges with a given age."},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=100,
    )

    reply = response.choices[0].message['content'].strip()

    # Use a regex to find a standalone '1' or '0' in the reply.
    found = re.search(r"\b[01]\b", reply)
    if not found:
        raise ValueError(f"Unexpected response: {reply}")
    return int(found.group())
|
job_description_extractor.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
import openai
|
3 |
+
import re
|
4 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
5 |
+
import torch
|
6 |
+
class JobDescriptionExtractor:
    """Extract skills, education, title, location and age range from a job description.

    Skills come from a Hugging Face token-classification model; every other
    field is obtained by prompting gpt-3.5-turbo through the ``openai`` module.
    """

    # System prompt shared by all field-extraction requests.
    _EXTRACTION_SYSTEM_PROMPT = "You are a helpful assistant that extracts information from text."

    def __init__(self, openai_api_key):
        """Store the API key on the module-wide ``openai.api_key``."""
        openai.api_key = openai_api_key

    def _ask(self, system_prompt, user_prompt, max_tokens=1000):
        """Send one system+user exchange to gpt-3.5-turbo and return the stripped reply."""
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=max_tokens
        )
        return response.choices[0].message["content"].strip()

    def extract_skills(self, text, skill_model_name_or_path):
        """Return the unique skills tagged in ``text`` by a token-classification model.

        Args:
            text: Text to scan.
            skill_model_name_or_path: Model id or path passed to ``from_pretrained``.

        Returns:
            A list of unique skill strings (order not guaranteed, because the
            result goes through ``set``).
        """
        # The tokenizer and model are loaded on every call.
        skill_tokenizer = AutoTokenizer.from_pretrained(skill_model_name_or_path)
        skill_model = AutoModelForTokenClassification.from_pretrained(skill_model_name_or_path)
        inputs = skill_tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = skill_model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2)
        tokens = skill_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        tags = [skill_model.config.id2label[p.item()] for p in predictions[0]]

        skills = []
        temp_skill = ""
        for token, tag in zip(tokens, tags):
            if tag == "B-TECHNOLOGY":
                # A technology is a single token: flush any pending skill first.
                if temp_skill:
                    skills.append(temp_skill.strip())
                    temp_skill = ""
                skills.append(token)
            elif tag == "B-TECHNICAL":
                # Start of a new technical skill: flush the previous one.
                if temp_skill:
                    skills.append(temp_skill.strip())
                temp_skill = token
            elif tag == "I-TECHNICAL":
                # NOTE(review): continuation tokens are appended with no
                # separator, so two whole-word tokens are glued together.
                # Kept as is because ResumeExtractor builds skills the same
                # way and the two outputs are compared by exact string.
                temp_skill += token.replace('##', '')
        if temp_skill:
            skills.append(temp_skill.strip())
        return list(set(skills))

    def translate_text(self, text, target_language="en"):
        """Translate ``text`` into ``target_language`` using gpt-3.5-turbo."""
        return self._ask(
            "You are a helpful assistant that translates text.",
            f"Translate the following text to {target_language}:\n\n{text}",
        )

    def extract_location(self, job_description):
        """Return the model's reply when asked for the location in the job description."""
        return self._ask(
            self._EXTRACTION_SYSTEM_PROMPT,
            f"Extract location from the following job description:\n\n{job_description}",
        )

    def title(self, text):
        """Return the model's reply when asked for the job title in ``text``."""
        return self._ask(
            self._EXTRACTION_SYSTEM_PROMPT,
            f"Extract the [Last Job Title] from the following text:\n\n{text}",
        )

    def extract_education(self, text):
        """Return the model's reply when asked for the highest education degree in ``text``."""
        return self._ask(
            self._EXTRACTION_SYSTEM_PROMPT,
            f"Extract the [Highest Education Degree] from the following text:\n\n{text}",
        )

    def extract_age_range(self, text):
        """Return the model's reply when asked for the age range in ``text``."""
        return self._ask(
            self._EXTRACTION_SYSTEM_PROMPT,
            f"Extract the age range from the following text:\n\n{text}",
        )

    def extract_job_info(self, job_description, skill_model_name_or_path):
        """Extract all job fields from a job description.

        The description is first translated to English; every extractor then
        works on the translation.

        Returns:
            ``(job_skills, education_job, title_job, location, age_DS)``.
        """
        translated_job_description = self.translate_text(job_description)
        job_skills = self.extract_skills(translated_job_description, skill_model_name_or_path)
        education_job = self.extract_education(translated_job_description)
        title_job = self.title(translated_job_description)
        location = self.extract_location(translated_job_description)
        age_DS = self.extract_age_range(translated_job_description)
        return job_skills, education_job, title_job, location, age_DS
|
model_trainer.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.model_selection import train_test_split
|
2 |
+
from sklearn.svm import SVR
|
3 |
+
from sklearn.ensemble import RandomForestRegressor
|
4 |
+
from sklearn.linear_model import LinearRegression, Lasso
|
5 |
+
from sklearn.metrics import r2_score
|
6 |
+
|
7 |
+
class ModelTrainer:
    """Train several regressors on a dataframe and keep the best one."""

    def __init__(self, dataframe):
        """Store the training dataframe; it must contain a ``TARGET`` column."""
        self.dataframe = dataframe

    def train_models(self):
        """Fit every candidate model and return the one with the highest R2 score.

        The data is split 80/20 (``random_state=42``); each model is fitted
        on the training part and scored on the held-out part. Scores and the
        winner are printed.

        Returns:
            The fitted estimator with the best held-out R2 score.
        """
        target_column = 'TARGET'
        # Select features by name rather than by position, so TARGET can
        # never leak into the inputs when it is not the last column.
        features = [column for column in self.dataframe.columns if column != target_column]
        X = self.dataframe[features]
        y = self.dataframe[target_column]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        models = {
            "SVR": SVR(),
            "RandomForest": RandomForestRegressor(),
            "LinearRegression": LinearRegression(),
            "Lasso": Lasso()
        }

        best_model = None
        best_score = float('-inf')

        for name, model in models.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = r2_score(y_test, y_pred)
            print(f"{name} R2 Score: {score}")
            if score > best_score:
                best_score = score
                best_model = model

        print(f"Best Model: {best_model}")
        return best_model
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Web service
fastapi
uvicorn[standard]
# Needed by FastAPI to parse the multipart file uploads of POST /extract.
python-multipart

# Data handling and model training
pandas
numpy
scikit-learn

# Extraction models
# The code calls openai.ChatCompletion, which only exists in the pre-1.0 client.
openai<1.0
transformers
torch

# resume_extractor, job_description_extractor, model_trainer, comparison_utils
# and synthetic_data are local modules shipped next to app.py, not PyPI
# packages, so they must not be listed here.
|
resume_extractor.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import openai
|
3 |
+
import re
|
4 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
5 |
+
import torch
|
6 |
+
class ResumeExtractor:
    """Extract name, location, age, skills, education and last job title from a resume.

    Name and location come from a Hugging Face NER pipeline, age from regex
    patterns, skills from a token-classification model, and the remaining
    fields from gpt-3.5-turbo through the ``openai`` module.
    """

    # System prompt shared by all field-extraction requests.
    _EXTRACTION_SYSTEM_PROMPT = "You are a helpful assistant that extracts information from text."

    def __init__(self, ner_model_name_or_path, openai_api_key):
        """Load the NER tokenizer/model, build the pipeline and set ``openai.api_key``."""
        self.ner_model_name_or_path = ner_model_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(ner_model_name_or_path)
        self.model = AutoModelForTokenClassification.from_pretrained(ner_model_name_or_path)
        self.nlp = pipeline("ner", model=self.model, tokenizer=self.tokenizer)
        openai.api_key = openai_api_key

    def calculate_age(self, date_string, current_year=1403):
        """Compute an age from a birth date or birth year string.

        Args:
            date_string: Either ``Y/M/D`` (year of 1-4 digits) or a string
                starting with a four-digit year of the form ``13xx``.
            current_year: Year the age is measured against. Defaults to 1403,
                the value that used to be hard-coded.
                NOTE(review): the ``13xx`` pattern suggests Solar Hijri
                years; confirm, and pass the real current year from callers.

        Returns:
            ``current_year`` minus the birth year, or ``None`` when
            ``date_string`` matches neither pattern.
        """
        ymd_match = re.match(r'(\d{1,4})/(\d{1,2})/(\d{1,2})', date_string)
        if ymd_match:
            year_text = ymd_match.group(1)
            year = int(year_text)
            # Years written with fewer than four digits are offsets from 1300.
            if len(year_text) != 4:
                year += 1300
            return current_year - year

        four_digit_match = re.match(r'(13\d{2})', date_string)
        if four_digit_match:
            return current_year - int(four_digit_match.group(1))
        return None

    def _ask(self, system_prompt, user_prompt, max_tokens=1000):
        """Send one system+user exchange to gpt-3.5-turbo and return the stripped reply."""
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=max_tokens
        )
        return response.choices[0].message["content"].strip()

    def translate_text(self, text, target_language="en"):
        """Translate ``text`` into ``target_language`` using gpt-3.5-turbo."""
        return self._ask(
            "You are a helpful assistant that translates text.",
            f"Translate the following text to {target_language}:\n\n{text}",
        )

    def extract_ner_info(self, text):
        """Extract the full name, location and age from the raw resume text.

        Returns:
            ``(full_name, loc, age)``. ``full_name`` and ``loc`` are ``''``
            when nothing is found; ``age`` is ``None`` when no pattern matches.
        """
        ner_results = self.nlp(text)
        full_name = ''
        loc = ''
        age = None

        # Name: take a confident 'B-pers' token and extend it with the
        # 'I-pers' tokens that directly follow, as long as each one is
        # confident (>= 0.80) and scores at least 90% of the previous token.
        i = 0
        while i < len(ner_results):
            if ner_results[i]['entity'] == 'B-pers' and ner_results[i]['score'] >= 0.80:
                if full_name:
                    full_name += ' '
                full_name += ner_results[i]['word']
                current_score = ner_results[i]['score']
                stop_adding = False
                for j in range(i + 1, len(ner_results)):
                    if ner_results[j]['entity'] == 'I-pers' and ner_results[j]['score'] >= 0.80:
                        if ner_results[j]['score'] >= current_score * 0.90:
                            full_name += ner_results[j]['word'].replace('##', '')
                            current_score = ner_results[j]['score']
                            i = j
                        else:
                            stop_adding = True
                            break
                    else:
                        stop_adding = True
                        break
                if stop_adding:
                    break
            i += 1

        # Location: every location token, joined with single spaces.
        for entity in ner_results:
            if entity['entity'] in ['B-loc', 'I-loc']:
                if loc:
                    loc += ' '
                loc += entity['word']

        # Age: prefer an explicit "سن: <number>" field, then a Y/M/D date,
        # then a bare 13xx year.
        age_match = re.search(r'سن\s*:\s*(\d+)', text)
        if age_match:
            age = int(age_match.group(1))
        else:
            date_match = re.search(r'(\d{1,4}/\d{1,2}/\d{1,2})', text)
            if date_match:
                age = self.calculate_age(date_match.group(1))
            else:
                four_digit_match = re.search(r'(13\d{2})', text)
                if four_digit_match:
                    age = self.calculate_age(four_digit_match.group(1))
        return full_name, loc, age

    def extract_skills(self, text, skill_model_name_or_path):
        """Return the unique skills tagged in ``text`` by a token-classification model.

        Args:
            text: Text to scan.
            skill_model_name_or_path: Model id or path passed to ``from_pretrained``.

        Returns:
            A list of unique skill strings (order not guaranteed, because the
            result goes through ``set``).
        """
        # The tokenizer and model are loaded on every call.
        skill_tokenizer = AutoTokenizer.from_pretrained(skill_model_name_or_path)
        skill_model = AutoModelForTokenClassification.from_pretrained(skill_model_name_or_path)
        inputs = skill_tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = skill_model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2)
        tokens = skill_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        tags = [skill_model.config.id2label[p.item()] for p in predictions[0]]

        skills = []
        temp_skill = ""
        for token, tag in zip(tokens, tags):
            if tag == "B-TECHNOLOGY":
                # A technology is a single token: flush any pending skill first.
                if temp_skill:
                    skills.append(temp_skill.strip())
                    temp_skill = ""
                skills.append(token)
            elif tag == "B-TECHNICAL":
                # Start of a new technical skill: flush the previous one.
                if temp_skill:
                    skills.append(temp_skill.strip())
                temp_skill = token
            elif tag == "I-TECHNICAL":
                # NOTE(review): continuation tokens are appended with no
                # separator, so two whole-word tokens are glued together.
                # Kept as is because JobDescriptionExtractor builds skills
                # the same way and the outputs are compared by exact string.
                temp_skill += token.replace('##', '')
        if temp_skill:
            skills.append(temp_skill.strip())
        return list(set(skills))

    def extract_education_resume(self, text):
        """Return the model's reply when asked for the highest degree and field in ``text``."""
        return self._ask(
            self._EXTRACTION_SYSTEM_PROMPT,
            f"Extract only the highest education degree and field from the following text:\n\n{text}\n\nFormat the response as 'Degree in Field' and nothing else.",
        )

    def extract_job_resume(self, text):
        """Return the model's reply when asked for the last job title in ``text``."""
        return self._ask(
            self._EXTRACTION_SYSTEM_PROMPT,
            f"Extract only the last job title from the following text:\n\n{text}\n\nProvide just the job title and nothing else.",
        )

    def extract_resume_info(self, resume_text, skill_model_name_or_path):
        """Extract all resume fields.

        Name, location and age are read from the original text; skills,
        education and job title are read from its English translation.

        Returns:
            ``(full_name, loc, age, skills, education_resume, title_job_resume)``.
        """
        full_name, loc, age = self.extract_ner_info(resume_text)
        translated_resume = self.translate_text(resume_text)
        skills = self.extract_skills(translated_resume, skill_model_name_or_path)
        education_resume = self.extract_education_resume(translated_resume)
        title_job_resume = self.extract_job_resume(translated_resume)
        return full_name, loc, age, skills, education_resume, title_job_resume
|
synthetic_data.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
def create_synthetic_data(job_skills, education, title_job, location, age_DS, num_rows=2000):
    """Build a random binary feature table with a 0-100 ``TARGET`` column.

    Each argument is either one feature label (a string) or a sequence of
    labels. All labels together become the columns; every cell is a random
    0 or 1. ``TARGET`` is the row's feature sum, min-max scaled to 0-100
    over the generated rows.

    Args:
        job_skills: Skill label(s).
        education: Education label(s).
        title_job: Job title label(s).
        location: Location label(s).
        age_DS: Age range label(s).
        num_rows: Number of rows to generate.

    Returns:
        A DataFrame with one column per unique label followed by ``TARGET``.
    """
    features = []
    for group in (job_skills, education, title_job, location, age_DS):
        features.extend([group] if isinstance(group, str) else list(group))
    # Two fields can carry the same text (for instance the same placeholder
    # reply for location and age range). Duplicate labels would produce
    # duplicate columns, so keep the first occurrence of each label.
    features = list(dict.fromkeys(features))

    data = np.random.randint(2, size=(num_rows, len(features)))
    df = pd.DataFrame(data, columns=features)

    # Sum the features BEFORE adding TARGET, so the checks below look at
    # feature values only. Summing the frame after TARGET existed let
    # unrelated rows satisfy "sum == len(features)" and be overwritten to 100.
    feature_sums = df.sum(axis=1)
    min_target = feature_sums.min()
    max_target = feature_sums.max()
    span = max_target - min_target

    if span > 0:
        target = (feature_sums - min_target) * (100 / span)
    elif features:
        # All rows have the same sum: fall back to the share of active features.
        target = feature_sums * (100 / len(features))
    else:
        target = feature_sums * 0.0
    target = target.astype(float)

    # Pin the extremes: no feature set means 0, every feature set means 100.
    target.loc[feature_sums == 0] = 0.0
    if features:
        target.loc[feature_sums == len(features)] = 100.0

    df['TARGET'] = target
    return df
|