File size: 6,711 Bytes
839ab56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147

import openai
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch
class ResumeExtractor:
    """Pull a name, location, age, skills, education and job title out of a resume.

    Name and location come from a Hugging Face token-classification ("ner")
    pipeline, age comes from regular expressions over the raw text, skills
    come from a second token-classification model, and translation, education
    and job title come from OpenAI chat completions.
    """

    # Year treated as "now" when turning a birth year into an age.
    # NOTE(review): 1403 together with the ``13xx`` year patterns suggests the
    # Solar Hijri calendar -- confirm. This is a fixed snapshot; pass
    # ``current_year`` to ``calculate_age`` or override this attribute to
    # keep computed ages up to date.
    CURRENT_YEAR = 1403

    # Settings shared by every chat-completion request.
    CHAT_MODEL = "gpt-3.5-turbo"
    MAX_COMPLETION_TOKENS = 1000

    # A person token is accepted only at or above this confidence ...
    MIN_ENTITY_SCORE = 0.80
    # ... and a continuation token only if its score is at least this
    # fraction of the previously accepted token's score.
    SCORE_DROP_RATIO = 0.90

    def __init__(self, ner_model_name_or_path, openai_api_key):
        """Load the NER model and register the OpenAI API key.

        Args:
            ner_model_name_or_path: Name or path handed to ``from_pretrained``
                for both the tokenizer and the token-classification model.
            openai_api_key: Key stored on the ``openai`` module (module-wide).
        """
        self.ner_model_name_or_path = ner_model_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(ner_model_name_or_path)
        self.model = AutoModelForTokenClassification.from_pretrained(ner_model_name_or_path)
        self.nlp = pipeline("ner", model=self.model, tokenizer=self.tokenizer)
        # Skill models already loaded by extract_skills, keyed by name/path.
        self._skill_models = {}
        openai.api_key = openai_api_key

    def calculate_age(self, date_string, current_year=None):
        """Return the age implied by the year at the start of ``date_string``.

        Two formats are recognised, both anchored at the start of the string:
        ``Y/M/D`` (a year shorter than four digits has 1300 added to it) and
        a bare four-digit year beginning with ``13``.

        Args:
            date_string: Text beginning with one of the formats above.
            current_year: Year to subtract the birth year from; defaults to
                ``CURRENT_YEAR``.

        Returns:
            The age in whole years, or ``None`` when no year is recognised or
            the year lies after ``current_year`` (which would otherwise give
            a negative age).
        """
        if current_year is None:
            current_year = self.CURRENT_YEAR

        ymd_match = re.match(r'(\d{1,4})/(\d{1,2})/(\d{1,2})', date_string)
        if ymd_match:
            year_text = ymd_match.group(1)
            birth_year = int(year_text)
            if len(year_text) != 4:
                # Abbreviated year such as "75" -> 1375.
                birth_year += 1300
        else:
            four_digit_match = re.match(r'(13\d{2})', date_string)
            if not four_digit_match:
                return None
            birth_year = int(four_digit_match.group(1))

        age = current_year - birth_year
        return age if age >= 0 else None

    def _chat(self, system_prompt, user_prompt):
        """Send one system + user message pair and return the stripped reply text."""
        response = openai.ChatCompletion.create(
            model=self.CHAT_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=self.MAX_COMPLETION_TOKENS,
        )
        return response.choices[0].message["content"].strip()

    def translate_text(self, text, target_language="en"):
        """Ask the chat model to translate ``text`` into ``target_language``.

        Returns:
            The model's reply with surrounding whitespace removed.
        """
        return self._chat(
            "You are a helpful assistant that translates text.",
            f"Translate the following text to {target_language}:\n\n{text}",
        )

    @staticmethod
    def _join_wordpieces(tokens):
        """Join tokens with spaces, gluing ``##`` continuation pieces to the previous word."""
        words = []
        for token in tokens:
            if token.startswith('##'):
                piece = token[2:]
                if words:
                    words[-1] += piece
                else:
                    words.append(piece)
            else:
                words.append(token)
        return ' '.join(words)

    def _extract_full_name(self, ner_results):
        """Return the first confident person name in ``ner_results``, or ``''``.

        The name starts at the first ``B-pers`` token scoring at least
        ``MIN_ENTITY_SCORE`` and extends over the immediately following
        ``I-pers`` tokens for as long as each one also clears that threshold
        and does not fall below ``SCORE_DROP_RATIO`` times the score of the
        token accepted before it.
        """
        for start, entity in enumerate(ner_results):
            if entity['entity'] != 'B-pers' or entity['score'] < self.MIN_ENTITY_SCORE:
                continue
            pieces = [entity['word']]
            previous_score = entity['score']
            for follower in ner_results[start + 1:]:
                score = follower['score']
                if (
                    follower['entity'] != 'I-pers'
                    or score < self.MIN_ENTITY_SCORE
                    or score < previous_score * self.SCORE_DROP_RATIO
                ):
                    break
                pieces.append(follower['word'])
                previous_score = score
            # Only the first qualifying name is reported.
            return self._join_wordpieces(pieces)
        return ''

    def _extract_age(self, text):
        """Return an age from an explicit label, a Y/M/D date, or a 13xx year, else ``None``."""
        age_match = re.search(r'سن\s*:\s*(\d+)', text)
        if age_match:
            return int(age_match.group(1))
        date_match = re.search(r'(\d{1,4}/\d{1,2}/\d{1,2})', text)
        if date_match:
            return self.calculate_age(date_match.group(1))
        four_digit_match = re.search(r'(13\d{2})', text)
        if four_digit_match:
            return self.calculate_age(four_digit_match.group(1))
        return None

    def extract_ner_info(self, text):
        """Extract a person name, a location string and an age from ``text``.

        Args:
            text: Raw resume text passed to the NER pipeline and the age
                regular expressions.

        Returns:
            A ``(full_name, loc, age)`` tuple. ``full_name`` and ``loc`` are
            ``''`` when nothing is found; ``loc`` is every ``B-loc``/``I-loc``
            token joined into one string; ``age`` is an ``int`` or ``None``.
        """
        ner_results = self.nlp(text)
        full_name = self._extract_full_name(ner_results)
        loc = self._join_wordpieces(
            [entity['word'] for entity in ner_results
             if entity['entity'] in ('B-loc', 'I-loc')]
        )
        age = self._extract_age(text)
        return full_name, loc, age

    def _load_skill_model(self, skill_model_name_or_path):
        """Return ``(tokenizer, model)`` for the skill model, loading it only once."""
        # setdefault on __dict__ keeps this working on instances created
        # without running __init__.
        cache = self.__dict__.setdefault('_skill_models', {})
        if skill_model_name_or_path not in cache:
            cache[skill_model_name_or_path] = (
                AutoTokenizer.from_pretrained(skill_model_name_or_path),
                AutoModelForTokenClassification.from_pretrained(skill_model_name_or_path),
            )
        return cache[skill_model_name_or_path]

    @classmethod
    def _collect_skills(cls, tokens, tags):
        """Turn parallel token/tag sequences into a de-duplicated list of skills.

        ``B-TECHNOLOGY`` tokens are emitted on their own. A ``B-TECHNICAL``
        token opens a span that following ``I-TECHNICAL`` tokens extend. Any
        other tag is ignored and does not close an open span.
        """
        skills = []
        span = []
        for token, tag in zip(tokens, tags):
            if tag in ("B-TECHNOLOGY", "B-TECHNICAL"):
                if span:
                    skills.append(cls._join_wordpieces(span).strip())
                    span = []
                if tag == "B-TECHNOLOGY":
                    skills.append(token)
                else:
                    span.append(token)
            elif tag == "I-TECHNICAL":
                span.append(token)
        if span:
            skills.append(cls._join_wordpieces(span).strip())
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the result is deterministic (a set would not be).
        return list(dict.fromkeys(skills))

    def extract_skills(self, text, skill_model_name_or_path):
        """Tag ``text`` with the skill model and return the skills it finds.

        Args:
            text: Text to tag; ``extract_resume_info`` passes the translated
                resume.
            skill_model_name_or_path: Name or path of a token-classification
                model whose labels include ``B-TECHNOLOGY``, ``B-TECHNICAL``
                and ``I-TECHNICAL``.

        Returns:
            A list of unique skill strings in order of first appearance.
        """
        skill_tokenizer, skill_model = self._load_skill_model(skill_model_name_or_path)
        # Truncate so a long resume cannot exceed the model's maximum length.
        inputs = skill_tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            logits = skill_model(**inputs).logits
        predictions = torch.argmax(logits, dim=2)
        tokens = skill_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        tags = [skill_model.config.id2label[p.item()] for p in predictions[0]]
        return self._collect_skills(tokens, tags)

    def extract_education_resume(self, text):
        """Ask the chat model for the highest degree and field found in ``text``.

        Returns:
            The model's reply, stripped; the prompt requests the form
            ``'Degree in Field'``.
        """
        return self._chat(
            "You are a helpful assistant that extracts information from text.",
            f"Extract only the highest education degree and field from the following text:\n\n{text}\n\nFormat the response as 'Degree in Field' and nothing else.",
        )

    def extract_job_resume(self, text):
        """Ask the chat model for the last job title found in ``text``.

        Returns:
            The model's reply with surrounding whitespace removed.
        """
        return self._chat(
            "You are a helpful assistant that extracts information from text.",
            f"Extract only the last job title from the following text:\n\n{text}\n\nProvide just the job title and nothing else.",
        )

    def extract_resume_info(self, resume_text, skill_model_name_or_path):
        """Run every extractor over one resume.

        Name, location and age are read from the original text; skills,
        education and job title are read from its translation.

        Returns:
            ``(full_name, loc, age, skills, education_resume, title_job_resume)``.
        """
        # Extract the overall information from the resume.
        full_name, loc, age = self.extract_ner_info(resume_text)
        translated_resume = self.translate_text(resume_text)
        skills = self.extract_skills(translated_resume, skill_model_name_or_path)
        education_resume = self.extract_education_resume(translated_resume)
        title_job_resume = self.extract_job_resume(translated_resume)
        return full_name, loc, age, skills, education_resume, title_job_resume