khrek committed on
Commit
1ab9160
·
1 Parent(s): 77cdd98

Upload 2 files

Browse files
Files changed (2) hide show
  1. resume_parser.py +98 -64
  2. segmenter.py +34 -27
resume_parser.py CHANGED
@@ -1,10 +1,13 @@
1
  from itertools import chain
2
  from models import Models
3
- #from models.prototype.models import Models
4
  #from output_model import OutputModel, WorkExperience
5
  from segmenter import ResumeSegmenter
6
  from flashtext import KeywordProcessor
7
  from collections import defaultdict
 
 
 
 
8
  class ResumeParser():
9
  def __init__(self) -> None:
10
  self.resumeSegmenter = ResumeSegmenter()
@@ -28,99 +31,130 @@ class ResumeParser():
28
  dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
29
  dates_indexes = list(chain.from_iterable(dates_indexes))
30
  dates_indexes = [i + start_index for i in dates_indexes]
31
- #this list should be unique and ordered
32
  dates_indexes = sorted([start_index+1] + dates_indexes + [end_index])
33
  dates_indexes = set(dates_indexes)
34
- dates_indexes = list(dates_indexes)
35
-
36
- list_single_work_exp = []
37
- for i in range(len(dates_indexes)-1):
38
- index = dates_indexes[i]
39
- next_index = dates_indexes[i+1]
40
- section = resume_lines[index:next_index]
41
  if len(section) == 0:
42
- continue
43
- list_single_work_exp.append(section)
44
- return list_single_work_exp
 
45
 
46
  def extract_section_text(self, resume_lines, section_header = "work_and_employment"):
47
- text_segments, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
 
 
 
 
 
48
  start_index = sections[section_header][0]
49
  end_index = sections[section_header][1]
50
  #on the bases dates would be unique
51
  return start_index, end_index
52
 
53
  #more of a utils function
54
- def sort_tokens_table(tokens_data):
55
  table = {}
56
  for key, tokens in tokens_data:
57
  for token in tokens:
58
  table[token] = key
59
  return table
60
 
61
- def format_output(self, keywords, work_section_list, isWorkExp=True):
62
- if isWorkExp:
63
- headlines = [text[0] for text in work_section_list]
64
- else:
65
- headlines = work_section_list
66
- table = self.sort_tokens_table(keywords)
67
- tokens_processor = KeywordProcessor()
68
- list_keywords = list(chain.from_iterable([tokens[1] for tokens in keywords]))
69
- tokens_processor.add_keywords_from_list(list_keywords)
70
  data = []
71
- for i, header in enumerate(headlines):
72
- current_data = defaultdict(list)
73
- tokens = tokens_processor.extract_keywords(header)
74
- for token in tokens:
75
- current_data[table[token]].append(token)
76
- if isWorkExp:
77
- current_data["description"] = work_section_list[i][1:]
78
- data.append(dict(current_data))
 
 
 
 
 
79
  return data
 
80
 
81
- def parse_work_history(self, resume_lines):
82
- start_index, end_index = self.extract_section_text(resume_lines)
83
- work_dates = self.models.get_ner(resume_lines[start_index:end_index], "date")
 
 
 
84
  single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
85
- job_positions = self.models.get_ner(resume_lines[start_index:end_index], "job title")
86
- companies = self.models.get_ner(resume_lines[start_index:end_index], "company")
87
- keywords = [("date", work_dates), ("title", job_positions), ("company", companies)]
88
  return self.format_output(keywords, single_work_experiences)
89
 
90
- def parse_education(self, resume_lines):
91
- start_index, end_index = self.extract_section_text(resume_lines, "education_and_training")
92
- tokens = ["degree", "university", "degree field", "date", "location"]
93
 
94
- for token in tokens:
95
- keywords = self.get_ner(resume_lines[start_index+1:end_index], token)
96
- output = self.format_output(keywords, resume_lines[start_index:end_index], False)
 
 
 
 
 
97
  output = [res for res in output if res]
98
 
99
  return output
100
 
101
- def parse_basic_info(self,resume_lines):
102
- start_index, end_index = self.extract_section_text(resume_lines, "basics_info")
103
- #tokens = ["person", "email", "phone"]
104
- tokens = ["person"]
105
- for token in tokens:
106
- keywords = self.models.get_ner(resume_lines[start_index:end_index], token)
 
 
 
107
 
108
  output = {}
109
- for token, result in keywords:
110
- if len(result) > 0:
111
- output[token] = result[0]
 
 
 
 
 
 
 
 
 
 
 
112
  return output
113
 
114
  def parse(self, resume_lines):
115
- jobs = self.parse_work_history(resume_lines)
116
- education = self.parse_education(resume_lines)
117
- basic_info = self.parse_basic_info(resume_lines)
118
-
119
- return {"basic_info":basic_info, "education":education, "work_experience":jobs}
120
-
121
-
122
-
123
-
124
-
125
-
126
-
 
 
 
 
 
 
 
 
 
 
1
  from itertools import chain
2
  from models import Models
 
3
  #from output_model import OutputModel, WorkExperience
4
  from segmenter import ResumeSegmenter
5
  from flashtext import KeywordProcessor
6
  from collections import defaultdict
7
+ import re
8
+ import wordninja
9
+ from utils import percentage_difference
10
+ from nltk import word_tokenize
11
  class ResumeParser():
12
  def __init__(self) -> None:
13
  self.resumeSegmenter = ResumeSegmenter()
 
31
  dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
32
  dates_indexes = list(chain.from_iterable(dates_indexes))
33
  dates_indexes = [i + start_index for i in dates_indexes]
 
34
  dates_indexes = sorted([start_index+1] + dates_indexes + [end_index])
35
  dates_indexes = set(dates_indexes)
36
+ dates_indexes = sorted(list(dates_indexes))
37
+ individual_sections = []
38
+ for i, index in enumerate(dates_indexes):
39
+ section = resume_lines[index:dates_indexes[min(i+1, len(dates_indexes)-1)]]
 
 
 
40
  if len(section) == 0:
41
+ continue
42
+ individual_sections.append(section)
43
+
44
+ return individual_sections
45
 
46
  def extract_section_text(self, resume_lines, section_header = "work_and_employment"):
47
+ _ , sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
48
+ if sections is None:
49
+ return None
50
+ print(sections)
51
+ if section_header not in sections:
52
+ return None
53
  start_index = sections[section_header][0]
54
  end_index = sections[section_header][1]
55
  #on the bases dates would be unique
56
  return start_index, end_index
57
 
58
  #more of a utils function
59
def sort_tokens_table(self, tokens_data):
    """Build a reverse lookup from each token to the key it was listed under.

    Args:
        tokens_data: Iterable of ``(key, tokens)`` pairs, where ``tokens`` is
            an iterable of hashable values.

    Returns:
        A dict mapping every token to its key. When a token appears under
        several keys, the pair that comes last in ``tokens_data`` wins.
    """
    return {token: key for key, tokens in tokens_data for token in tokens}
65
 
66
def format_output(self, keywords, headlines, isWorkExp=True):
    """Build one result dict per section, tagged with the entities it mentions.

    Args:
        keywords: Iterable of ``(label, candidates)`` pairs. A candidate is
            kept for a section when it is a substring of the section text or
            of its wordninja re-segmented form.
        headlines: Iterable of sections, each an iterable of text lines.
        isWorkExp: When true the lines of a section are joined with newlines,
            otherwise with single spaces.

    Returns:
        A list with one dict per section. Each dict holds a ``description``
        entry plus one entry per label that had at least one match; matches
        are de-duplicated and kept in the order they appear in ``candidates``.
    """
    data = []
    for section in headlines:
        paragraph = ('\n' if isWorkExp else ' ').join(section)
        recovered_headlines = ' '.join(wordninja.split(paragraph))
        token_gap = percentage_difference(
            len(word_tokenize(paragraph)),
            len(word_tokenize(recovered_headlines)),
        )
        # Prefer the re-segmented text only when its token count differs
        # from the raw text by more than 50 (as reported by the helper).
        extracted_data = {
            'description': recovered_headlines if token_gap > 50 else paragraph
        }
        for attr in keywords:
            label, candidates = attr[0], attr[1]
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the output no longer depends on set iteration order.
            matches = list(dict.fromkeys(
                s for s in candidates
                if s in paragraph or s in recovered_headlines
            ))
            if matches:
                extracted_data[label] = matches
        data.append(extracted_data)
    return data
82
+
83
 
84
def parse_work_history(self, resume_lines, sections):
    """Parse the work-experience section into one dict per position.

    Args:
        resume_lines: The resume as a list of text lines.
        sections: Mapping that holds a ``(start_index, end_index)`` pair
            under the key ``'work_and_employment'``.

    Returns:
        Whatever ``self.format_output`` returns for the individual work
        experiences and the date/title/company/location candidates.
    """
    start_index, end_index = sections['work_and_employment']
    text = ' '.join(resume_lines[start_index:end_index])
    recovered_text = ' '.join(wordninja.split(text))
    work_dates, companies, locations = self.models.get_ner(text, recovered_text)
    single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
    # ``text`` already is the space-joined section, so it is reused here
    # instead of joining the same slice a second time.
    job_positions = self.models.get_custom_ner(text)['job title']
    keywords = [
        ("date", work_dates),
        ("title", job_positions),
        ("company", companies),
        ("location", locations),
    ]
    return self.format_output(keywords, single_work_experiences)
95
 
96
def parse_education(self, resume_lines, sections):
    """Parse the education section into one dict per entry.

    Args:
        resume_lines: The resume as a list of text lines.
        sections: Mapping that holds a ``(start_index, end_index)`` pair
            under the key ``"education_and_training"``.

    Returns:
        The truthy items of ``self.format_output`` for the individual
        education entries and the date/major/degree/university/location
        candidates.
    """
    start_index, end_index = sections["education_and_training"]
    text = ' '.join(resume_lines[start_index:end_index])
    respaced = ' '.join(wordninja.split(text))

    dates, universities, locations = self.models.get_ner(text, respaced)
    single_education_experiences = self.split_work_exp(resume_lines, start_index, end_index, dates)

    # The custom NER pass starts one line after the section start.
    body_text = ' '.join(resume_lines[start_index+1:end_index])
    entity_dict = self.models.get_custom_ner(body_text)
    degrees = entity_dict['degree']
    majors = entity_dict['major']

    keywords = [
        ("date", dates),
        ("major", majors),
        ("degree", degrees),
        ("university", universities),
        ("location", locations),
    ]
    formatted = self.format_output(keywords, single_education_experiences, False)
    return [res for res in formatted if res]
111
 
112
def parse_basic_info(self,resume_lines, sections):
    """Extract the candidate's name, e-mail address and phone number.

    Args:
        resume_lines: The resume as a list of text lines.
        sections: Mapping that holds a ``(start_index, end_index)`` pair
            under the key ``"basics_info"``.

    Returns:
        A dict with ``email`` and ``phone`` (empty string when nothing
        matched) and, when a suitable entity exists, ``name``.
    """
    start_index, end_index = sections["basics_info"]
    text = ' '.join(resume_lines[start_index:end_index])
    full_text = ' '.join(resume_lines)
    phone_pattern = r'(?:(?:\+?\d{1,2}[-.\s]?)?(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,5}[-.\s]?){1,4}\d{1,6}'
    # The TLD class used to be ``[A-Z|a-z]``, which also accepted a literal
    # ``|``; inside a character class ``|`` is not an alternation operator.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    entites = self.models.ner(text)
    if len(entites) == 0:
        # Nothing recognised in the section itself: retry on the whole resume.
        entites = self.models.ner(full_text)

    output = {}
    score = 0
    for entity in entites:
        # Keep the highest-scoring PER entity that contains a space.
        if entity['entity_group'] == 'PER' and entity['score'] > score and ' ' in entity['word']:
            output['name']= entity['word']
            score = entity['score']

    # re.findall returns a list, so the former ``== ''`` checks were never
    # true and the whole-resume fallback never ran; test for emptiness instead.
    email = re.findall(email_pattern, text) or re.findall(email_pattern, full_text)
    phone = re.findall(phone_pattern, text) or re.findall(phone_pattern, full_text)
    output['email'] = email[0] if email else ''
    output['phone'] = phone[0] if phone else ''
    return output
138
 
139
  def parse(self, resume_lines):
140
+ self.resumeSegmenter.resume_segments = {
141
+ 'objective': [],
142
+ 'work_and_employment': [],
143
+ 'education_and_training': [],
144
+ 'skills': [],
145
+ 'accomplishments': [],
146
+ 'misc': []
147
+ }
148
+ self.resumeSegmenter.resume_indices = []
149
+ sections = self.resumeSegmenter.segment(resume_lines)
150
+ if sections is None:
151
+ return {}
152
+ jobs = self.parse_work_history(resume_lines, sections) if 'work_and_employment' in sections else {}
153
+ education = self.parse_education(resume_lines, sections) if 'education_and_training' in sections else {}
154
+ basic_info = self.parse_basic_info(resume_lines, sections) if 'basics_info' in sections else {}
155
+ result = {"basic_info":basic_info, "education":education, "work_experience":jobs}
156
+ for section in sections.keys():
157
+ if section not in ['work_and_employment', 'education_and_training', 'basics_info']:
158
+ text = '\n'.join(resume_lines[sections[section][0]:sections[section][1]])
159
+ result[section] =' '.join(wordninja.split(text))
160
+ return result
segmenter.py CHANGED
@@ -1,8 +1,14 @@
1
  from flashtext import KeywordProcessor
2
  import json
 
 
 
 
 
3
  class ResumeSegmenter():
4
 
5
  def __init__(self):
 
6
  self.resume_segments = {
7
  'objective': [],
8
  'work_and_employment': [],
@@ -12,41 +18,37 @@ class ResumeSegmenter():
12
  'misc': []
13
  }
14
  self.resume_indices = []
 
 
 
 
 
 
15
 
16
- def get_average_line_len(self, lines):
17
- sum = 0
18
- for line in lines:
19
- sum+=len(line)
20
- return sum / len(lines)
21
 
22
- def get_average_words_per_line(self, lines):
23
- sum = 0
24
- for line in lines:
25
- #other stopwords too?
26
- sum+= len(line.split(' '))
27
- return sum/ len(lines)
28
 
29
  def find_segment_indices(self, text_list):
30
- with open(r"./sections.json") as f:
31
- data = json.load(f)
32
- section_headers = data["section_headers"]
33
- f.close()
34
- keyword_processor = KeywordProcessor()
35
- keyword_processor.add_keywords_from_dict(keyword_dict=section_headers)
36
- average_words_per_line = self.get_average_words_per_line(text_list)
37
-
38
  for i, line in enumerate(text_list):
 
39
  if line[0].islower() or line[-1] == '.':
40
  continue
41
- kys = keyword_processor.extract_keywords(line)
 
 
 
 
42
  if len(kys) > 0:
43
- #other stopwords? from where? nltk lib ? pos tagger?
44
- if len(line.split(" ")) > average_words_per_line * 0.75:
45
  continue
46
- #is it necessary to keep the actual raw keyword?
47
  self.resume_indices.append(i)
48
  self.resume_segments[kys[0]].append(i)
49
-
 
50
  def slice_segments(self, lines):
51
  sections = {}
52
  if len(self.resume_indices) == 0:
@@ -73,12 +75,14 @@ class ResumeSegmenter():
73
  start = max(s[0], interval[0])
74
  end = min(s[1], interval[1])
75
  return [start, end], section
 
76
  def segment(self, resume_lines):
77
  self.find_segment_indices(resume_lines)
78
  sections = self.slice_segments(resume_lines)
79
- #whats the naming convention here sections_list or list_sections???
 
80
  sections_list = [(k, v) for k,v in sections.items() if len(v) > 0 ]
81
- intersection_intervals = []
82
 
83
  for i, s in enumerate(sections_list[:-1]):
84
  result = self.get_interval_intersection(sections_list[i+1:], s[1])
@@ -90,14 +94,17 @@ class ResumeSegmenter():
90
  intersection_intervals.append((a,b,s[0]))
91
 
92
  if len(intersection_intervals) > 0:
93
- print("there are intersections", intersection_intervals)
94
  #needs last method of cleaning overlapping intervals with zero shot
95
  #classifier + substract intervals
96
  return sections
97
 
98
  def get_parsed_sections(self, resume_lines):
 
99
  text_segments = {}
100
  sections = self.segment(resume_lines)
 
 
101
  for header_title, section in sections.items():
102
  lines = resume_lines[section[0]:section[1]]
103
  text_segments[header_title] = lines
 
1
  from flashtext import KeywordProcessor
2
  import json
3
+ import nltk
4
+ from nltk.tokenize import word_tokenize,LineTokenizer
5
+ from utils import get_average_words_per_line, get_average_line_len
6
+ import wordninja
7
+ nltk.download('punkt')
8
  class ResumeSegmenter():
9
 
10
  def __init__(self):
11
+ #has to be reinitialized for each resume !!! could just check the initialization in get_parsed_sections
12
  self.resume_segments = {
13
  'objective': [],
14
  'work_and_employment': [],
 
18
  'misc': []
19
  }
20
  self.resume_indices = []
21
+ with open(r"models/prototype/sections.json") as f:
22
+ data = json.load(f)
23
+ self.section_headers = data["section_headers"]
24
+ f.close()
25
+ self.keyword_processor = KeywordProcessor()
26
+ self.keyword_processor.add_keywords_from_dict(keyword_dict=self.section_headers)
27
 
 
 
 
 
 
28
 
 
 
 
 
 
 
29
 
30
  def find_segment_indices(self, text_list):
31
+
32
+ average_words_per_line = get_average_words_per_line(text_list)
33
+ average_sentence_length = get_average_line_len(text_list)
34
+
 
 
 
 
35
  for i, line in enumerate(text_list):
36
+ line_tokenized = LineTokenizer(blanklines='discard').tokenize(line)
37
  if line[0].islower() or line[-1] == '.':
38
  continue
39
+ kys = self.keyword_processor.extract_keywords(line)
40
+ if self.keyword_processor.extract_keywords(' '.join(word_tokenize(line))) != []:
41
+ text_list[i] = line = ' '.join(word_tokenize(line))
42
+ kys = self.keyword_processor.extract_keywords(line)
43
+
44
  if len(kys) > 0:
45
+ if len(word_tokenize(line)) > average_words_per_line * 0.75 and len(line) > average_sentence_length:
 
46
  continue
47
+
48
  self.resume_indices.append(i)
49
  self.resume_segments[kys[0]].append(i)
50
+
51
+
52
  def slice_segments(self, lines):
53
  sections = {}
54
  if len(self.resume_indices) == 0:
 
75
  start = max(s[0], interval[0])
76
  end = min(s[1], interval[1])
77
  return [start, end], section
78
+
79
  def segment(self, resume_lines):
80
  self.find_segment_indices(resume_lines)
81
  sections = self.slice_segments(resume_lines)
82
+ if sections is None:
83
+ return None
84
  sections_list = [(k, v) for k,v in sections.items() if len(v) > 0 ]
85
+ """intersection_intervals = []
86
 
87
  for i, s in enumerate(sections_list[:-1]):
88
  result = self.get_interval_intersection(sections_list[i+1:], s[1])
 
94
  intersection_intervals.append((a,b,s[0]))
95
 
96
  if len(intersection_intervals) > 0:
97
+ print("there are intersections", intersection_intervals)"""
98
  #needs last method of cleaning overlapping intervals with zero shot
99
  #classifier + subtract intervals
100
  return sections
101
 
102
  def get_parsed_sections(self, resume_lines):
103
+
104
  text_segments = {}
105
  sections = self.segment(resume_lines)
106
+ if sections is None:
107
+ return None, None
108
  for header_title, section in sections.items():
109
  lines = resume_lines[section[0]:section[1]]
110
  text_segments[header_title] = lines