from flashtext import KeywordProcessor
import json
import nltk
from nltk.tokenize import word_tokenize

from utils import get_average_words_per_line, get_average_line_len

# Make sure the NLTK tokenizer models are available.
nltk.download('punkt')


class ResumeSegmenter:
    
    def __init__(self):
        # Has to be re-initialized for each resume; alternatively, check for
        # prior initialization in get_parsed_sections.
        self.resume_segments = {
            'objective': [],
            'work_and_employment': [],
            'education_and_training': [],
            'skills': [],
            'accomplishments': [],
            'misc': []
        }
        self.resume_indices = []
        # The "with" block closes the file; no explicit close() is needed.
        with open(r"models/prototype/sections.json") as f:
            data = json.load(f)
        self.section_headers = data["section_headers"]
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_dict(keyword_dict=self.section_headers)
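
    # NOTE (illustrative; the schema is assumed from usage, not documented in
    # this file): sections.json is expected to map each canonical section name
    # to the header spellings flashtext should match, e.g.
    #   {"section_headers": {
    #       "work_and_employment": ["work experience", "employment history"],
    #       "skills": ["skills", "technical skills"]}}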
    def find_segment_indices(self, text_list):
        average_words_per_line = get_average_words_per_line(text_list)
        average_line_len = get_average_line_len(text_list)

        for i, line in enumerate(text_list):
            # Skip empty lines; indexing line[0]/line[-1] would fail on them.
            if not line.strip():
                continue
            # Section headers rarely start lowercase or end with a period.
            if line[0].islower() or line[-1] == '.':
                continue
            kys = self.keyword_processor.extract_keywords(line)
            # Re-tokenizing splits punctuation glued to a header (e.g. "Skills:"),
            # which lets flashtext match it; keep the normalized line if it does.
            if self.keyword_processor.extract_keywords(' '.join(word_tokenize(line))) != []:
                text_list[i] = line = ' '.join(word_tokenize(line))
                kys = self.keyword_processor.extract_keywords(line)

            if len(kys) > 0:
                # Lines close to average length are likely body text, not headers.
                if len(word_tokenize(line)) > average_words_per_line * 0.75 and len(line) > average_line_len:
                    continue

                self.resume_indices.append(i)
                self.resume_segments[kys[0]].append(i)

    def slice_segments(self, lines):
        sections = {}
        if len(self.resume_indices) == 0:
            return None

        for section, points in self.resume_segments.items():
            if len(points) == 0:
                continue
            start_point = points[0]
            tmp_end_point = points[-1]
            # A section runs from its first header to the next detected header
            # of any kind.
            end_point = self.resume_indices[min(self.resume_indices.index(tmp_end_point) + 1,
                                                len(self.resume_indices) - 1)]
            # The section whose last header is the final one detected extends
            # to the end of the resume.
            if tmp_end_point == self.resume_indices[-1]:
                end_point = len(lines)
            sections[section] = (start_point, end_point)
        # Everything above the first detected header is basic/contact info.
        sections["basics_info"] = (0, self.resume_indices[0])
        return sections
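
    # Worked example (hypothetical numbers): with headers detected at line 3
    # ("skills") and line 7 ("education_and_training") in a 12-line resume,
    # slice_segments returns
    #   {"skills": (3, 7), "education_and_training": (7, 12), "basics_info": (0, 3)}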
        
    def get_interval_intersection(self, sections, interval):
        # Return ([start, end], section) for the first section in `sections`
        # (a list of (name, (start, end)) pairs) that overlaps `interval`,
        # or None when nothing overlaps.
        for section in sections:
            s = section[1]
            if s[0] >= interval[1] or interval[0] >= s[1]:
                continue
            start = max(s[0], interval[0])
            end = min(s[1], interval[1])
            return [start, end], section
        return None
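
    # Example (hypothetical): the intervals overlap, so the clipped range and
    # the matching section are returned:
    #   get_interval_intersection([("skills", (3, 7))], (5, 9))
    #   -> ([5, 7], ("skills", (3, 7)))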
                
    def segment(self, resume_lines):
        self.find_segment_indices(resume_lines)
        sections = self.slice_segments(resume_lines)
        if sections is None:
            return None
        # Only consumed by the disabled overlap check below.
        sections_list = [(k, v) for k, v in sections.items() if len(v) > 0]

        # Overlap check, kept for reference but currently disabled:
        # intersection_intervals = []
        # for i, s in enumerate(sections_list[:-1]):
        #     result = self.get_interval_intersection(sections_list[i + 1:], s[1])
        #     if result is None:
        #         continue
        #     a, b = result
        #     print(a, b, s[0])
        #     intersection_intervals.append((a, b, s[0]))
        # if len(intersection_intervals) > 0:
        #     print("there are intersections", intersection_intervals)

        # TODO: still needs a final cleaning pass for overlapping intervals,
        # e.g. a zero-shot classifier plus interval subtraction.
        return sections
        
    def get_parsed_sections(self, resume_lines):
        text_segments = {}
        sections = self.segment(resume_lines)
        if sections is None:
            return None, None
        for header_title, section in sections.items():
            lines = resume_lines[section[0]:section[1]]
            text_segments[header_title] = lines

        return text_segments, sections
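

if __name__ == "__main__":
    # Minimal usage sketch (illustrative): "resume.txt" is a placeholder path,
    # not part of this repo. The segmenter expects one resume as a list of lines.
    with open("resume.txt") as f:
        resume_lines = f.read().splitlines()

    # Use a fresh instance per resume (see the note in __init__).
    segmenter = ResumeSegmenter()
    text_segments, sections = segmenter.get_parsed_sections(resume_lines)
    if text_segments is None:
        print("no section headers detected")
    else:
        for title, seg_lines in text_segments.items():
            start, end = sections[title]
            print(f"{title}: lines {start}-{end} ({len(seg_lines)} lines)")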