File size: 4,410 Bytes
0d375ed
 
 
15d0f45
0d375ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from flashtext import KeywordProcessor
import json
class ResumeSegmenter():
    """Split a resume (given as a list of text lines) into labelled sections.

    Section-header lines are located with a flashtext ``KeywordProcessor``
    loaded from ``./sections.json``; the lines between consecutive headers
    form each section's body.
    """

    def __init__(self):
        # Canonical section name -> line indices of its header occurrences
        # (filled in by find_segment_indices).
        self.resume_segments = {
            'objective': [],
            'work_and_employment': [],
            'education_and_training': [],
            'skills': [],
            'accomplishments': [],
            'misc': []
        }
        # All detected header line indices, in document order.
        self.resume_indices = []

    def get_average_line_len(self, lines):
        """Return the mean character length of *lines* (0.0 for empty input)."""
        if not lines:
            return 0.0  # avoid ZeroDivisionError on an empty resume
        return sum(len(line) for line in lines) / len(lines)

    def get_average_words_per_line(self, lines):
        """Return the mean space-separated token count per line (0.0 for empty input)."""
        if not lines:
            return 0.0  # avoid ZeroDivisionError on an empty resume
        # NOTE(review): splits on single spaces only; other whitespace /
        # stopword handling may be worth revisiting (original TODO).
        return sum(len(line.split(' ')) for line in lines) / len(lines)

    def find_segment_indices(self, text_list):
        """Record indices of lines that look like section headers.

        A line is treated as a header when it is non-empty, does not start
        lowercase, does not end with a period, matches a known header
        keyword, and is noticeably shorter than the average line (long
        lines containing a keyword are assumed to be body prose).
        """
        with open(r"./sections.json") as f:
            section_headers = json.load(f)["section_headers"]
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_dict(keyword_dict=section_headers)
        average_words_per_line = self.get_average_words_per_line(text_list)

        for i, line in enumerate(text_list):
            # Guard against empty lines before indexing line[0] / line[-1];
            # lowercase starts and trailing periods indicate body text.
            if not line or line[0].islower() or line[-1] == '.':
                continue
            kys = keyword_processor.extract_keywords(line)
            if kys:
                # A keyword buried in a long line is probably prose,
                # not a heading.
                if len(line.split(" ")) > average_words_per_line * 0.75:
                    continue
                self.resume_indices.append(i)
                self.resume_segments[kys[0]].append(i)

    def slice_segments(self, lines):
        """Return ``{section: (start, end)}`` line ranges, or ``None`` when
        no headers were found.

        ``end`` is exclusive. A synthetic ``"basics_info"`` entry covers
        everything before the first detected header.
        """
        if len(self.resume_indices) == 0:
            return None

        sections = {}
        for section, points in self.resume_segments.items():
            if len(points) == 0:
                continue
            start_point = points[0]
            tmp_end_point = points[-1]
            # The section ends at the next header after its last occurrence,
            # clamped to the final header index.
            end_point = self.resume_indices[min(
                self.resume_indices.index(tmp_end_point) + 1,
                len(self.resume_indices) - 1)]
            # If this section's LAST header is the final header of the
            # document, the section runs to the end of the resume.
            # (Checking tmp_end_point — not start_point, as the original
            # did — also handles sections with several header occurrences.)
            if tmp_end_point == self.resume_indices[-1]:
                end_point = len(lines)
            sections[section] = (start_point, end_point)
        sections["basics_info"] = (0, self.resume_indices[0])
        return sections

    def get_interval_intersection(self, sections, interval):
        """Return ``([start, end], section)`` for the first section in
        *sections* whose ``(start, end)`` range overlaps *interval*, or
        ``None`` when none overlaps.

        Fixes the original bug where both branches returned on the very
        first iteration, so a non-overlapping first section hid any
        later overlap.
        """
        for section in sections:
            span = section[1]
            # Disjoint: one range ends at or before the other begins.
            if span[0] >= interval[1] or interval[0] >= span[1]:
                continue
            start = max(span[0], interval[0])
            end = min(span[1], interval[1])
            return [start, end], section
        return None

    def segment(self, resume_lines):
        """Segment *resume_lines* and return ``{section: (start, end)}``,
        or ``None`` when no section headers were detected.

        Overlapping sections are reported via ``print`` (debug output,
        preserved from the original); resolving overlaps (zero-shot
        classifier + interval subtraction) remains a TODO.
        """
        self.find_segment_indices(resume_lines)
        sections = self.slice_segments(resume_lines)
        if sections is None:
            # No headers found -- previously this crashed on .items().
            return None
        sections_list = [(k, v) for k, v in sections.items() if len(v) > 0]
        intersection_intervals = []

        for i, s in enumerate(sections_list[:-1]):
            result = self.get_interval_intersection(sections_list[i + 1:], s[1])
            if result is None:
                continue
            a, b = result
            print(a, b, s[0])
            intersection_intervals.append((a, b, s[0]))

        if len(intersection_intervals) > 0:
            print("there are intersections", intersection_intervals)
        return sections

    def get_parsed_sections(self, resume_lines):
        """Return ``(text_segments, sections)``: the raw lines of each
        section and the ``(start, end)`` ranges.

        Returns ``({}, None)`` when no sections were detected (previously
        this raised ``AttributeError`` on ``None``).
        """
        sections = self.segment(resume_lines)
        if sections is None:
            return {}, None
        text_segments = {
            header_title: resume_lines[span[0]:span[1]]
            for header_title, span in sections.items()
        }
        return text_segments, sections