Spaces:
Runtime error
Runtime error
feat(app.py): update app.py
Browse files- .gitignore +1 -0
- app.py +43 -0
- data/W020230619818476939351.xls +0 -0
- data/W020230619818476975218.xls +0 -0
- predictor/__init__.py +443 -0
- requirements.txt +7 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
**/__pycache__
|
app.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from predictor import Predictor
from transformers import pipeline
from huggingface_hub import login
from datetime import date
import os
import gradio as gr

# Authenticate against the Hugging Face Hub so the private/remote-code
# pipelines below can be downloaded (token supplied via the HF_Token env var).
login(os.environ["HF_Token"])

# Local school-directory spreadsheets consumed by Predictor to map
# school names to education levels.
paths = [
    "data/W020230619818476939351.xls",
    "data/W020230619818476975218.xls"
]

# Build the resume-field predictor from two custom NER pipelines:
#  - "name": a model specialised for person-name extraction
#  - "common": a general resume NER model (ORG / EDU / TITLE entities)
# `today` fixes the reference date used to resolve open-ended job ranges.
predictor = Predictor(
    pipelines={
        "name": pipeline("nerpipe", model="minskiter/resume-token-classification-name-0708",trust_remote_code=True,use_auth_token=True),
        "common": pipeline("nerpipe",model="minskiter/resume-token-classification",trust_remote_code=True,use_auth_token=True)
    },
    paths=paths,
    today=date(2023,4,1)
)
def ner_predictor_gradio(input):
    """Run the resume predictor and flatten its nested result.

    The predictor returns a dict of entity lists (some of which are lists
    of lists, e.g. job triples); gr.HighlightedText wants one flat list of
    entity dicts, so every list-valued field is flattened one level deep.
    """
    result = predictor(input)
    flattened = []
    for value in result.values():
        # Skip scalar fields such as "work_time".
        if not isinstance(value, list):
            continue
        for item in value:
            if isinstance(item, list):
                flattened.extend(item)
            else:
                flattened.append(item)
    return {"text": input, "entities": flattened}
# Simple Gradio UI: paste resume text, see recognised entities highlighted.
# (Labels are user-facing Chinese strings: "enter your resume" /
# "resume recognition result".)
demo = gr.Interface(
    fn=ner_predictor_gradio,
    inputs=gr.Textbox(lines=5, label="输入你的简历"),
    outputs=gr.HighlightedText(label="简历识别结果"),
)
demo.launch()
data/W020230619818476939351.xls
ADDED
Binary file (429 kB). View file
|
|
data/W020230619818476975218.xls
ADDED
Binary file (56.3 kB). View file
|
|
predictor/__init__.py
ADDED
@@ -0,0 +1,443 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import math
import queue
import re
from datetime import date
from io import BytesIO
from typing import Any, Dict, List, Optional

import pandas as pd
import requests
from transformers import Pipeline

class Predictor():

    def __init__(
        self,
        pipelines: Optional[Dict[str, Pipeline]] = None,
        paths: Optional[List[str]] = None,
        today: Optional[date] = None,
    ) -> None:
        """Build a resume-field predictor.

        Args:
            pipelines: must contain a "name" NER pipeline and a "common"
                NER pipeline (ORG/EDU/TITLE entities).
            paths: local paths or http(s) URLs of school-directory .xls files.
            today: reference date used to resolve "至今" ("until now") in job
                date ranges; defaults to the current date at construction time.

        Raises:
            ValueError: if either required pipeline is missing, or the
                school spreadsheets yield no rows.
        """
        # None defaults instead of mutable {} / [] literals shared across
        # calls, and `today` resolved per construction rather than once at
        # module import time (the old `today=date.today()` default froze the
        # date when the class was defined).
        if pipelines is None:
            pipelines = {}
        if paths is None:
            paths = []
        if today is None:
            today = date.today()
        if "name" not in pipelines:
            raise ValueError("'name' pipeline is None")
        if "common" not in pipelines:
            raise ValueError("'common' pipeline is None")
        self.pipelines = pipelines
        self.today = today
        self.__init_split_data()
        self.__init_schools_data(paths)
        self.__init_patterns()
29 |
+
def __init_patterns(
|
30 |
+
self
|
31 |
+
):
|
32 |
+
last_name = r"[赵,钱,孙,李,周,吴,郑,王,冯,陈,楮,卫,蒋,沈,韩,杨,朱,秦,尤,许,何,吕,施,张,孔,曹,严,华,金,魏,陶,姜,戚,谢,邹,喻,"\
|
33 |
+
+r"柏,水,窦,章,云,苏,潘,葛,奚,范,彭,郎,鲁,韦,昌,马,苗,凤,花,方,俞,任,袁,柳,酆,鲍,史,唐,费,廉,岑,薛,雷,贺,倪,汤,滕,殷,罗," \
|
34 |
+
+ r"毕,郝,邬,安,常,乐,于,时,傅,皮,卞,齐,康,伍,余,元,卜,顾,孟,平,黄,和,穆,萧,尹,姚,邵,湛,汪,祁,毛,禹,狄,米,贝,明,臧,计,伏,成,戴,谈,宋,茅," \
|
35 |
+
+ r"庞,熊,纪,舒,屈,项,祝,董,梁,杜,阮,蓝,闽,席,季,麻,强,贾,路,娄,危,江,童,颜,郭,梅,盛,林,刁,锺,徐,丘,骆,高,夏,蔡,田,樊,胡,凌,霍,虞,万,支," \
|
36 |
+
+ r"柯,昝,管,卢,莫,经,房,裘,缪,干,解,应,宗,丁,宣,贲,邓,郁,单,杭,洪,包,诸,左,石,崔,吉,钮,龚,程,嵇,邢,滑,裴,陆,荣,翁,荀,羊,於,惠,甄,麹,家," \
|
37 |
+
+ r"封,芮,羿,储,靳,汲,邴,糜,松,井,段,富,巫,乌,焦,巴,弓,牧,隗,山,谷,车,侯,宓,蓬,全,郗,班,仰,秋,仲,伊,宫,宁,仇,栾,暴,甘,斜,厉,戎,祖,武,符," \
|
38 |
+
+ r"刘,景,詹,束,龙,叶,幸,司,韶,郜,黎,蓟,薄,印,宿,白,怀,蒲,邰,从,鄂,索,咸,籍,赖,卓,蔺,屠,蒙,池,乔,阴,郁,胥,能,苍,双,闻,莘,党,翟,谭,贡,劳," \
|
39 |
+
+ r"逄,姬,申,扶,堵,冉,宰,郦,雍,郤,璩,桑,桂,濮,牛,寿,通,边,扈,燕,冀,郏,浦,尚,农,温,别,庄,晏,柴,瞿,阎,充,慕,连,茹,习,宦,艾,鱼,容,向,古,易," \
|
40 |
+
+ r"慎,戈,廖,庾,终,暨,居,衡,步,都,耿,满,弘,匡,国,文,寇,广,禄,阙,东,欧,殳,沃,利,蔚,越,夔,隆,师,巩,厍,聂,晁,勾,敖,融,冷,訾,辛,阚,那,简,饶," \
|
41 |
+
+ r"空,曾,毋,沙,乜,养,鞠,须,丰,巢,关,蒯,相,查,后,荆,红,游,竺,权,逑,盖,益,桓,公,万俟,司马,上官,欧阳,夏侯,诸葛,闻人,东方,赫连,皇甫,尉迟," \
|
42 |
+
+ r"公羊,澹台,公冶,宗政,濮阳,淳于,单于,太叔,申屠,公孙,仲孙,轩辕,令狐,锺离,宇文,长孙,慕容,鲜于,闾丘,司徒,司空,丌官,司寇,仉,督,子车," \
|
43 |
+
+ r"颛孙,端木,巫马,公西,漆雕,乐正,壤驷,公良,拓拔,夹谷,宰父,谷梁,晋,楚,阎,法,汝,鄢,涂,钦,段干,百里,东郭,南门,呼延,归,海,羊舌,微生,岳," \
|
44 |
+
+ r"帅,缑,亢,况,后,有,琴,梁丘,左丘,东门,西门,商,牟,佘,佴,伯,赏,南宫,墨,哈,谯,笪,年,爱,阳,佟,第五,言,福,邱]"
|
45 |
+
first_name = r' {0,3}[\u4e00-\u9fa5]( {0,3}[\u4e00-\u9fa5]){0,3}'
|
46 |
+
self.name_pattern = re.compile(last_name + first_name)
|
47 |
+
self.phone_pattern = re.compile(r'1 {0,4}(3 {0,4}\d|4 {0,4}[5-9]|5 {0,4}[0-35-9]|6 {0,4}[2567]|7 {0,4}[0-8]|8 {0,4}\d|9 {0,4}[0-35-9]) {0,4}(\d {0,4}){8}')
|
48 |
+
self.email_pattern = re.compile(r'([a-zA-Z0-9_-] {0,4})+@([a-zA-Z0-9_-] {0,4})+(\. {0,4}([a-zA-Z0-9_-] {0,4})+)+')
|
49 |
+
self.gender_pattern = re.compile(r'(性 {0,8}别.*?)?\s*?(男|女)\s+?')
|
50 |
+
self.age_patterns = [
|
51 |
+
re.compile(r"(\d{1,2})岁|年龄.{0,10}(\d{1,2})"),
|
52 |
+
re.compile(r"生.{0,12}(([12]\d{3})[年|.]?(([01]?\d)[月|.]?)?(([0-3]?\d)[日|.]?)?)"),
|
53 |
+
]
|
54 |
+
self.works_key_pattern = re.compile("工作经(历|验)|experience",re.M|re.I)
|
55 |
+
self.job_time_patterns = re.compile('([1-2]\d{3}(\D?[01]?\d){0,2})\D?([1-2]\d{3}(\D?[01]?\d){0,2}|至今)')
|
56 |
+
self.edu_index = ["博士","硕士","研究生","学士","本科","大专","专科","中专","高中","初中","小学"]
|
57 |
+
self.edu_patterns = list(re.compile(i) for i in self.edu_index)
|
58 |
+
self.school_pattern = re.compile(r"([a-zA-Z0-9 \u4e00-\u9fa5]{1,18}(学院|大学|中学|小学|学校|Unverisity|College))")
|
59 |
+
|
60 |
+
def _is_url(self, path: str):
|
61 |
+
return path.startswith('http://') or path.startswith('https://')
|
62 |
+
|
63 |
+
def __init_schools_data(
|
64 |
+
self,
|
65 |
+
paths: List[str],
|
66 |
+
):
|
67 |
+
schools = {}
|
68 |
+
headers = {
|
69 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
|
70 |
+
}
|
71 |
+
for path in paths:
|
72 |
+
stream = None
|
73 |
+
if self._is_url(path):
|
74 |
+
res = requests.get(path,headers=headers)
|
75 |
+
|
76 |
+
if res.status_code==200:
|
77 |
+
stream = BytesIO(res.content)
|
78 |
+
else:
|
79 |
+
with open(path, 'rb') as f:
|
80 |
+
stream = BytesIO(f.read())
|
81 |
+
df = pd.read_excel(stream)
|
82 |
+
for row in df.iterrows():
|
83 |
+
if isinstance(row[1][1],float) and math.isnan(row[1][1]):
|
84 |
+
continue
|
85 |
+
if row[1][1]=='学校名称':
|
86 |
+
continue
|
87 |
+
# [学校] = 学历(本科、专科)
|
88 |
+
if len(row[1])>5:
|
89 |
+
schools[row[1][1]] = row[1][5]
|
90 |
+
else:
|
91 |
+
schools[row[1][1]] = "成人学校"
|
92 |
+
self.schools = schools
|
93 |
+
if len(schools)==0:
|
94 |
+
raise ValueError("学校数据为空")
|
95 |
+
|
    def __init_split_data(
        self
    ):
        """Build the separator-character set used for block splitting and span expansion."""
        # Punctuation (ASCII + full-width CJK) treated as text separators by
        # split_to_blocks() and get_expand_span().
        # NOTE(review): the multi-character entries ('——', '...', '……') can
        # never match a single-character membership test `text[i] in splits`
        # — presumably harmless, but confirm they were meant as single chars.
        self.splits = {'\\', '_', '"', '%', '{', '《', ')', '$', '(', '\n', '~', '*', ':', '!', ';', '”', '’', '\t', '?', '-', ';', '》', '】', '`', '、', '+', '“', '[', '—', '·', ')', '=', '‘', '}', '?', ',', '&', '@', '#', ']', '——', ' ', '.', '【', "'", '>', ',', '/', ':', '。', '...', '^', '(', '<', '|', '……', '!'}
101 |
+
def to_date(self, datestr:str):
|
102 |
+
if re.match("^\d{4}$",datestr):
|
103 |
+
return date(int(datestr),1,1)
|
104 |
+
match = re.match("^\d{4}(\D)\d{1,2}",datestr)
|
105 |
+
if match is not None:
|
106 |
+
try:
|
107 |
+
m = min(max(int(datestr.split(match.group(1))[1]),1),12)
|
108 |
+
return date(int(datestr.split(match.group(1))[0]),m,1)
|
109 |
+
except ValueError:
|
110 |
+
print(int(datestr.split(match.group(1))[0]),int(datestr.split(match.group(1))[1]))
|
111 |
+
raise
|
112 |
+
if datestr=="至今":
|
113 |
+
return self.today
|
114 |
+
return self.today
|
115 |
+
|
    def split_to_blocks(
        self,
        text: str,
        max_block_len: int = 510,
        overlap: bool = True,
        max_overlap_len: int = 20,
    ):
        """Split *text* at separator characters into blocks of at most
        roughly ``max_block_len`` chars (sized for a 512-token NER model).

        Returns a list of dicts with "start"/"end" offsets into *text* and
        the block "text" itself. When ``overlap`` is true, consecutive
        blocks share up to ``max_overlap_len`` trailing characters so
        entities spanning a cut are not lost.
        """
        # Current block under construction; start == -1 means "not started".
        block = {
            "start": -1,
            "end": -1,
            "text": "",
        }
        blocks = []
        # Queue of recent separator positions; its head is the earliest
        # separator within max_overlap_len of the current position, used as
        # the overlapping start of the next block.
        overlap_end = queue.Queue()
        for i in range(len(text)):
            if text[i] in self.splits:
                # Leading separators before any content: skip.
                if block["start"]==-1:
                    continue
                # Block grew past the limit: flush it up to the previous
                # separator and begin the next block (overlapping if asked).
                if block["end"]!=-1 and i-block['start']>=max_block_len:
                    block["text"] = text[block["start"]:block["end"]]
                    blocks.append(block)
                    block = {
                        "start": overlap_end.queue[0]+1 if overlap else block['end']+1,
                        "end": -1,
                        "text": "",
                    }
                block["end"] = i
                # Drop separator positions that fell out of the overlap window.
                while overlap_end.qsize()>0 and overlap_end.queue[0]+max_overlap_len<=i:
                    overlap_end.get()
                overlap_end.put(i)
            else:
                if block["start"]==-1:
                    block["start"] = i
        # Flush the trailing block (text may not end with a separator).
        if block["start"]!=-1:
            block["end"] = len(text)
            block["text"] = text[block["start"]:block["end"]]
            blocks.append(block)
        return blocks
156 |
+
def get_expand_span(
|
157 |
+
self,
|
158 |
+
text: str,
|
159 |
+
start: int,
|
160 |
+
end: int,
|
161 |
+
max_expand_length=20,
|
162 |
+
):
|
163 |
+
expand_l,expand_r = start,end
|
164 |
+
for l in range(max(start-max_expand_length,0), start):
|
165 |
+
if text[l] in self.splits:
|
166 |
+
expand_l = l+1
|
167 |
+
break
|
168 |
+
for r in range(min(end+max_expand_length,len(text)-1), end, -1):
|
169 |
+
if text[r] in self.splits:
|
170 |
+
expand_r = r
|
171 |
+
break
|
172 |
+
return text[expand_l:expand_r], expand_l, expand_r
|
173 |
+
|
174 |
+
def remove_blanks(
|
175 |
+
self,
|
176 |
+
text: str,
|
177 |
+
blank_pattern: re.Pattern,
|
178 |
+
):
|
179 |
+
index_mapper = {}
|
180 |
+
new_text = []
|
181 |
+
for i in range(len(text)):
|
182 |
+
if blank_pattern.match(text[i]) is not None:
|
183 |
+
continue
|
184 |
+
index_mapper[len(new_text)] = i
|
185 |
+
new_text.append(text[i])
|
186 |
+
return ''.join(new_text), index_mapper
|
187 |
+
|
188 |
+
def process(self, text)->Dict[str, Any]:
|
189 |
+
return_obj = {
|
190 |
+
"name": [],
|
191 |
+
"age": [],
|
192 |
+
"gender": [],
|
193 |
+
"phone": [],
|
194 |
+
"email": [],
|
195 |
+
"schools": [],
|
196 |
+
"work_time": 0,
|
197 |
+
"edus": [],
|
198 |
+
"jobs": [],
|
199 |
+
"titles": []
|
200 |
+
}
|
201 |
+
# 获取名字,先过滤所有空白字符,防止名字中间有空格
|
202 |
+
remove_blanks_text, index_mapper = self.remove_blanks(text, re.compile(r' '))
|
203 |
+
for name_match in self.name_pattern.finditer(remove_blanks_text):
|
204 |
+
start,end = name_match.span()
|
205 |
+
expand_text, start, end = self.get_expand_span(remove_blanks_text, start, end)
|
206 |
+
entities = self.pipelines['name'](expand_text)
|
207 |
+
for entity in entities:
|
208 |
+
if entity['entity']=='NAME' and self.name_pattern.match(entity['word']) is not None:
|
209 |
+
obj = {
|
210 |
+
'start': index_mapper[start+entity['start']],
|
211 |
+
'end': index_mapper[start+entity['end']-1]+1,
|
212 |
+
'entity': 'NAME',
|
213 |
+
'text': entity['word']
|
214 |
+
}
|
215 |
+
repeat = False
|
216 |
+
for o in return_obj['name']:
|
217 |
+
if obj['start']==o['start'] and obj['end']==o['end']:
|
218 |
+
repeat = True
|
219 |
+
break
|
220 |
+
if not repeat:
|
221 |
+
obj['origin'] = text[obj['start']:obj['end']]
|
222 |
+
return_obj['name'].append(obj)
|
223 |
+
# 获取年龄
|
224 |
+
for age_match in self.age_patterns[0].finditer(remove_blanks_text):
|
225 |
+
age = None
|
226 |
+
s,e = -1,-1
|
227 |
+
if age_match.group(1) is not None:
|
228 |
+
age = age_match.group(1)
|
229 |
+
s,e = age_match.span(1)
|
230 |
+
elif age_match.group(2) is not None:
|
231 |
+
age = age_match.group(2)
|
232 |
+
s,e = age_match.span(1)
|
233 |
+
if age is not None:
|
234 |
+
return_obj['age'].append({
|
235 |
+
'start': index_mapper[s],
|
236 |
+
'end': index_mapper[e-1]+1,
|
237 |
+
'text': str(age),
|
238 |
+
'entity': 'AGE',
|
239 |
+
'origin': text[index_mapper[s]:index_mapper[e-1]+1]
|
240 |
+
})
|
241 |
+
for age_match in self.age_patterns[1].finditer(remove_blanks_text):
|
242 |
+
age = None
|
243 |
+
s,e = -1,-1
|
244 |
+
year = age_match.group(2)
|
245 |
+
if year is not None:
|
246 |
+
year = int(year)
|
247 |
+
month = age_match.group(4)
|
248 |
+
if month is not None:
|
249 |
+
month = int(month)
|
250 |
+
else:
|
251 |
+
month = 1
|
252 |
+
day = age_match.group(6)
|
253 |
+
if day is not None:
|
254 |
+
day = int(day)
|
255 |
+
else:
|
256 |
+
day = 1
|
257 |
+
age = date.today().year - year
|
258 |
+
if date.today().month<month or (date.today().month==month and date.today().day<day):
|
259 |
+
age -= 1
|
260 |
+
if age is not None:
|
261 |
+
s,e = age_match.span(1)
|
262 |
+
return_obj['age'].append({
|
263 |
+
'start': index_mapper[s],
|
264 |
+
'end': index_mapper[e-1]+1,
|
265 |
+
'text': str(age),
|
266 |
+
'entity': 'AGE',
|
267 |
+
'origin': text[index_mapper[s]:index_mapper[e-1]+1]
|
268 |
+
})
|
269 |
+
# 获取学校
|
270 |
+
for school_match in self.school_pattern.finditer(remove_blanks_text):
|
271 |
+
start,end = school_match.span()
|
272 |
+
expand_text, start, end = self.get_expand_span(remove_blanks_text, start, end)
|
273 |
+
entities = self.pipelines['common'](expand_text)
|
274 |
+
for entity in entities:
|
275 |
+
if entity['entity']=="ORG" and self.school_pattern.search(entity['word']) is not None:
|
276 |
+
obj = {
|
277 |
+
'start': index_mapper[start+entity['start']],
|
278 |
+
'end': index_mapper[start+entity['end']-1]+1,
|
279 |
+
'entity': 'SCHOOL'
|
280 |
+
}
|
281 |
+
for school in self.schools:
|
282 |
+
if school in entity['word']:
|
283 |
+
obj['text'] = school
|
284 |
+
obj["level"] = self.schools[school]
|
285 |
+
break
|
286 |
+
repeat = False
|
287 |
+
for o in return_obj['schools']:
|
288 |
+
if obj['start']==o['start'] and obj['end']==o['end']:
|
289 |
+
repeat = True
|
290 |
+
break
|
291 |
+
if not repeat:
|
292 |
+
obj['origin'] = text[obj['start']:obj['end']]
|
293 |
+
return_obj['schools'].append(obj)
|
294 |
+
# 正则找学校
|
295 |
+
for school_match in re.finditer(r"|".join(self.schools.keys()), remove_blanks_text):
|
296 |
+
start,end = school_match.span()
|
297 |
+
obj = {
|
298 |
+
'start': index_mapper[start],
|
299 |
+
'end': index_mapper[end-1]+1,
|
300 |
+
'entity': 'SCHOOL',
|
301 |
+
'text': school_match.group(),
|
302 |
+
}
|
303 |
+
repeat = False
|
304 |
+
for o in return_obj['schools']:
|
305 |
+
if obj['start']==o['start'] and obj['end']==o['end']:
|
306 |
+
repeat = True
|
307 |
+
break
|
308 |
+
if not repeat:
|
309 |
+
obj['origin'] = text[obj['start']:obj['end']]
|
310 |
+
obj['level'] = self.schools[obj['text']]
|
311 |
+
return_obj['schools'].append(obj)
|
312 |
+
# 获取学历
|
313 |
+
for i,pattern in enumerate(self.edu_patterns):
|
314 |
+
for edu_match in pattern.finditer(remove_blanks_text):
|
315 |
+
start,end = edu_match.span()
|
316 |
+
expand_text, start, end = self.get_expand_span(remove_blanks_text, start, end)
|
317 |
+
entities = self.pipelines['common'](expand_text)
|
318 |
+
for entity in entities:
|
319 |
+
if entity['entity']=='EDU' and pattern.search(entity['word']) is not None:
|
320 |
+
obj = {
|
321 |
+
'start': index_mapper[start+entity['start']],
|
322 |
+
'end': index_mapper[start+entity['end']-1]+1,
|
323 |
+
'text': self.edu_index[i],
|
324 |
+
'entity': 'EDU',
|
325 |
+
}
|
326 |
+
repeat = False
|
327 |
+
for o in return_obj['edus']:
|
328 |
+
if obj['start']==o['start'] and obj['end']==o['end']:
|
329 |
+
repeat = True
|
330 |
+
break
|
331 |
+
if not repeat:
|
332 |
+
obj['origin'] = text[obj['start']:obj['end']]
|
333 |
+
return_obj['edus'].append(obj)
|
334 |
+
# 如果有工作经历
|
335 |
+
if self.works_key_pattern.search(remove_blanks_text) is not None:
|
336 |
+
for job_time_match in self.job_time_patterns.finditer(remove_blanks_text):
|
337 |
+
origin_start,origin_end = job_time_match.span()
|
338 |
+
# convert_to_date
|
339 |
+
fr = self.to_date(job_time_match.group(1))
|
340 |
+
fs,fe = job_time_match.span(1)
|
341 |
+
to = self.to_date(job_time_match.group(3))
|
342 |
+
ts,te = job_time_match.span(3)
|
343 |
+
expand_text, start, end = self.get_expand_span(remove_blanks_text, origin_start, origin_end, max_expand_length=100)
|
344 |
+
entities = self.pipelines['common'](expand_text)
|
345 |
+
objs = []
|
346 |
+
for entity in entities:
|
347 |
+
if entity['entity']=="ORG":
|
348 |
+
obj = {
|
349 |
+
'start': index_mapper[start+entity['start']],
|
350 |
+
'end': index_mapper[start+entity['end']-1]+1,
|
351 |
+
'entity': 'COMPANY',
|
352 |
+
'text': entity['word'],
|
353 |
+
'dis': min(
|
354 |
+
abs(origin_start-start-entity['end']+1),
|
355 |
+
abs(origin_end-start-entity['start'])
|
356 |
+
),
|
357 |
+
}
|
358 |
+
obj['origin'] = text[obj['start']:obj['end']]
|
359 |
+
objs.append(obj)
|
360 |
+
objs.sort(key=lambda x:x['dis'])
|
361 |
+
if len(objs)>0 and self.school_pattern.search(objs[0]['text']) is None:
|
362 |
+
del objs[0]['dis']
|
363 |
+
from_date = {
|
364 |
+
'start': index_mapper[fs],
|
365 |
+
'end': index_mapper[fe-1]+1,
|
366 |
+
'text': fr.isoformat(),
|
367 |
+
'entity': 'DATE',
|
368 |
+
'origin': text[index_mapper[fs]:index_mapper[fe-1]+1]
|
369 |
+
}
|
370 |
+
to_date = {
|
371 |
+
'start': index_mapper[ts],
|
372 |
+
'end': index_mapper[te-1]+1,
|
373 |
+
'text': to.isoformat(),
|
374 |
+
'entity': 'DATE',
|
375 |
+
'origin': text[index_mapper[ts]:index_mapper[te-1]+1]
|
376 |
+
}
|
377 |
+
jobs = [objs[0],from_date,to_date]
|
378 |
+
return_obj['jobs'].append(jobs)
|
379 |
+
return_obj["jobs"].sort(key=lambda x:date.fromisoformat(x[1]['text']))
|
380 |
+
# 计算工作时间
|
381 |
+
last_end = None
|
382 |
+
work_month = 0
|
383 |
+
for i in range(0,len(return_obj["jobs"])):
|
384 |
+
start = date.fromisoformat(return_obj["jobs"][i][1]['text'])
|
385 |
+
end = date.fromisoformat(return_obj["jobs"][i][2]['text'])
|
386 |
+
if last_end is not None and start<last_end:
|
387 |
+
start = last_end
|
388 |
+
diff_y = end.year-start.year
|
389 |
+
diff_m = end.month-start.month
|
390 |
+
work_month += diff_y * 12 + diff_m
|
391 |
+
last_end = end
|
392 |
+
return_obj['work_time'] = math.ceil(work_month/12)
|
393 |
+
# 获取手机号码
|
394 |
+
for phone_match in self.phone_pattern.finditer(text):
|
395 |
+
start,end = phone_match.span()
|
396 |
+
return_obj['phone'].append({
|
397 |
+
'start': start,
|
398 |
+
'end': end,
|
399 |
+
'entity': 'PHONE',
|
400 |
+
'origin': text[start:end],
|
401 |
+
'text': re.sub('\s','',text[start:end])
|
402 |
+
})
|
403 |
+
for email_match in self.email_pattern.finditer(text):
|
404 |
+
start,end = email_match.span()
|
405 |
+
return_obj['email'].append({
|
406 |
+
'start': start,
|
407 |
+
'end': end,
|
408 |
+
'entity': 'EMAIL',
|
409 |
+
'origin': text[start:end],
|
410 |
+
'text': re.sub('\s','',text[start:end])
|
411 |
+
})
|
412 |
+
for gender_match in self.gender_pattern.finditer(text):
|
413 |
+
start,end = gender_match.span(2)
|
414 |
+
return_obj['gender'].append({
|
415 |
+
'start': start,
|
416 |
+
'end': end,
|
417 |
+
'entity': 'GENDER',
|
418 |
+
'word': text[start:end],
|
419 |
+
'text': text[start:end]
|
420 |
+
})
|
421 |
+
for block in self.split_to_blocks(remove_blanks_text):
|
422 |
+
entities = self.pipelines["common"](block["text"])
|
423 |
+
for entity in entities:
|
424 |
+
if entity['entity']=='TITLE':
|
425 |
+
obj = {
|
426 |
+
'start': index_mapper[block['start']+entity['start']],
|
427 |
+
'end': index_mapper[block['start']+entity['end']-1]+1,
|
428 |
+
'text': entity['word'],
|
429 |
+
'entity': 'TITLE',
|
430 |
+
}
|
431 |
+
obj['origin'] = text[obj['start']:obj['end']]
|
432 |
+
repeat = False
|
433 |
+
for o in return_obj['titles']:
|
434 |
+
if obj['start']==o['start'] and obj['end']==o['end']:
|
435 |
+
repeat = True
|
436 |
+
break
|
437 |
+
if not repeat:
|
438 |
+
return_obj['titles'].append(obj)
|
439 |
+
return return_obj
|
440 |
+
|
    def __call__(self, *args: Any, **kwds: Any) -> Any:
        """Make the predictor directly callable; delegates to process()."""
        return self.process(*args, **kwds)
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers==4.30.1
|
2 |
+
gradio==3.36.1
|
3 |
+
huggingface-hub==0.15.1
|
4 |
+
torch==2.0.1
|
5 |
+
pandas==2.0.3
|
6 |
+
requests==2.31.0
|
7 |
+
xlrd==2.0.1
|