minskiter's picture
fix(predictor): fix age error
fddeeda
raw
history blame
22.6 kB
from typing import Any, Dict,List
from transformers import Pipeline
import requests
import re
from io import BytesIO
import pandas as pd
import math
import queue
from datetime import date
import time
import logging
class Predictor():
def __init__(
self,
pipelines: Dict[str, Pipeline] = {},
paths: List[str] = [],
today: date = date.today()
) -> None:
if "name" not in pipelines:
raise ValueError("'name' pipeline is None")
if "common" not in pipelines:
raise ValueError("'common' pipeline is None")
self.pipelines = pipelines
self.today = today
self.logger = logging.getLogger(__name__)
self.__init_split_data()
self.__init_schools_data(paths)
self.__init_patterns()
def __init_patterns(
self
):
last_name = r"[赵,钱,孙,李,周,吴,郑,王,冯,陈,楮,卫,蒋,沈,韩,杨,朱,秦,尤,许,何,吕,施,张,孔,曹,严,华,金,魏,陶,姜,戚,谢,邹,喻,"\
+r"柏,水,窦,章,云,苏,潘,葛,奚,范,彭,郎,鲁,韦,昌,马,苗,凤,花,方,俞,任,袁,柳,酆,鲍,史,唐,费,廉,岑,薛,雷,贺,倪,汤,滕,殷,罗," \
+ r"毕,郝,邬,安,常,乐,于,时,傅,皮,卞,齐,康,伍,余,元,卜,顾,孟,平,黄,和,穆,萧,尹,姚,邵,湛,汪,祁,毛,禹,狄,米,贝,明,臧,计,伏,成,戴,谈,宋,茅," \
+ r"庞,熊,纪,舒,屈,项,祝,董,梁,杜,阮,蓝,闽,席,季,麻,强,贾,路,娄,危,江,童,颜,郭,梅,盛,林,刁,锺,徐,丘,骆,高,夏,蔡,田,樊,胡,凌,霍,虞,万,支," \
+ r"柯,昝,管,卢,莫,经,房,裘,缪,干,解,应,宗,丁,宣,贲,邓,郁,单,杭,洪,包,诸,左,石,崔,吉,钮,龚,程,嵇,邢,滑,裴,陆,荣,翁,荀,羊,於,惠,甄,麹,家," \
+ r"封,芮,羿,储,靳,汲,邴,糜,松,井,段,富,巫,乌,焦,巴,弓,牧,隗,山,谷,车,侯,宓,蓬,全,郗,班,仰,秋,仲,伊,宫,宁,仇,栾,暴,甘,斜,厉,戎,祖,武,符," \
+ r"刘,景,詹,束,龙,叶,幸,司,韶,郜,黎,蓟,薄,印,宿,白,怀,蒲,邰,从,鄂,索,咸,籍,赖,卓,蔺,屠,蒙,池,乔,阴,郁,胥,能,苍,双,闻,莘,党,翟,谭,贡,劳," \
+ r"逄,姬,申,扶,堵,冉,宰,郦,雍,郤,璩,桑,桂,濮,牛,寿,通,边,扈,燕,冀,郏,浦,尚,农,温,别,庄,晏,柴,瞿,阎,充,慕,连,茹,习,宦,艾,鱼,容,向,古,易," \
+ r"慎,戈,廖,庾,终,暨,居,衡,步,都,耿,满,弘,匡,国,文,寇,广,禄,阙,东,欧,殳,沃,利,蔚,越,夔,隆,师,巩,厍,聂,晁,勾,敖,融,冷,訾,辛,阚,那,简,饶," \
+ r"空,曾,毋,沙,乜,养,鞠,须,丰,巢,关,蒯,相,查,后,荆,红,游,竺,权,逑,盖,益,桓,公,万俟,司马,上官,欧阳,夏侯,诸葛,闻人,东方,赫连,皇甫,尉迟," \
+ r"公羊,澹台,公冶,宗政,濮阳,淳于,单于,太叔,申屠,公孙,仲孙,轩辕,令狐,锺离,宇文,长孙,慕容,鲜于,闾丘,司徒,司空,丌官,司寇,仉,督,子车," \
+ r"颛孙,端木,巫马,公西,漆雕,乐正,壤驷,公良,拓拔,夹谷,宰父,谷梁,晋,楚,阎,法,汝,鄢,涂,钦,段干,百里,东郭,南门,呼延,归,海,羊舌,微生,岳," \
+ r"帅,缑,亢,况,后,有,琴,梁丘,左丘,东门,西门,商,牟,佘,佴,伯,赏,南宫,墨,哈,谯,笪,年,爱,阳,佟,第五,言,福,邱]"
first_name = r' {0,3}[\u4e00-\u9fa5]( {0,3}[\u4e00-\u9fa5]){0,3}'
self.name_pattern = re.compile(last_name + first_name)
self.phone_pattern = re.compile(r'1 {0,4}(3 {0,4}\d|4 {0,4}[5-9]|5 {0,4}[0-35-9]|6 {0,4}[2567]|7 {0,4}[0-8]|8 {0,4}\d|9 {0,4}[0-35-9]) {0,4}(\d {0,4}){8}')
self.email_pattern = re.compile(r'([a-zA-Z0-9_-] {0,4})+@([a-zA-Z0-9_-] {0,4})+(\. {0,4}([a-zA-Z0-9_-] {0,4})+)+')
self.gender_pattern = re.compile(r'(性 {0,8}别.*?)?\s*?(男|女)')
self.age_patterns = [
re.compile(r"(\d{1,2})岁|年龄.{0,10}?(\d{1,2})"),
re.compile(r"生.{0,12}(([12]\d{3})[年|.]?(([01]?\d)[月|.]?)?(([0-3]?\d)[日|.]?)?)"),
]
self.works_key_pattern = re.compile("工作|experience|work",re.M|re.I)
self.job_time_patterns = re.compile('([1-2]\d{3}(\D?[01]?\d){0,2})\D?([1-2]\d{3}(\D?[01]?\d){0,2}|至今)')
self.edu_index = ["博士","硕士","研究生","学士","本科","大专","专科","中专","高中","初中","小学"]
self.edu_patterns = list(re.compile(i) for i in self.edu_index)
self.school_pattern = re.compile(r"([a-zA-Z0-9 \u4e00-\u9fa5]{1,18}(学院|大学|中学|小学|学校|Unverisity|College))")
def _is_url(self, path: str):
return path.startswith('http://') or path.startswith('https://')
def __init_schools_data(
self,
paths: List[str],
):
schools = {}
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
}
for path in paths:
stream = None
if self._is_url(path):
res = requests.get(path,headers=headers)
if res.status_code==200:
stream = BytesIO(res.content)
else:
with open(path, 'rb') as f:
stream = BytesIO(f.read())
df = pd.read_excel(stream)
for row in df.iterrows():
if isinstance(row[1][1],float) and math.isnan(row[1][1]):
continue
if row[1][1]=='学校名称':
continue
# [学校] = 学历(本科、专科)
if len(row[1])>5:
schools[row[1][1]] = row[1][5]
else:
schools[row[1][1]] = "成人学校"
self.schools = schools
if len(schools)==0:
raise ValueError("学校数据为空")
def __init_split_data(
self
):
self.splits = {'\\', '_', '"', '%', '{', '《', ')', '$', '(', '\n', '~', '*', ':', '!', ';', '”', '’', '\t', '?', '-', ';', '》', '】', '`', '、', '+', '“', '[', '—', '·', ')', '=', '‘', '}', '?', ',', '&', '@', '#', ']', '——', ' ', '.', '【', "'", '>', ',', '/', ':', '。', '...', '^', '(', '<', '|', '……', '!'}
def to_date(self, datestr:str):
if re.match("^\d{4}$",datestr):
return date(int(datestr),1,1)
match = re.match("^\d{4}(\D)\d{1,2}",datestr)
if match is not None:
try:
m = min(max(int(datestr.split(match.group(1))[1]),1),12)
return date(int(datestr.split(match.group(1))[0]),m,1)
except ValueError:
print(int(datestr.split(match.group(1))[0]),int(datestr.split(match.group(1))[1]))
raise
if datestr=="至今":
return self.today
return None
def split_to_blocks(
self,
text: str,
max_block_len: int = 510,
overlap: bool = True,
max_overlap_len: int = 20,
):
block = {
"start": -1,
"end": -1,
"text": "",
}
blocks = []
overlap_end = queue.Queue()
for i in range(len(text)):
if text[i] in self.splits:
if block["start"]==-1:
continue
if block["end"]!=-1 and i-block['start']>=max_block_len:
block["text"] = text[block["start"]:block["end"]]
blocks.append(block)
block = {
"start": overlap_end.queue[0]+1 if overlap else block['end']+1,
"end": -1,
"text": "",
}
block["end"] = i
while overlap_end.qsize()>0 and overlap_end.queue[0]+max_overlap_len<=i:
overlap_end.get()
overlap_end.put(i)
else:
if block["start"]==-1:
block["start"] = i
# last block
if block["start"]!=-1:
block["end"] = len(text)
block["text"] = text[block["start"]:block["end"]]
blocks.append(block)
return blocks
def get_expand_span(
self,
text: str,
start: int,
end: int,
max_expand_length=10,
):
expand_l,expand_r = start,end
for l in range(max(start-max_expand_length,0), start):
if text[l] in self.splits:
expand_l = l+1
break
for r in range(min(end+max_expand_length,len(text)-1), end, -1):
if text[r] in self.splits:
expand_r = r
break
return text[expand_l:expand_r], expand_l, expand_r
def remove_blanks(
self,
text: str,
blank_pattern: re.Pattern,
):
index_mapper = {}
new_text = []
for i in range(len(text)):
if blank_pattern.match(text[i]) is not None:
continue
index_mapper[len(new_text)] = i
new_text.append(text[i])
return ''.join(new_text), index_mapper
def process(self, text)->Dict[str, Any]:
return_obj = {
"name": [],
"age": [],
"gender": [],
"phone": [],
"email": [],
"schools": [],
"work_time": 0,
"edus": [],
"jobs": [],
"titles": []
}
# 获取名字,先过滤所有空白字符,防止名字中间有空格
remove_blanks_text, index_mapper = self.remove_blanks(text, re.compile(r' '))
start_time = time.perf_counter()
for block in self.split_to_blocks(remove_blanks_text):
block_text,block_l = block['text'],block['start']
entities = self.pipelines['name'](block_text)
for entity in entities:
if entity['entity']=='NAME' and self.name_pattern.match(entity['word']) is not None:
obj = {
'start': index_mapper[block_l+entity['start']],
'end': index_mapper[block_l+entity['end']-1]+1,
'entity': 'NAME',
'text': entity['word']
}
repeat = False
for o in return_obj['name']:
if obj['start']==o['start'] and obj['end']==o['end']:
repeat = True
break
if not repeat:
obj['origin'] = text[obj['start']:obj['end']]
return_obj['name'].append(obj)
end_time = time.perf_counter()
self.logger.info(f"process name time: {end_time-start_time}")
# 获取年龄
start_time = time.perf_counter()
for age_match in self.age_patterns[0].finditer(remove_blanks_text):
age = None
s,e = -1,-1
if age_match.group(1) is not None:
age = age_match.group(1)
s,e = age_match.span(1)
elif age_match.group(2) is not None:
age = age_match.group(2)
s,e = age_match.span(2)
if age is not None:
return_obj['age'].append({
'start': index_mapper[s],
'end': index_mapper[e-1]+1,
'text': str(age),
'entity': 'AGE',
'origin': text[index_mapper[s]:index_mapper[e-1]+1]
})
for age_match in self.age_patterns[1].finditer(remove_blanks_text):
age = None
s,e = -1,-1
year = age_match.group(2)
if year is not None:
year = int(year)
month = age_match.group(4)
if month is not None:
month = int(month)
else:
month = 1
day = age_match.group(6)
if day is not None:
day = int(day)
else:
day = 1
age = date.today().year - year
if date.today().month<month or (date.today().month==month and date.today().day<day):
age -= 1
if age is not None:
s,e = age_match.span(1)
return_obj['age'].append({
'start': index_mapper[s],
'end': index_mapper[e-1]+1,
'text': str(age),
'entity': 'AGE',
'origin': text[index_mapper[s]:index_mapper[e-1]+1]
})
end_time = time.perf_counter()
self.logger.info(f"process age time: {end_time-start_time}")
start_time = time.perf_counter()
# 获取学校
for school_match in self.school_pattern.finditer(remove_blanks_text):
start,end = school_match.span()
expand_text, start, end = self.get_expand_span(remove_blanks_text, start, end)
entities = self.pipelines['common'](expand_text)
for entity in entities:
if entity['entity']=="ORG" and self.school_pattern.search(entity['word']) is not None:
obj = {
'start': index_mapper[start+entity['start']],
'end': index_mapper[start+entity['end']-1]+1,
'entity': 'SCHOOL'
}
for school in self.schools:
if school in entity['word']:
obj['text'] = school
obj["level"] = self.schools[school]
break
repeat = False
for o in return_obj['schools']:
if obj['start']==o['start'] and obj['end']==o['end']:
repeat = True
break
if not repeat:
obj['origin'] = text[obj['start']:obj['end']]
return_obj['schools'].append(obj)
# 正则找学校
for school_match in re.finditer(r"|".join(self.schools.keys()), remove_blanks_text):
start,end = school_match.span()
obj = {
'start': index_mapper[start],
'end': index_mapper[end-1]+1,
'entity': 'SCHOOL',
'text': school_match.group(),
}
repeat = False
for o in return_obj['schools']:
if obj['start']==o['start'] and obj['end']==o['end']:
repeat = True
break
if not repeat:
obj['origin'] = text[obj['start']:obj['end']]
obj['level'] = self.schools[obj['text']]
return_obj['schools'].append(obj)
end_time = time.perf_counter()
self.logger.info(f"process school time: {end_time-start_time}")
start_time = time.perf_counter()
# 获取学历
for i,pattern in enumerate(self.edu_patterns):
for edu_match in pattern.finditer(remove_blanks_text):
start,end = edu_match.span()
expand_text, start, end = self.get_expand_span(remove_blanks_text, start, end)
entities = self.pipelines['common'](expand_text)
for entity in entities:
if entity['entity']=='EDU' and pattern.search(entity['word']) is not None:
obj = {
'start': index_mapper[start+entity['start']],
'end': index_mapper[start+entity['end']-1]+1,
'text': self.edu_index[i],
'entity': 'EDU',
}
repeat = False
for o in return_obj['edus']:
if obj['start']==o['start'] and obj['end']==o['end']:
repeat = True
break
if not repeat:
obj['origin'] = text[obj['start']:obj['end']]
return_obj['edus'].append(obj)
end_time = time.perf_counter()
self.logger.info(f"process edu time: {end_time-start_time}")
start_time = time.perf_counter()
# 如果有工作经历
if self.works_key_pattern.search(remove_blanks_text) is not None:
for job_time_match in self.job_time_patterns.finditer(remove_blanks_text):
origin_start,origin_end = job_time_match.span()
# convert_to_date
fr = self.to_date(job_time_match.group(1))
if fr is None:
continue
fs,fe = job_time_match.span(1)
to = self.to_date(job_time_match.group(3))
if to is None:
continue
ts,te = job_time_match.span(3)
expand_text, start, end = self.get_expand_span(remove_blanks_text, origin_start, origin_end, max_expand_length=50)
entities = self.pipelines['common'](expand_text)
objs = []
for entity in entities:
if entity['entity']=="ORG":
obj = {
'start': index_mapper[start+entity['start']],
'end': index_mapper[start+entity['end']-1]+1,
'entity': 'COMPANY',
'text': entity['word'],
'dis': min(
abs(origin_start-start-entity['end']+1),
abs(origin_end-start-entity['start'])
),
}
obj['origin'] = text[obj['start']:obj['end']]
objs.append(obj)
objs.sort(key=lambda x:x['dis'])
if len(objs)>0 and self.school_pattern.search(objs[0]['text']) is None:
del objs[0]['dis']
from_date = {
'start': index_mapper[fs],
'end': index_mapper[fe-1]+1,
'text': fr.isoformat(),
'entity': 'DATE',
'origin': text[index_mapper[fs]:index_mapper[fe-1]+1]
}
to_date = {
'start': index_mapper[ts],
'end': index_mapper[te-1]+1,
'text': to.isoformat(),
'entity': 'DATE',
'origin': text[index_mapper[ts]:index_mapper[te-1]+1]
}
jobs = [objs[0],from_date,to_date]
return_obj['jobs'].append(jobs)
return_obj["jobs"].sort(key=lambda x:date.fromisoformat(x[1]['text']))
# 计算工作时间
last_end = None
work_month = 0
for i in range(0,len(return_obj["jobs"])):
start = date.fromisoformat(return_obj["jobs"][i][1]['text'])
end = date.fromisoformat(return_obj["jobs"][i][2]['text'])
if last_end is not None and start<last_end:
start = last_end
diff_y = end.year-start.year
diff_m = end.month-start.month
work_month += diff_y * 12 + diff_m
last_end = end
return_obj['work_time'] = math.ceil(work_month/12)
end_time = time.perf_counter()
self.logger.info(f"process work time: {end_time-start_time}")
start_time = time.perf_counter()
# 获取手机号码
for phone_match in self.phone_pattern.finditer(text):
start,end = phone_match.span()
return_obj['phone'].append({
'start': start,
'end': end,
'entity': 'PHONE',
'origin': text[start:end],
'text': re.sub('\s','',text[start:end])
})
end_time = time.perf_counter()
self.logger.info(f"process phone time: {end_time-start_time}")
start_time = time.perf_counter()
for email_match in self.email_pattern.finditer(text):
start,end = email_match.span()
return_obj['email'].append({
'start': start,
'end': end,
'entity': 'EMAIL',
'origin': text[start:end],
'text': re.sub('\s','',text[start:end])
})
end_time = time.perf_counter()
self.logger.info(f"process email time: {end_time-start_time}")
start_time = time.perf_counter()
for gender_match in self.gender_pattern.finditer(text):
start,end = gender_match.span(2)
return_obj['gender'].append({
'start': start,
'end': end,
'entity': 'GENDER',
'word': text[start:end],
'text': text[start:end]
})
end_time = time.perf_counter()
self.logger.info(f"process gender time: {end_time-start_time}")
start_time = time.perf_counter()
for block in self.split_to_blocks(remove_blanks_text):
entities = self.pipelines["common"](block["text"])
for entity in entities:
if entity['entity']=='TITLE':
obj = {
'start': index_mapper[block['start']+entity['start']],
'end': index_mapper[block['start']+entity['end']-1]+1,
'text': entity['word'],
'entity': 'TITLE',
}
obj['origin'] = text[obj['start']:obj['end']]
repeat = False
for o in return_obj['titles']:
if obj['start']==o['start'] and obj['end']==o['end']:
repeat = True
break
if not repeat:
return_obj['titles'].append(obj)
end_time = time.perf_counter()
self.logger.info(f"process title time: {end_time-start_time}")
return return_obj
def __call__(self, *args: Any, **kwds: Any) -> Any:
return self.process(*args, **kwds)