import re  # noqa: F401  -- not used in the visible code; kept in case other code relies on it

try:
    from numpy import string_  # noqa: F401  -- not used in the visible code; kept for the same reason
except ImportError:
    # NumPy 2.0 removed the ``string_`` alias; ``bytes_`` is the same type.
    from numpy import bytes_ as string_  # noqa: F401


# English BIO tag -> Arabic display text for the four coarse entity types.
# NOTE(review): "camel" presumably refers to the tag set of a CAMeL-style NER
# model -- confirm against the caller.
en_to_ar_camel = {
    "B-LOC": "مكان",
    "B-ORG": "مؤسسة",
    "B-PERS": "شخص",
    "B-MISC": "معنى بموضوعات متنوعة",
    "I-LOC": "مكان",
    "I-ORG": "مؤسسة",
    "I-PERS": "شخص",
    "I-MISC": "معنى بموضوعات متنوعة",
}

# English BIO tag -> Arabic display text for the fine-grained entity types.
# Not every type has both a B- and an I- entry, and the GPE-Cluster tags map
# to an empty string.
en_to_ar = {
    "B-Artist": "فنان",
    "I-Artist": "فنان",
    "B-Sound": "صوت",
    "I-Sound": "صوت",
    "B-Educational": "تعليمي",
    "I-Educational": "تعليمي",
    "B-Building-Grounds": "أراضي البناء",
    "I-Building-Grounds": "أراضي البناء",
    "B-Population-Center": "مركز سكني",
    "B-Nation": "شعب(أمة)",
    "B-State-or-Province": "ولاية أو مقاطعة",
    "I-State-or-Province": "ولاية أو مقاطعة",
    "B-Water-Body": "مسطح مائي",
    "I-Water-Body": "مسطح مائي",
    "B-Land-Region-Natural": "أرض طبيعية",
    "I-Land-Region-Natural": "أرض طبيعية",
    "B-Software": "سوفتوير(برمجيات)",
    "I-Software": "سوفتوير(برمجيات)",
    "B-Scientist": "عالم",
    "B-Book": "كتاب",
    "I-Book": "كتاب",
    "I-Scientist": "عالم",
    "B-Group": "مجموعة",
    "B-Celestial": "سماوي",
    "B-Police": "شرطة",
    "I-Police": "شرطة",
    "I-Population-Center": "مركز سكني",
    "I-Celestial": "سماوي",
    "B-Engineer": "مهندس",
    "I-Engineer": "مهندس",
    "B-Projectile": "قذيفة",
    "B-Government": "حكومة",
    "I-Government": "حكومة",
    "B-Commercial": "تجاري",
    "I-Commercial": "تجاري",
    "B-Continent": "قارة",
    "B-Air": "هواء",
    "I-Air": "هواء",
    "B-Other_PER": "شخص",
    "I-Other_PER": "شخص",
    "I-Group": "مجموعة",
    "B-Politician": "سياسي",
    "I-Politician": "سياسي",
    "B-Athlete": "رياضي",
    "I-Athlete": "رياضي",
    "B-Religious_ORG": "مؤسسة دينية",
    "I-Religious_ORG": "مؤسسة دينية",
    "B-Path": "طريق",
    "I-Path": "طريق",
    "B-Media": "إعلام",
    "I-Media": "إعلام",
    "B-Non-Governmental": "غير حكومي",
    "I-Non-Governmental": "غير حكومي",
    "B-County-or-District": "مدينة أو ضاحية",
    "I-County-or-District": "مدينة أو ضاحية",
    "B-Businessperson": "رجل أعمال",
    "B-Lawyer": "محامي",
    "I-Lawyer": "محامي",
    "B-GPE-Cluster": "",
    "I-GPE-Cluster": "",
    "I-Nation": "شعب(أمة)",
    "B-Religious_PER": "شخص ديني",
    "I-Religious_PER": "شخص ديني",
    "I-Businessperson": "رجل أعمال",
    "B-Medical-Science": "علوم طبية",
    "I-Medical-Science": "علوم طبية",
    "B-Movie": "فيلم",
    "I-Movie": "فيلم",
    "B-Water": "ماء",
    "I-Water": "ماء",
    "B-Drug": "دواء",
    "B-Hardware": "عتاد",
    "I-Hardware": "عتاد",
    "B-Subarea-Facility": "منشأة منطقة فرعية",
    "I-Subarea-Facility": "منشأة منطقة فرعية",
    "B-Blunt": "فظ",
    "B-Airport": "مطار",
    "I-Blunt": "فظ",
    "I-Drug": "دواء",
    "B-Sports": "رياضة",
    "I-Sports": "رياضة",
    "B-Shooting": "رماية",
    "I-Shooting": "رماية",
    "B-Food": "طعام",
    "I-Food": "طعام",
    "I-Continent": "قارة",
    "B-Nuclear": "نووي",
    "I-Nuclear": "نووي",
    "B-Entertainment": "ترفيه",
    "I-Entertainment": "ترفيه",
    "I-Projectile": "قذيفة",
    "B-Land": "أرض",
    "B-Sharp": "حاد",
    "I-Airport": "مطار",
    "I-Land": "أرض",
    "B-Plant": "نبات",
    "I-Plant": "نبات",
    "B-Exploding": "منفجر",
    "I-Exploding": "منفجر",
    "B-Chemical": "كيميائي",
    "I-Chemical": "كيميائي",
}


def get_separate_entities(labels, tokens):
    """Group BIO-tagged tokens into whole entity strings.

    Consecutive tokens that belong to one entity are joined with a single
    space, e.g. ("mohamed", "salah") -> "mohamed salah", so the full name can
    be used as one search term (the original author's note says the result is
    used to search Wikipedia).

    Args:
        labels: Sequence of tag strings, one per token. A tag containing
            "B-" starts a new entity, a tag containing "I-" continues the
            current one, and any other tag marks a non-entity token.
        tokens: Sequence of token strings, indexed in parallel with
            ``labels``. Tokens beyond ``len(labels)`` are ignored.

    Returns:
        A list of ``(text, flag)`` tuples in input order. ``flag`` is 1 for
        a (possibly multi-token) entity and 0 for a single non-entity token.

    Raises:
        IndexError: If ``tokens`` is shorter than ``labels``.
    """
    res = []
    entity_tokens = []  # tokens of the entity currently being assembled

    for i, label in enumerate(labels):
        token = tokens[i]
        if "B-" in label:
            # A B- tag always opens a new entity, so emit whatever is
            # pending first -- including a run of I- tags that had no
            # leading B- tag, which must not be merged into this entity.
            if entity_tokens:
                res.append((" ".join(entity_tokens), 1))
            entity_tokens = [token]
        elif "I-" in label:
            # Continuation; an I- tag with nothing pending starts an entity.
            entity_tokens.append(token)
        else:
            # Non-entity token: close the pending entity, then emit the token.
            if entity_tokens:
                res.append((" ".join(entity_tokens), 1))
                entity_tokens = []
            res.append((token, 0))

    # The input may end in the middle of an entity.
    if entity_tokens:
        res.append((" ".join(entity_tokens), 1))

    return res