|
from numpy import string_ |
|
import re |
|
|
|
en_to_ar_camel = { |
|
'B-LOC' : 'مكان', |
|
'B-ORG': 'مؤسسة', |
|
'B-PERS': 'شخص', |
|
'B-MISC': 'معنى بموضوعات متنوعة', |
|
'I-LOC': 'مكان', |
|
'I-ORG': 'مؤسسة', |
|
'I-PERS': 'شحص', |
|
'I-MISC': 'معنى بموضوعات متنوعة', |
|
} |
|
|
|
en_to_ar = { |
|
"B-Artist" : "فنان", |
|
"I-Artist" :"فنان", |
|
"B-Sound": "صوت", |
|
"I-Sound":"صوت", |
|
"B-Educational": "تعليمي", |
|
"I-Educational":"تعليمي", |
|
"B-Building-Grounds":"أراضي البناء", |
|
"I-Building-Grounds":"أراضي البناء", |
|
"B-Population-Center":"مركز سكني", |
|
"B-Nation":"شعب(أمة)", |
|
"B-State-or-Province":"ولاية أو مقاطعة", |
|
"I-State-or-Province": "ولاية أو مقاطعة", |
|
"B-Water-Body": "مسطح مائي", |
|
"I-Water-Body":"مسطح مائي", |
|
"B-Land-Region-Natural": "أرض طبيعية", |
|
"I-Land-Region-Natural":"أرض طبيعية", |
|
"B-Software":"سوفتوير(برمجيات)", |
|
"I-Software":"سوفتوير(برمجيات)", |
|
"B-Scientist": "عالم", |
|
"B-Book":"كتاب", |
|
"I-Book":"كتاب", |
|
"I-Scientist":"عالم", |
|
"B-Group":"مجموعة", |
|
"B-Celestial":"سماوي", |
|
"B-Police":"شرطة", |
|
"I-Police":"شرطة", |
|
"I-Population-Center":"مركز سكني", |
|
"I-Celestial":"سماوي", |
|
"B-Engineer":"مهندس", |
|
"I-Engineer":"مهندس", |
|
"B-Projectile":"قذيفة", |
|
"B-Government":"حكومة", |
|
"I-Government":"حكومة", |
|
"B-Commercial":"تجاري", |
|
"I-Commercial":"تجاري", |
|
"B-Continent":"قارة", |
|
"B-Air":"هواء", |
|
"I-Air":"هواء", |
|
"B-Other_PER":"شخص", |
|
"I-Other_PER":"شخص", |
|
"I-Group":"مجموعة", |
|
"B-Politician":"سياسي", |
|
"I-Politician":"سياسي", |
|
"B-Athlete":"رياضي", |
|
"I-Athlete":"رياضي", |
|
"B-Religious_ORG":"مؤسسة دينية", |
|
"I-Religious_ORG":"مؤسسة دينية", |
|
"B-Path":"طريق", |
|
"I-Path":"طريق", |
|
"B-Media":"إعلام", |
|
"I-Media":"إعلام", |
|
"B-Non-Governmental":"غير حكومي", |
|
"I-Non-Governmental":"غير حكومي", |
|
"B-County-or-District":"مدينة أو ضاحية", |
|
"I-County-or-District":"مدينة أو ضاحية", |
|
"B-Businessperson":"رجل أعمال", |
|
"B-Lawyer":"محامي", |
|
"I-Lawyer":"محامي", |
|
"B-GPE-Cluster":"", |
|
"I-GPE-Cluster":"", |
|
"I-Nation":"شعب(أمة)", |
|
"B-Religious_PER":"شخص ديني", |
|
"I-Religious_PER":"شخص ديني", |
|
"I-Businessperson":"رجل أعمال", |
|
"B-Medical-Science":"علوم طبية", |
|
"I-Medical-Science":"علوم طبية", |
|
"B-Movie":"فيلم", |
|
"I-Movie":"فيلم", |
|
"B-Water":"ماء", |
|
"I-Water":"ماء", |
|
"B-Drug":"دواء", |
|
"B-Hardware":"عتاد", |
|
"I-Hardware":"عتاد", |
|
"B-Subarea-Facility":"منشأة منطقة فرعية", |
|
"I-Subarea-Facility":"منشأة منطقة فرعية", |
|
"B-Blunt":"فظ", |
|
"B-Airport":"مطار", |
|
"I-Blunt": "فظ", |
|
"I-Drug":"دواء", |
|
"B-Sports":"رياضة", |
|
"I-Sports":"رياضة", |
|
"B-Shooting":"رماية", |
|
"I-Shooting":"رماية", |
|
"B-Food":"طعام", |
|
"I-Food":"طعام", |
|
"I-Continent":"قارة", |
|
"B-Nuclear":"نووي", |
|
"I-Nuclear":"نووي", |
|
"B-Entertainment":"ترفيه", |
|
"I-Entertainment":"ترفيه", |
|
"I-Projectile":"قذيفة", |
|
"B-Land":"أرض", |
|
"B-Sharp":"حاد", |
|
"I-Airport":"مطار", |
|
"I-Land":"أرض", |
|
"B-Plant":"نبات", |
|
"I-Plant":"نبات", |
|
"B-Exploding":"منفجر", |
|
"I-Exploding":"منفجر", |
|
"B-Chemical":"كيميائي", |
|
"I-Chemical": "كيميائي", |
|
} |
|
|
|
|
|
|
|
|
|
def get_separate_entities(labels, tokens): |
|
""" |
|
takes labels and token , return full name entity (mohamed, salah --> "mohamed salah") |
|
this will be used to search in wikipedia |
|
""" |
|
res = [] |
|
b_before = False |
|
temp = "" |
|
key_value = () |
|
for i in range(len(labels)): |
|
print(res) |
|
curr = labels[i] |
|
|
|
if("B-" in curr): |
|
if(b_before): |
|
key_value = (temp[:-1], 1) |
|
res.append(key_value) |
|
temp = tokens[i] + ' ' |
|
else: |
|
b_before = True |
|
temp += tokens[i] + ' ' |
|
if(i == len(labels)-1): |
|
key_value = (temp[:-1], 1) |
|
res.append(key_value) |
|
|
|
|
|
elif("I-" in curr): |
|
temp += tokens[i] + ' ' |
|
if(i == len(labels)-1): |
|
key_value = (temp[:-1], 1) |
|
res.append(key_value) |
|
|
|
else: |
|
if(temp == ""): |
|
key_value = (tokens[i], 0) |
|
res.append(key_value) |
|
else: |
|
key_value = (temp[:-1], 1) |
|
res.append(key_value) |
|
key_value = (tokens[i], 0) |
|
res.append(key_value) |
|
temp = "" |
|
b_before = False |
|
|
|
|
|
|
|
print(res) |
|
return res |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|