Mirror / src /udi /check.py
Spico's picture
update
5953ef9
from rex.utils.io import load_jsonlines
def _require(condition, message):
    """Raise ``AssertionError(message)`` when *condition* is falsy.

    Used instead of the ``assert`` statement so the checks still run when
    Python is started with ``-O`` (which strips ``assert``).
    """
    if not condition:
        raise AssertionError(message)


def _is_list_of(value, item_type):
    """Return True when *value* is a list whose items are all *item_type*."""
    return isinstance(value, list) and all(isinstance(x, item_type) for x in value)


def _slice_by_span(text, span):
    """Return ``text[span[0]:span[1]]``."""
    return text[span[0] : span[1]]


def check_udi_instance(instance: dict) -> None:
    """Validate the structure of one UDI instance.

    The checks performed are:

    * ``id``, ``instruction``, ``text`` and ``bg`` are strings.
    * ``schema`` is a dict whose keys are among ``cls``/``ent``/``rel``
      (lists of strings) and ``event`` (dict mapping each event type to a
      list of strings).
    * ``ans`` is a dict whose keys are among ``cls``/``ent``/``rel``/
      ``event``/``span``; ``cls`` is a list of strings, the others are
      lists of dicts.
    * Entity types, relation names, event types and argument roles must be
      present in the corresponding ``schema`` entry.
    * Every ``text``/``span`` pair must satisfy
      ``instance["text"][span[0]:span[1]] == text``; an entity span must
      additionally be a list of exactly two ints.
    * ``text`` must be non-empty when the schema has a non-empty ``ent``,
      ``rel`` or ``event`` entry, or when ``ans`` has a ``span`` key.
    * At least one of ``instruction``, ``text`` and ``bg`` is non-empty.

    Args:
        instance: The instance to validate.

    Raises:
        AssertionError: If any of the checks above fails.
        KeyError: If a required key is missing.
    """
    _require(isinstance(instance["id"], str), "'id' must be a str")
    _require(isinstance(instance["instruction"], str), "'instruction' must be a str")

    schema = instance["schema"]
    _require(isinstance(schema, dict), "'schema' must be a dict")
    for key in schema:
        _require(
            key in ("cls", "ent", "rel", "event"), f"unknown schema key: {key!r}"
        )
        if key == "event":
            event_schema = schema[key]
            _require(isinstance(event_schema, dict), "schema 'event' must be a dict")
            for event_type in event_schema:
                _require(
                    _is_list_of(event_schema[event_type], str),
                    f"roles of event type {event_type!r} must be a list of str",
                )
        else:
            _require(
                _is_list_of(schema[key], str),
                f"schema {key!r} must be a list of str",
            )

    answers = instance["ans"]
    _require(isinstance(answers, dict), "'ans' must be a dict")

    # The text is sliced by every span below, so its type is checked first.
    text = instance["text"]
    _require(isinstance(text, str), "'text' must be a str")

    for key in answers:
        _require(
            key in ("cls", "ent", "rel", "event", "span"),
            f"unknown ans key: {key!r}",
        )
        items = answers[key]
        if key == "cls":
            _require(_is_list_of(items, str), "ans 'cls' must be a list of str")
            continue

        _require(_is_list_of(items, dict), f"ans {key!r} must be a list of dict")
        if key == "ent":
            for ent in items:
                _require(
                    isinstance(ent["type"], str) and ent["type"] in schema["ent"],
                    "entity type must be a str listed in schema 'ent'",
                )
                # Check the span's shape before it is used for slicing so a
                # malformed span is reported as a validation failure.
                span = ent["span"]
                _require(
                    isinstance(span, list)
                    and len(span) == 2
                    and all(isinstance(x, int) for x in span),
                    "entity span must be a list of two ints",
                )
                _require(
                    isinstance(ent["text"], str)
                    and _slice_by_span(text, span) == ent["text"],
                    "entity text does not match its span",
                )
        elif key == "rel":
            for rel in items:
                _require(
                    isinstance(rel["relation"], str)
                    and rel["relation"] in schema["rel"],
                    "relation must be a str listed in schema 'rel'",
                )
                for side in ("head", "tail"):
                    arg = rel[side]
                    _require(
                        isinstance(arg, dict)
                        and _slice_by_span(text, arg["span"]) == arg["text"],
                        f"relation {side} text does not match its span",
                    )
        elif key == "event":
            for event in items:
                event_type = event["event_type"]
                _require(
                    event_type in schema["event"],
                    "event type must be listed in schema 'event'",
                )
                trigger = event["trigger"]
                _require(
                    isinstance(trigger, dict)
                    and isinstance(trigger["text"], str)
                    and trigger["text"] in text
                    and _slice_by_span(text, trigger["span"]) == trigger["text"],
                    "event trigger text does not match its span",
                )
                roles = schema["event"][event_type]
                for arg in event["args"]:
                    _require(
                        arg["role"] in roles,
                        "argument role must be listed for its event type",
                    )
                    _require(
                        isinstance(arg["text"], str)
                        and _slice_by_span(text, arg["span"]) == arg["text"],
                        "event argument text does not match its span",
                    )
        else:  # key == "span"
            for item in items:
                _require(
                    isinstance(item["text"], str)
                    and _slice_by_span(text, item["span"]) == item["text"],
                    "span text does not match its span",
                )

    _require(isinstance(instance["bg"], str), "'bg' must be a str")

    # Span-based tasks cannot be answered from an empty text.
    needs_text = "span" in answers or any(
        schema.get(key) for key in ("ent", "rel", "event")
    )
    if needs_text:
        _require(len(text) > 0, "'text' must be non-empty for span-based tasks")

    _require(
        instance["instruction"] or text or instance["bg"],
        "at least one of 'instruction', 'text' and 'bg' must be non-empty",
    )
def is_valid_udi_instance(instance: dict) -> bool:
    """Return whether ``check_udi_instance`` accepts *instance*.

    Args:
        instance: The instance to validate.

    Returns:
        True when ``check_udi_instance(instance)`` completes without raising,
        False when it raises any ``Exception`` (failed check, missing key,
        wrong type, ...).
    """
    try:
        check_udi_instance(instance)
    except Exception:
        # Deliberately broad: any failure means "invalid". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate so a
        # long validation run can still be interrupted.
        return False
    return True
def main(filepaths=None):
    """Print the path of every JSON-lines file that contains an invalid instance.

    Each file is loaded with ``load_jsonlines`` and its instances are checked
    with ``is_valid_udi_instance``; checking of a file stops at the first
    invalid instance.

    Args:
        filepaths: Iterable of file paths to check. Defaults to no files,
            which keeps ``main()`` a no-op as before.
    """
    if filepaths is None:
        filepaths = []
    for filepath in filepaths:
        data = load_jsonlines(filepath)
        # ``all`` short-circuits on the first invalid instance.
        if not all(is_valid_udi_instance(ins) for ins in data):
            print(filepath)
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()