File size: 5,289 Bytes
5953ef9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
from rex.utils.io import load_jsonlines
# Keys permitted in ``instance["schema"]`` and ``instance["ans"]`` respectively.
_SCHEMA_TASKS = ("cls", "ent", "rel", "event")
_ANSWER_TASKS = ("cls", "ent", "rel", "event", "span")


def _require(condition, message):
    """Raise ``AssertionError(message)`` unless ``condition`` is truthy.

    Used instead of the ``assert`` statement so that the checks are still
    executed when Python runs with ``-O`` (which strips ``assert``).
    """
    if not condition:
        raise AssertionError(message)


def _is_list_of(value, item_type):
    """Return True when ``value`` is a list whose items are all ``item_type``."""
    return isinstance(value, list) and all(isinstance(x, item_type) for x in value)


def _check_mention(text, mention, where):
    """Require ``mention["text"]`` to equal ``text`` sliced by ``mention["span"]``."""
    span = mention["span"]
    _require(
        isinstance(mention["text"], str)
        and text[span[0] : span[1]] == mention["text"],
        f"{where}: 'text' must be a str equal to the instance text sliced by 'span'",
    )


def _check_schema(schema):
    """Validate ``schema``: a list of str per task, a dict of str lists for events."""
    _require(isinstance(schema, dict), "'schema' must be a dict")
    for task, labels in schema.items():
        _require(task in _SCHEMA_TASKS, f"unexpected schema key {task!r}")
        if task == "event":
            _require(isinstance(labels, dict), "schema['event'] must be a dict")
            for event_type, roles in labels.items():
                _require(
                    _is_list_of(roles, str),
                    f"schema['event'][{event_type!r}] must be a list of str",
                )
        else:
            _require(
                _is_list_of(labels, str), f"schema[{task!r}] must be a list of str"
            )


def _check_entity(ent, schema, text, where):
    """Validate one ``ans["ent"]`` item: known type, well-formed span, matching text."""
    _require(
        isinstance(ent["type"], str) and ent["type"] in schema["ent"],
        f"{where}: 'type' must be a str listed in schema['ent']",
    )
    span = ent["span"]
    # The span shape is verified *before* it is used to slice the text, so a
    # malformed span is reported as a validation failure, not a TypeError.
    _require(
        isinstance(span, list)
        and len(span) == 2
        and all(isinstance(x, int) for x in span),
        f"{where}: 'span' must be a list of two ints",
    )
    _check_mention(text, ent, where)


def _check_relation(rel, schema, text, where):
    """Validate one ``ans["rel"]`` item: known relation, matching head and tail."""
    _require(
        isinstance(rel["relation"], str) and rel["relation"] in schema["rel"],
        f"{where}: 'relation' must be a str listed in schema['rel']",
    )
    for side in ("head", "tail"):
        _require(isinstance(rel[side], dict), f"{where}: {side!r} must be a dict")
        _check_mention(text, rel[side], f"{where}[{side!r}]")


def _check_event(event, schema, text, where):
    """Validate one ``ans["event"]`` item: known type, matching trigger and args."""
    _require(
        event["event_type"] in schema["event"],
        f"{where}: 'event_type' must be a key of schema['event']",
    )
    _require(isinstance(event["trigger"], dict), f"{where}: 'trigger' must be a dict")
    # Slice equality already implies the trigger text occurs in the instance
    # text, so no separate containment test is needed.
    _check_mention(text, event["trigger"], f"{where}['trigger']")
    roles = schema["event"][event["event_type"]]
    for index, arg in enumerate(event["args"]):
        arg_where = f"{where}['args'][{index}]"
        _require(
            arg["role"] in roles,
            f"{arg_where}: 'role' must be listed in the schema for this event type",
        )
        _check_mention(text, arg, arg_where)


def _check_span_item(item, schema, text, where):
    """Validate one ``ans["span"]`` item: its text must match its span."""
    _check_mention(text, item, where)


# Per-item validators for every answer task whose items are dicts.
_ANSWER_ITEM_CHECKS = {
    "ent": _check_entity,
    "rel": _check_relation,
    "event": _check_event,
    "span": _check_span_item,
}


def _check_answers(answers, schema, text):
    """Validate ``ans``: allowed keys, list container types, and every item."""
    _require(isinstance(answers, dict), "'ans' must be a dict")
    for task, items in answers.items():
        _require(task in _ANSWER_TASKS, f"unexpected ans key {task!r}")
        if task == "cls":
            _require(_is_list_of(items, str), "ans['cls'] must be a list of str")
            continue
        _require(_is_list_of(items, dict), f"ans[{task!r}] must be a list of dict")
        check_item = _ANSWER_ITEM_CHECKS[task]
        for index, item in enumerate(items):
            check_item(item, schema, text, f"ans[{task!r}][{index}]")


def check_udi_instance(instance: dict):
    """Validate the structure of one instance dict, raising on the first problem.

    The instance must provide the keys ``id``, ``instruction``, ``text`` and
    ``bg`` (all ``str``), ``schema`` (``dict``) and ``ans`` (``dict``).

    * ``schema`` may only use the keys ``cls``, ``ent``, ``rel`` (lists of
      ``str``) and ``event`` (dict mapping an event type to a list of ``str``).
    * ``ans`` may only use the keys ``cls`` (list of ``str``) and ``ent``,
      ``rel``, ``event``, ``span`` (lists of ``dict``). Every item carrying a
      ``text``/``span`` pair must satisfy
      ``instance["text"][span[0]:span[1]] == text``; entity types, relation
      names, event types and argument roles must appear in ``schema``.
    * ``text`` must be non-empty when ``schema`` has a non-empty ``ent``,
      ``rel`` or ``event`` entry, or when ``ans`` contains ``span``.
    * At least one of ``instruction``, ``text`` and ``bg`` must be non-empty.

    Args:
        instance: The instance to validate.

    Returns:
        None. The function returns normally only for a valid instance.

    Raises:
        AssertionError: A structural rule is violated. It is raised explicitly
            rather than via ``assert`` so validation also runs under ``-O``.
        KeyError: A required key is missing.
        TypeError, IndexError: A value cannot be indexed or sliced as required
            (for example a span with fewer than two elements).
    """
    _require(isinstance(instance["id"], str), "'id' must be a str")
    _require(isinstance(instance["instruction"], str), "'instruction' must be a str")

    # The text is sliced by every span check below, so verify its type first.
    text = instance["text"]
    _require(isinstance(text, str), "'text' must be a str")
    _require(isinstance(instance["bg"], str), "'bg' must be a str")

    schema = instance["schema"]
    answers = instance["ans"]
    _check_schema(schema)
    _check_answers(answers, schema, text)

    needs_text = "span" in answers or any(
        schema.get(task) for task in ("ent", "rel", "event")
    )
    if needs_text:
        _require(len(text) > 0, "'text' must be non-empty for span-based tasks")
    _require(
        instance["instruction"] or text or instance["bg"],
        "'instruction', 'text' and 'bg' must not all be empty",
    )
def is_valid_udi_instance(instance: dict) -> bool:
    """Report whether ``instance`` passes ``check_udi_instance``.

    Args:
        instance: The instance to validate.

    Returns:
        True when ``check_udi_instance(instance)`` returns normally, False
        when it raises any ``Exception`` subclass (failed check, missing key,
        wrong type, ...).
    """
    try:
        check_udi_instance(instance)
    except Exception:
        # Any ordinary error means the instance is malformed. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt and SystemExit propagate so
        # a long validation run can still be interrupted.
        return False
    return True
def main(filepaths=None):
    """Print the path of every file that contains an invalid instance.

    Each path is loaded with ``load_jsonlines`` and its records are checked
    with ``is_valid_udi_instance``; checking of a file stops at its first
    invalid record.

    Args:
        filepaths: Optional iterable of paths to check. Defaults to no files,
            which matches the previous hard-coded empty list.
            NOTE(review): records are presumably dicts, one per JSON line;
            confirm against ``load_jsonlines``.

    Returns:
        None. Offending paths are written to standard output.
    """
    if filepaths is None:
        filepaths = []
    for filepath in filepaths:
        records = load_jsonlines(filepath)
        # all() short-circuits on the first invalid record, like the old break.
        if not all(is_valid_udi_instance(record) for record in records):
            print(filepath)
# Script entry point: run main() only when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|