|
""" |
|
Code copied from AGXNet: |
|
https://github.com/batmanlab/AGXNet |
|
""" |
|
|
|
import argparse |
|
import pandas as pd |
|
from tqdm import tqdm |
|
import spacy |
|
|
|
# spaCy English pipeline, loaded once at import time; obs_lemmatization() uses
# it to tokenize and lemmatize observation strings.
sp = spacy.load("en_core_web_sm")
|
|
|
# Command-line interface of the script: where the itemized RadGraph CSV is
# read from and where the sentence-level parsed relations are written to.
# NOTE(review): both defaults start with a "/PROJECT DIR" placeholder --
# presumably callers are expected to override them; confirm before relying
# on the defaults.
parser = argparse.ArgumentParser(description="Parse RadGraph Relations.")

parser.add_argument(
    "--input-path",
    default="/PROJECT DIR/preprocessing/mimic-cxr-radgraph-itemized.csv",
    help="Itemized input data path.",
)
parser.add_argument(
    "--output-path",
    default="/PROJECT DIR/preprocessing/mimic-cxr-radgraph-sentence-parsed.csv",
    help="Output path for parsed relations.",
)
|
|
|
|
|
def obs_lemmatization(x):
    """
    Reduce every word of an observation to its lemma.
    Args:
        x: an observation token; it is converted with str() before parsing
    Return:
        the lemmas of all parsed words, joined by single spaces
    """
    parsed = sp(str(x))
    return " ".join(tok.lemma_ for tok in parsed)
|
|
|
|
|
def _linked_obs_tokens(df_sen, source, relation):
    """
    Join the OBS-DP tokens that point at an observation through a relation.
    Args:
        df_sen: itemized RadGraph rows of a single sentence
        source: ``source`` key of the observation being described
        relation: relation name to follow ("modify" or "suggestive_of")
    Return:
        lower-cased tokens joined with "|", or None if no OBS-DP row matches
    """
    linked = (
        (df_sen["target"] == source)
        & (df_sen["relation"] == relation)
        & (df_sen["label"] == "OBS-DP")
    )
    tokens = [token.lower() for token in df_sen.loc[linked, "token"]]
    return "|".join(tokens) if tokens else None


def _observation_records(df_sen, df_obs, anatomy):
    """
    Describe every observation row of ``df_obs`` for the given anatomy.
    Args:
        df_sen: itemized RadGraph rows of the sentence (searched for the
            "modify" / "suggestive_of" rows that target each observation)
        df_obs: the observation rows of ``df_sen`` to describe
        anatomy: anatomy string attached to every returned record
    Return:
        list of (anatomy, observation, label, obs_modify, obs_suggestive)
    """
    return [
        (
            anatomy,
            token.lower(),
            label,
            _linked_obs_tokens(df_sen, source, "modify"),
            _linked_obs_tokens(df_sen, source, "suggestive_of"),
        )
        for token, label, source in zip(
            df_obs["token"], df_obs["label"], df_obs["source"]
        )
    ]


def _parse_sentence(sid, si, df_sen):
    """
    Turn the itemized rows of one sentence into output tuples.
    Args:
        sid: study id the sentence belongs to
        si: sentence index inside the study
        df_sen: itemized RadGraph rows of that sentence
    Return:
        list of (study_id, sen_id, sentence, anatomy, observation, label,
        obs_modify, obs_suggestive) tuples
    """
    label = df_sen["label"]
    target = df_sen["target"]
    is_obs = label.isin(["OBS-DA", "OBS-DP", "OBS-U"])
    is_anatomy = label == "ANAT-DP"

    # Root anatomies are ANAT-DP rows that do not point at another entity.
    df_roots = df_sen[is_anatomy & target.isnull()]

    if df_roots.empty:
        # No root anatomy in the sentence: report its target-less
        # observations under the "unspecified" anatomy, or one empty record.
        sen = df_sen["sentence"].iloc[0]
        records = _observation_records(
            df_sen, df_sen[is_obs & target.isnull()], "unspecified"
        )
        if not records:
            records = [("unspecified", None, None, None, None)]
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output does not depend on string hash randomisation.
        return list(dict.fromkeys((sid, si, sen) + rec for rec in records))

    # Observations with neither target nor relation do not depend on the
    # anatomy being processed, so they are computed once per sentence.
    df_free = df_sen[is_obs & target.isna() & df_sen["relation"].isna()]
    free_records = _observation_records(df_sen, df_free, "unspecified")

    tuples = []
    for _, row_a in df_roots.iterrows():
        sen = row_a.sentence
        source_key = row_a.source

        # Anatomy rows targeting the root refine it; their tokens are joined
        # to the root token and their keys also anchor observations.
        df_sub = df_sen[is_anatomy & (target == source_key)]
        anatomy = "|".join(
            [row_a["token"].lower()] + [tok.lower() for tok in df_sub["token"]]
        )
        anatomy_source_keys = [source_key] + list(df_sub["source"])

        # Observations located at the root anatomy or one of its refinements.
        df_located = df_sen[
            is_obs
            & target.isin(anatomy_source_keys)
            & (df_sen["relation"] == "located_at")
        ]
        records = _observation_records(df_sen, df_located, anatomy)
        if not records:
            records = [(anatomy, None, None, None, None)]

        # NOTE(review): the "unspecified" records are appended for every root
        # anatomy and duplicates are only removed within one anatomy, so a
        # sentence with several root anatomies repeats them. Kept as in the
        # original implementation -- confirm whether that is intended.
        records = records + free_records
        tuples.extend(dict.fromkeys((sid, si, sen) + rec for rec in records))

    return tuples


def radgraph_parse(args):
    """
    Parse RadGraph relations into one row per (sentence, anatomy, observation).

    Reads the itemized CSV (columns used: study_id, sentence_ix, sentence,
    label, target, source, token, relation), pairs anatomies with the
    observations located at them, attaches "modify" / "suggestive_of"
    tokens, adds a lemmatized observation column and writes the result.

    Rows whose study_id or sentence_ix is missing are skipped, because
    groupby drops missing keys.

    Args:
        args: parsed command-line namespace providing ``input_path`` (CSV to
            read) and ``output_path`` (CSV to write)
    Return:
        None; the parsed table is written to ``args.output_path``
    """
    print("Loading itemized RadGraph data...")
    df_itemized = pd.read_csv(args.input_path)

    df_lst = []
    print("Preprocessing sentences...")
    # groupby(sort=False) visits studies and sentences in first-appearance
    # order, without re-scanning the whole frame for every study.
    for sid, df_sid in tqdm(df_itemized.groupby("study_id", sort=False)):
        for si, df_sen in df_sid.groupby("sentence_ix", sort=False):
            df_lst.extend(_parse_sentence(sid, si, df_sen))

    df_anatomy_label = pd.DataFrame(
        df_lst,
        columns=[
            "study_id",
            "sen_id",
            "sentence",
            "anatomy",
            "observation",
            "label",
            "obs_modify",
            "obs_suggestive",
        ],
    )

    print("Lemmatizing observation tokens...")
    # Observations repeat heavily, so each distinct one is lemmatized once.
    # NOTE(review): a missing observation (None) is still passed through
    # obs_lemmatization, which stringifies it -- unchanged from the original.
    lemma_cache = {}
    obs_lemma_lst = []
    for t in tqdm(df_lst):
        obs = t[4]
        if obs not in lemma_cache:
            lemma_cache[obs] = obs_lemmatization(obs)
        obs_lemma_lst.append(lemma_cache[obs])

    df_anatomy_label["obs_lemma"] = obs_lemma_lst
    df_anatomy_label.to_csv(args.output_path, index=False)
    print("Output file has been saved!")
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the command-line options, then run the parse.
    cli_args = parser.parse_args()
    radgraph_parse(cli_args)
|
|