"""
Code copied from AGXNet:
https://github.com/batmanlab/AGXNet
"""

import argparse
import pandas as pd
from tqdm import tqdm
import spacy

sp = spacy.load("en_core_web_sm")
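# Note: the small English model must be installed separately, e.g. via
# `python -m spacy download en_core_web_sm`.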

parser = argparse.ArgumentParser(description="Parse RadGraph relations.")

parser.add_argument(
    "--input-path",
    default="/PROJECT DIR/preprocessing/mimic-cxr-radgraph-itemized.csv",
    help="Itemized input data path.",
)
parser.add_argument(
    "--output-path",
    default="/PROJECT DIR/preprocessing/mimic-cxr-radgraph-sentence-parsed.csv",
    help="Output path for parsed relations.",
)
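
# Hypothetical invocation (the script name and paths below are placeholders, not
# part of the original repo; adjust them to your project layout):
#   python parse_radgraph.py \
#       --input-path /PATH/TO/mimic-cxr-radgraph-itemized.csv \
#       --output-path /PATH/TO/mimic-cxr-radgraph-sentence-parsed.csv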


def obs_lemmatization(x):
    """
    Lemmatize observation
    Args:
        x: a observation token
    Return:
        normalized observation
    """
    w_lst = []
    for word in sp(str(x)):
        w_lst.append(word.lemma_)
    return " ".join(w_lst)


def radgraph_parse(args):
    """Pharse RadGraph relations."""

    print("Loading itemized RadGraph data...")
    df_itemized = pd.read_csv(args.input_path)
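    # Columns assumed to exist in the itemized CSV (inferred from the usage below):
    #   study_id, sentence_ix, sentence, token, label, source, target, relation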

    # get all study_id
    sid_lst = list(df_itemized["study_id"].unique())

    tuple_lst = []
    print("Preprocessing sentences...")
    for sid in tqdm(sid_lst):
        idx_s = df_itemized["study_id"] == sid
        df_sid = df_itemized[idx_s]

        # unique sentence index
        sen_ids = list(df_sid["sentence_ix"].unique())

        for si in sen_ids:
            idx_sen = df_sid["sentence_ix"] == si
            df_sen = df_sid[idx_sen]
            sen = df_sen["sentence"].iloc[0]

            # step 1: select all target anatomy entities (e.g., lobe) with label = ANAT-DP and target = NaN
            idx_a = (df_sen["label"] == "ANAT-DP") & (df_sen["target"].isnull())
            df_a = df_sen[idx_a]

            if sum(idx_a) > 0:
                for _, row_a in df_a.iterrows():
                    anatomy_source_keys = []
                    sen = row_a.sentence
                    source_key = row_a.source

                    # step 2: get detailed target anatomy (e.g., lower left lobe)
                    token_a = [row_a["token"].lower()]
                    anatomy_source_keys.append(source_key)
                    idx_t = (df_sen["label"] == "ANAT-DP") & (
                        df_sen["target"] == source_key
                    )
                    if sum(idx_t) > 0:
                        df_t = df_sen[idx_t]
                        for _, row in df_t.iterrows():
                            token_a += [row["token"].lower()]
                            anatomy_source_keys.append(
                                row["source"]
                            )  # save keys of all anatomy tokens, e.g., lower, left, lobe
                        anatomy = "|".join(token_a)

                    else:
                        anatomy = row_a["token"].lower()

                    # step 3: get observations associated with the target anatomy (e.g., normal, effusion)
                    idx_o = (
                        (df_sen["label"].isin(["OBS-DA", "OBS-DP", "OBS-U"]))
                        & (df_sen["target"].isin(anatomy_source_keys))
                        & (df_sen["relation"] == "located_at")
                    )
                    if sum(idx_o) > 0:
                        df_o = df_sen[idx_o]

                        anatomy_lst = []
                        obs_lst = []
                        label_lst = []
                        obs_modify_lst = []
                        obs_suggestive_lst = []

                        for _, row_o in df_o.iterrows():
                            anatomy_lst.append(anatomy)
                            obs_lst.append(row_o["token"].lower())
                            label_lst.append(row_o["label"])

                            # step 4: get obs modification
                            idx_o_m = (df_sen["target"] == row_o.source) & (
                                df_sen["relation"] == "modify"
                            )
                            obs_modify = None
                            if sum(idx_o_m) > 0:
                                df_o_m = df_sen[idx_o_m]
                                temp_lst = []
                                for _, row_om in df_o_m.iterrows():
                                    # if the modification is present
                                    if row_om.label == "OBS-DP":
                                        temp_lst.append(row_om["token"].lower())
                                if len(temp_lst) > 0:
                                    obs_modify = "|".join(temp_lst)
                            obs_modify_lst.append(obs_modify)

                            # step 5: get suggestive of obs
                            idx_o_s = (df_sen["target"] == row_o.source) & (
                                df_sen["relation"] == "suggestive_of"
                            )
                            obs_suggestive = None
                            if sum(idx_o_s) > 0:
                                df_o_s = df_sen[idx_o_s]
                                temp_lst = []
                                for _, row_os in df_o_s.iterrows():
                                    # keep only suggestive observations that are definitely present (OBS-DP)
                                    if row_os.label == "OBS-DP":
                                        temp_lst.append(row_os["token"].lower())
                                if len(temp_lst) > 0:
                                    obs_suggestive = "|".join(temp_lst)
                            obs_suggestive_lst.append(obs_suggestive)

                    else:
                        anatomy_lst = [anatomy]
                        obs_lst = [None]
                        label_lst = [None]
                        obs_modify_lst = [None]
                        obs_suggestive_lst = [None]

                    # step 6: get observations that are not associated with any anatomy
                    idx_oo = (
                        (df_sen["label"].isin(["OBS-DA", "OBS-DP", "OBS-U"]))
                        & (df_sen["target"].isna())
                        & (df_sen["relation"].isna())
                    )
                    if sum(idx_oo) > 0:
                        df_oo = df_sen[idx_oo]
                        for _, row_oo in df_oo.iterrows():
                            anatomy_lst.append("unspecified")
                            obs_lst.append(row_oo["token"].lower())
                            label_lst.append(row_oo["label"])
                            # obs_modify_lst.append(None)
                            # obs_suggestive_lst.append(None)

                            # get obs modification (as in step 4)
                            idx_o_m = (df_sen["target"] == row_oo.source) & (
                                df_sen["relation"] == "modify"
                            )
                            obs_modify = None
                            if sum(idx_o_m) > 0:
                                df_o_m = df_sen[idx_o_m]
                                temp_lst = []
                                for _, row_om in df_o_m.iterrows():
                                    # if the modification is present
                                    if row_om.label == "OBS-DP":
                                        temp_lst.append(row_om["token"].lower())
                                if len(temp_lst) > 0:
                                    obs_modify = "|".join(temp_lst)
                            obs_modify_lst.append(obs_modify)

                            # get suggestive of obs (as in step 5)
                            idx_o_s = (df_sen["target"] == row_oo.source) & (
                                df_sen["relation"] == "suggestive_of"
                            )
                            obs_suggestive = None
                            if sum(idx_o_s) > 0:
                                df_o_s = df_sen[idx_o_s]
                                temp_lst = []
                                for _, row_os in df_o_s.iterrows():
                                    # keep only suggestive observations that are definitely present (OBS-DP)
                                    if row_os.label == "OBS-DP":
                                        temp_lst.append(row_os["token"].lower())
                                if len(temp_lst) > 0:
                                    obs_suggestive = "|".join(temp_lst)
                            obs_suggestive_lst.append(obs_suggestive)

                    # step 7: create tuple of 8 values (sid, sentence_id, sentence, anatomy, obs, label, obs_modify, obs_suggestive)
                    t_lst = []
                    for i in range(len(obs_lst)):
                        t_lst.append(
                            (
                                sid,
                                si,
                                sen,
                                anatomy_lst[i],
                                obs_lst[i],
                                label_lst[i],
                                obs_modify_lst[i],
                                obs_suggestive_lst[i],
                            )
                        )

                    # remove duplicates caused by 1 obs "located_at" multiple anatomies
                    tuple_lst.append(list(set(t_lst)))

            # if the sentence does not have any ANATOMY token
            else:
                idx_o = (df_sen["label"].isin(["OBS-DA", "OBS-DP", "OBS-U"])) & (
                    df_sen["target"].isnull()
                )
                if sum(idx_o) > 0:
                    df_o = df_sen[idx_o]

                    obs_lst = []
                    label_lst = []
                    obs_modify_lst = []
                    obs_suggestive_lst = []

                    for _, row_o in df_o.iterrows():
                        obs_lst.append(row_o["token"].lower())
                        label_lst.append(row_o["label"])

                        # step 4: get obs modification
                        idx_o_m = (df_sen["target"] == row_o.source) & (
                            df_sen["relation"] == "modify"
                        )
                        obs_modify = None
                        if sum(idx_o_m) > 0:
                            df_o_m = df_sen[idx_o_m]
                            temp_lst = []
                            for _, row_om in df_o_m.iterrows():
                                # if the modification is present
                                if row_om.label == "OBS-DP":
                                    temp_lst.append(row_om["token"].lower())
                            if len(temp_lst) > 0:
                                obs_modify = "|".join(temp_lst)
                        obs_modify_lst.append(obs_modify)

                        # step 5: get suggestive of obs
                        idx_o_s = (df_sen["target"] == row_o.source) & (
                            df_sen["relation"] == "suggestive_of"
                        )
                        obs_suggestive = None
                        if sum(idx_o_s) > 0:
                            df_o_s = df_sen[idx_o_s]
                            temp_lst = []
                            for _, row_os in df_o_s.iterrows():
                                # keep only suggestive observations that are definitely present (OBS-DP)
                                if row_os.label == "OBS-DP":
                                    temp_lst.append(row_os["token"].lower())
                            if len(temp_lst) > 0:
                                obs_suggestive = "|".join(temp_lst)
                        obs_suggestive_lst.append(obs_suggestive)
                else:
                    obs_lst = [None]
                    label_lst = [None]
                    obs_modify_lst = [None]
                    obs_suggestive_lst = [None]

                # step 6: create tuple of 8 values (sid, sentence_id, sentence, anatomy, obs, label, obs_modify, obs_suggestive)
                t_lst = []
                for i in range(len(obs_lst)):
                    t_lst.append(
                        (
                            sid,
                            si,
                            sen,
                            "unspecified",
                            obs_lst[i],
                            label_lst[i],
                            obs_modify_lst[i],
                            obs_suggestive_lst[i],
                        )
                    )

                # remove duplicates if existing
                tuple_lst.append(list(set(t_lst)))

    # flatten nested list
    df_lst = [item for sublist in tuple_lst for item in sublist]
    df_anatomy_label = pd.DataFrame(
        df_lst,
        columns=[
            "study_id",
            "sen_id",
            "sentence",
            "anatomy",
            "observation",
            "label",
            "obs_modify",
            "obs_suggestive",
        ],
    )

    # lemmatize observation tokens (e.g., normalize opacities to opacity)
    obs_lemma_lst = []
    print("Lemmatizing observation tokens...")
    for t in tqdm(df_lst):
        obs = t[4]
        obs_lemma = obs_lemmatization(obs)
        obs_lemma_lst.append(obs_lemma)

    # save preprocessed sentence level data
    df_anatomy_label["obs_lemma"] = obs_lemma_lst
    df_anatomy_label.to_csv(args.output_path, index=False)
    print("Output file has been saved!")


if __name__ == "__main__":
    args = parser.parse_args()
    radgraph_parse(args)