File size: 3,759 Bytes
a256709
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
Code copied from AGXNet:
https://github.com/batmanlab/AGXNet
"""

import argparse
import functools
import json

import nltk
import pandas as pd
from tqdm import tqdm


def _build_parser():
    """Create the command-line parser for the itemization script.

    Two optional string flags are registered: ``--data-path`` (input RadGraph
    JSON) and ``--output-path`` (destination CSV). Both defaults are
    placeholder paths that must be overridden for a real run.
    """
    cli = argparse.ArgumentParser(description="Itemize RadGraph Dataset.")
    cli.add_argument(
        "--data-path",
        help="RadGraph data path.",
        default="/PATH TO RADGRAPH DATA/RadGraph/physionet.org/files/radgraph/1.0.0/MIMIC-CXR_graphs.json",
    )
    cli.add_argument(
        "--output-path",
        help="Output path for itemized RadGraph data.",
        default="/PROJECT DIR/preprocessing/mimic-cxr-radgraph-itemized.csv",
    )
    return cli


# Module-level parser; parsed in the ``__main__`` guard at the bottom.
parser = _build_parser()


def get_ids(key):
    """Split a RadGraph report key into ``(partition, patient_id, study_id)``.

    The key is split on ``/``. The first component is returned unchanged as
    the partition; the second and third components each lose their leading
    character (a one-letter prefix), and the third is also cut at its first
    ``.`` so the file extension is dropped.

    Illustrative example: ``"p10/p10000032/s50414267.txt"`` yields
    ``("p10", "10000032", "50414267")``.

    Raises:
        IndexError: if the key has fewer than three ``/``-separated parts.
    """
    parts = key.split("/")
    patient_dir = parts[1]
    # Everything before the first "." is the file stem, e.g. "s50414267".
    report_stem, _, _ = parts[2].partition(".")
    return parts[0], patient_dir[1:], report_stem[1:]


@functools.lru_cache(maxsize=8)
def _sentence_index_by_token(text):
    """Tokenize ``text`` once and map each word-token index to its sentence.

    Returns:
        A pair ``(sentences, token_to_sentence)`` where ``sentences`` is the
        list produced by ``nltk.sent_tokenize`` and ``token_to_sentence`` maps
        the running index of every ``nltk.word_tokenize`` token (counted
        across all sentences, starting at 0) to the index of its sentence.

    The result is memoized because callers look up many token indices of the
    same report back to back; the returned objects are shared between calls
    and must be treated as read-only.
    """
    sentences = nltk.sent_tokenize(text)
    token_to_sentence = {}
    token_ix = 0
    for sentence_ix, sentence in enumerate(sentences):
        for _ in nltk.word_tokenize(sentence):
            token_to_sentence[token_ix] = sentence_ix
            token_ix += 1
    return sentences, token_to_sentence


def get_sen_from_token_ix(text, ix):
    """Return the sentence that contains the token at index ``ix``.

    Args:
        text: Full report text.
        ix: Zero-based token index into ``text``.
            NOTE(review): indices are resolved against NLTK's word
            tokenization; confirm that the caller's token indices follow the
            same tokenization, otherwise sentences may be misaligned.

    Returns:
        ``(sentence_ix, sentence)``: the zero-based sentence index and the
        sentence string.

    Raises:
        KeyError: if ``ix`` is not a valid token index for ``text``.
    """
    # Tokenization is cached per text, so repeated lookups on one report do
    # not re-run the sentence and word tokenizers.
    sentences, token_to_sentence = _sentence_index_by_token(text)
    sentence_ix = token_to_sentence[ix]
    return sentence_ix, sentences[sentence_ix]


def get_entity_relation(value):
    """Flatten one report's entity graph into a table with one row per edge.

    Args:
        value: Mapping with a ``"text"`` entry (the report text) and an
            ``"entities"`` mapping from entity id to a record providing
            ``"start_ix"``, ``"tokens"``, ``"label"`` and ``"relations"``.

    Returns:
        A ``pandas.DataFrame`` with columns ``source``, ``token``,
        ``token_ix``, ``label``, ``relation``, ``target``, ``sentence_ix`` and
        ``sentence``. An entity contributes one row per relation (``relation``
        and ``target`` taken from positions 0 and 1 of the relation entry), or
        a single row with ``relation`` and ``target`` set to ``None`` when it
        has no outgoing relation.
    """
    report_text = value["text"]
    column_names = (
        "source",
        "token",
        "token_ix",
        "label",
        "relation",
        "target",
        "sentence_ix",
        "sentence",
    )
    columns = {name: [] for name in column_names}

    for entity_id, entity in value["entities"].items():
        sentence_ix, sentence = get_sen_from_token_ix(
            report_text, entity["start_ix"]
        )
        edges = entity["relations"]

        # An entity without outgoing edges still gets exactly one row, with
        # an empty relation/target pair standing in for the missing edge.
        has_edges = len(edges) != 0 and edges[0] is not None
        if not has_edges:
            edges = [(None, None)]

        for edge in edges:
            columns["source"].append(entity_id)
            columns["token"].append(entity["tokens"])
            columns["token_ix"].append(entity["start_ix"])
            columns["label"].append(entity["label"])
            columns["relation"].append(edge[0])
            columns["target"].append(edge[1])
            columns["sentence_ix"].append(sentence_ix)
            columns["sentence"].append(sentence)

    # Assemble the collected columns into a dataframe.
    return pd.DataFrame(columns)


def radgraph_itemize(args):
    """Convert nested RadGraph data to itemized examples and save them as CSV.

    Args:
        args: Object exposing ``data_path`` (path of the RadGraph JSON file to
            read) and ``output_path`` (path of the CSV file to write).

    The JSON file is expected to map report keys to report records. Each
    record is itemized with ``get_entity_relation`` and tagged with the
    ``subject_id`` and ``study_id`` parsed from its key by ``get_ids``; all
    per-report tables are concatenated and written without the index column.

    Raises:
        ValueError: raised by ``pandas.concat`` if the JSON file contains no
            reports.
    """
    print("Loading RadGraph data...")
    # Use a context manager so the file handle is closed promptly; JSON is
    # read as UTF-8 regardless of the platform's default encoding.
    with open(args.data_path, encoding="utf-8") as f:
        data = json.load(f)
    print("RadGraph data is loaded.")

    # create itemized RadGraph data
    df_lst = []
    print("Itemizing RadGraph data...")
    for key, value in tqdm(data.items()):
        _, pid, sid = get_ids(key)
        df = get_entity_relation(value)
        df["subject_id"] = pid
        df["study_id"] = sid
        df_lst.append(df)

    # entity level dataframe
    df_itemized = pd.concat(df_lst)

    # save dataframes to a .csv file
    df_itemized.to_csv(args.output_path, index=False)
    print("Outputs have been saved!")


if __name__ == "__main__":
    # Script entry point: parse the command line and run the itemization.
    radgraph_itemize(parser.parse_args())