File size: 4,591 Bytes
b7fbd2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c3bdec
 
 
 
 
 
b7fbd2a
 
 
8c3bdec
b7fbd2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c3bdec
 
 
 
b7fbd2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c3bdec
b7fbd2a
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import argparse
import json
import pathlib
import os

# Command-line interface definition.
# NOTE(review): the description mentions ".md", but the code visible in this
# file writes JSON output - confirm which wording is intended.
parser = argparse.ArgumentParser(
    description="Format the output of the data card tool as .md for the hub."
)
# Both paths are optional and are converted to pathlib.Path objects.
parser.add_argument("--input_path", "-i", type=pathlib.Path, required=False)
parser.add_argument("--output_path", "-o", type=pathlib.Path, required=False)
# Parsing happens at module level, i.e. also when this file is imported.
# `args` is not read by any active code visible in this file.
args = parser.parse_args()


def read_json_file(json_path: pathlib.Path):
    """Load a JSON file and return the parsed object.

    Args:
        json_path: Location of the JSON file; anything accepted by `open`
            (a `pathlib.Path` or a plain string) works.

    Returns:
        The deserialized JSON content (dict, list, str, number, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is exchanged as UTF-8; do not depend on the platform's locale
    # encoding, which would garble non-ASCII text on some systems.
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)


def save_file(json_path: pathlib.Path, json_obj: dict):
    """Serialize an object as pretty-printed JSON and write it to a file.

    Args:
        json_path: Destination file; a `pathlib.Path` or a plain string.
            Missing parent directories are created.
        json_obj: JSON-serializable object to write.

    Raises:
        TypeError: If `json_obj` contains values `json.dumps` cannot encode.
        OSError: If the file or its parent directories cannot be created.
    """
    target = pathlib.Path(json_path)
    # The output directory may not exist yet (e.g. on a fresh checkout);
    # create it instead of failing with FileNotFoundError.
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w", encoding="utf-8") as f:
        f.write(json.dumps(json_obj, indent=2))


def _lookup_or_default(data: dict, keys: tuple, default: str):
  """Follow `keys` through nested dicts; return `default` on a missing key."""
  node = data
  try:
    for key in keys:
      node = node[key]
  except KeyError:
    return default
  return node


def construct_json(dataset_name: str, data_card_data: dict, text_by_key: dict):
  """Build the JSON-serializable data card structure for one dataset.

  This function iterates through text_by_key and extracts all answers from
  the data_card_data object. The three levels of hierarchy become sections
  (level 2), subsections (level 3) and fields (level 4); the order in which
  anything appears is not changed. Answers that are empty or "n/a"
  (case-insensitive) are skipped, and list answers are rendered as a
  comma-separated string of back-quoted items.

  Args:
      dataset_name: Name stored under the "name" key of the result.
      data_card_data: Output from the data card tool
      text_by_key: configuration defined in key_to_question.json

  Returns:
      A tuple `(new_json, total_words)`: the assembled dict and the number
      of whitespace-separated words over all textual answers kept.

  Raises:
      KeyError: If a configuration entry lacks one of the expected keys
        ("section-title", "title", "flags", "info", "scope").
  """
  new_json = {
      "name": dataset_name,
      "summary": _lookup_or_default(
          data_card_data, ("overview", "what", "dataset"), "Placeholder"
      ),
      "sections": [],
  }

  # Optional top-level metadata: only emitted when present and non-empty.
  optional_fields = (
      ("website", ("overview", "where", "website")),
      ("paper", ("overview", "where", "paper-url")),
      ("authors", ("overview", "credit", "creators")),
  )
  for out_key, key_path in optional_fields:
    value = _lookup_or_default(data_card_data, key_path, "")
    if value:
      new_json[out_key] = value

  total_questions = 0
  total_words = 0

  for main_key, main_content in text_by_key.items():
    # Sections the data card does not contain at all are left out entirely.
    if main_key not in data_card_data:
      continue
    l2_data = {
        "title": main_content["section-title"],
        "level": 2,
        "subsections": [],
    }
    for second_key, second_content in main_content.items():
      if second_key == "section-title":
        continue
      l3_data = {
          "title": second_content["section-title"],
          "level": 3,
          "fields": [],
      }
      for final_key, final_content in second_content.items():
        if final_key == "section-title":
          continue
        # Every configured question counts, answered or not.
        total_questions += 1
        try:
          answer = data_card_data[main_key][second_key].get(final_key, "N/A")
        except (KeyError, AttributeError, TypeError):
          # Subsection missing from the data card, or not a mapping.
          continue
        # Skip empty answers.
        if isinstance(answer, str) and answer.lower() == "n/a":
          continue
        if not answer:
          continue

        if isinstance(answer, list):
          answer = ", ".join(f"`{a}`" for a in answer)

        l3_data["fields"].append({
            "title": final_content["title"],
            "level": 4,
            "content": answer,
            "flags": final_content["flags"],
            "info": final_content["info"],
            "scope": final_content["scope"],
        })
        # Only textual answers have words; other truthy values (numbers,
        # mappings) are kept as content but have no `.split()`.
        if isinstance(answer, str):
          total_words += len(answer.split())
      l2_data["subsections"].append(l3_data)
    new_json["sections"].append(l2_data)
  print(f"Total questions {total_questions}")
  print(f"total words: {total_words}")
  return new_json, total_words




if __name__ == "__main__":
  # The question/section configuration is looked up next to this script.
  config_path = os.path.join(os.path.dirname(__file__), "key_to_question.json")
  text_by_key = read_json_file(config_path)

  total_words_across_everything = 0
  # Every entry of the GEMv2 directory (resolved against the current working
  # directory) is treated as a dataset whose card is <name>/<name>.json.
  for dataset in os.listdir("../../../GEMv2"):
    data_card_path = f"../../../GEMv2/{dataset}/{dataset}.json"
    if not os.path.exists(data_card_path):
      print(f"{dataset} has no data card!")
      continue

    print(f"Now processing {dataset}.")
    data_card_data = read_json_file(data_card_path)
    data_card_json, total_cur_words = construct_json(
        dataset, data_card_data, text_by_key
    )
    total_words_across_everything += total_cur_words
    save_file(f"datacards/{dataset}.json", data_card_json)

  # Grand total of answer words over all processed data cards.
  print(total_words_across_everything)