Spaces:
Runtime error
Runtime error
Sebastian Gehrmann
committed on
Commit
·
4f8648b
1
Parent(s):
e5b869b
Md formatting for the hub.
Browse files- formatting/construct_md.py +75 -0
- formatting/json_to_md.py +174 -18
formatting/construct_md.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from argparse import ArgumentParser
|
2 |
+
from json import load
|
3 |
+
|
4 |
+
def parse_args():
    """Parse the command line and return the resulting namespace.

    A single positional argument, ``input``, accepts one or more file
    paths and is exposed as a list of strings on the returned namespace.
    """
    cli = ArgumentParser()
    cli.add_argument(
        'input',
        nargs='+',
        type=str,
        help='Specify paths to files (e.g., path/to/*.json)',
    )
    return cli.parse_args()
|
10 |
+
|
11 |
+
|
12 |
+
def json_to_markdown(filename):
    """Render one data-card JSON file as a Markdown file next to it.

    The JSON object must contain ``name``, ``summary`` (a string) and
    ``sections`` (an iterable handed item by item to
    ``section_to_markdown``).  Every other top-level key is written as a
    ``#### key`` heading followed by its value.

    The result is written to ``filename`` with its extension replaced by
    ``.md``; nothing is returned.  Both files are handled as UTF-8.

    Raises:
        KeyError: if ``name``, ``summary`` or ``sections`` is missing.
        OSError: if the input cannot be read or the output cannot be written.
    """
    # Local import so the block does not depend on a file-level import.
    from os.path import splitext

    # Context manager closes the handle even if parsing fails.
    with open(filename, encoding='utf-8') as source:
        card = load(source)

    parts = [
        f'# Dataset Card for {card["name"]}\n\n',
        # NOTE(review): this fragment is emitted as-is and runs straight into
        # the summary text; it looks unfinished -- confirm intended wording.
        'You can find the ',
        card['summary'] + '\n\n',
    ]
    parts.extend(
        f'#### {key}\n{value}\n\n'
        for key, value in card.items()
        if key not in ('name', 'summary', 'sections')
    )
    parts.append(
        '\n'.join(section_to_markdown(section) for section in card['sections'])
    )

    # splitext swaps only the real extension, so a name that does not end in
    # ".json" no longer loses five arbitrary trailing characters.
    with open(splitext(filename)[0] + '.md', 'w', encoding='utf-8') as target:
        target.write(''.join(parts))
|
30 |
+
|
31 |
+
|
32 |
+
def section_to_markdown(section):
    """Render a section as a heading followed by its subsections.

    ``section`` must provide ``level`` (the number of ``#`` characters in
    the heading), ``title`` and ``subsections``; each subsection is
    rendered with ``subsection_to_markdown`` and the results are joined
    with newlines.  Returns the Markdown text with a trailing newline.
    """
    heading = '#' * section['level'] + f' {section["title"]}\n\n'
    rendered = [subsection_to_markdown(child) for child in section['subsections']]
    return heading + '\n'.join(rendered) + '\n'
|
38 |
+
|
39 |
+
|
40 |
+
def subsection_to_markdown(subsection):
    """Render a subsection as a heading followed by its fields.

    ``subsection`` must provide ``level`` (the number of ``#`` characters
    in the heading), ``title`` and ``fields``; each field is rendered with
    ``field_to_markdown`` and the results are joined with newlines.
    Returns the Markdown text with a trailing newline.
    """
    heading = '#' * subsection['level'] + f' {subsection["title"]}\n\n'
    rendered = [field_to_markdown(entry) for entry in subsection['fields']]
    return heading + '\n'.join(rendered) + '\n'
|
46 |
+
|
47 |
+
|
48 |
+
def field_to_markdown(field):
    """Render a single field as a heading, optional HTML comments and content.

    ``field`` must provide ``level`` and ``title``.  A ``<!-- quick -->``
    marker is added when ``flags`` contains ``'quick'``; truthy ``info`` and
    ``scope`` values are added as ``<!-- info: ... -->`` and
    ``<!-- scope: ... -->`` comments, in that order.  ``content`` (default
    empty string) follows, and the result ends with a newline.
    """
    pieces = ['#' * field['level'] + f' {field["title"]}\n\n']

    if 'quick' in field.get('flags', ()):
        pieces.append('<!-- quick -->\n')

    # info and scope share the same comment shape and are emitted in this order.
    for tag in ('info', 'scope'):
        value = field.get(tag, False)
        if value:
            pieces.append(f'<!-- {tag}: {value} -->\n')

    return ''.join(pieces) + field.get('content', '') + '\n'
|
63 |
+
|
64 |
+
|
65 |
+
def main():
    """Convert JSON output from `reformat_json.py` into Markdown input
    for Data Cards Labs.

    Each command-line path ending in ``.json`` is passed to
    ``json_to_markdown``; any other path is skipped without a message.
    """
    for path in parse_args().input:
        if not path.endswith('.json'):
            continue
        json_to_markdown(path)


if __name__ == '__main__':
    main()
|
formatting/json_to_md.py
CHANGED
@@ -1,18 +1,162 @@
|
|
1 |
from argparse import ArgumentParser
|
2 |
from json import load
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
-
def parse_args():
|
5 |
-
parser = ArgumentParser()
|
6 |
-
parser.add_argument('input', type=str, nargs='+', \
|
7 |
-
help='Specify paths to files (e.g., path/to/*.json)')
|
8 |
|
9 |
-
|
|
|
10 |
|
|
|
11 |
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
markdown
|
16 |
markdown += json['summary'] + '\n\n'
|
17 |
|
18 |
for key in json:
|
@@ -22,7 +166,9 @@ def json_to_markdown(filename):
|
|
22 |
markdown += '\n'.join(section_to_markdown(section) \
|
23 |
for section in json['sections'])
|
24 |
|
25 |
-
|
|
|
|
|
26 |
f.write(markdown)
|
27 |
|
28 |
|
@@ -59,14 +205,24 @@ def field_to_markdown(field):
|
|
59 |
return markdown + '\n'
|
60 |
|
61 |
|
62 |
-
def main():
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
|
|
70 |
|
71 |
-
|
72 |
-
|
|
|
1 |
from argparse import ArgumentParser
|
2 |
from json import load
|
3 |
+
import pathlib
|
4 |
+
import os
|
5 |
+
|
6 |
+
|
7 |
+
def _nested_get(d, keys, default):
    """Follow ``keys`` through nested dicts; return ``default`` on any miss."""
    node = d
    for key in keys:
        # A non-dict level (e.g. a JSON null) counts as a miss instead of
        # raising AttributeError on ``.get``.
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node


def multi_grep(d, l1, l2, l3):
    """Return ``d[l1][l2][l3]``, or ``"[Needs More Information]"`` if absent.

    A key missing at any level, or a level that is not a dict, yields the
    placeholder string.  A stored value is returned unchanged, even when it
    is ``None`` or empty.
    """
    return _nested_get(d, (l1, l2, l3), "[Needs More Information]")


def multi_grep2(d, l1, l2, l3):
    """Return ``d[l1][l2][l3]``, or a fresh ``["unknown"]`` list if absent.

    Same lookup rules as ``multi_grep``; only the fallback differs.  A new
    list is built on every call so callers may mutate the result.
    """
    return _nested_get(d, (l1, l2, l3), ["unknown"])
|
12 |
+
|
13 |
+
def sanitize_md_url(s):
    """Return the target of the first Markdown link in ``s``, else ``s``.

    For ``"[text](url)"`` the result is ``url``.  Text after the closing
    parenthesis, and any further links, are ignored.  Parentheses that are
    balanced inside the URL are kept.  A string without ``"]("`` is returned
    unchanged.  Surrounding whitespace inside the parentheses is stripped.
    """
    _, marker, rest = s.partition("](")
    if not marker:
        return s

    # Scan for the ")" that closes the link, skipping pairs that belong to
    # the URL itself, e.g. ".../wiki/Foo_(bar)".
    depth = 0
    for index, char in enumerate(rest):
        if char == "(":
            depth += 1
        elif char == ")":
            if depth == 0:
                return rest[:index].strip()
            depth -= 1

    # Unterminated link: everything after "](" is the best available target.
    return rest.strip()
|
19 |
+
|
20 |
+
# ---
|
21 |
+
# annotations_creators:
|
22 |
+
# - expert-generated
|
23 |
+
# language_creators:
|
24 |
+
# - found
|
25 |
+
# languages:
|
26 |
+
# - en
|
27 |
+
# licenses:
|
28 |
+
# - unknown
|
29 |
+
# multilinguality:
|
30 |
+
# - monolingual
|
31 |
+
# pretty_name: FairytaleQA
|
32 |
+
# size_categories:
|
33 |
+
# - 10K<n<100K
|
34 |
+
# source_datasets:
|
35 |
+
# - original
|
36 |
+
# task_categories:
|
37 |
+
# - question-generation
|
38 |
+
# task_ids:
|
39 |
+
# - abstractive-qg
|
40 |
+
# ---
|
41 |
+
|
42 |
+
def construct_preamble(data, name):
    """Build the YAML front matter placed at the top of a dataset README.

    Values are read from ``data`` with ``multi_grep`` / ``multi_grep2``:

    * ``curation.annotations.origin`` -> ``annotations_creators``
      (spaces replaced by hyphens)
    * ``overview.languages.language_names`` -> ``languages`` and
      ``multilinguality``
    * ``overview.languages.license`` -> ``licenses`` (text before the
      first ``":"``)
    * ``overview.languages.task`` -> ``task_categories`` (lower-cased,
      spaces replaced by hyphens)

    Missing values become ``unknown``.  ``language_creators``,
    ``size_categories``, ``source_datasets`` and ``task_ids`` are fixed
    strings.  ``name`` is written as ``pretty_name``.

    Returns the front matter as a string: opening ``---``, the keys above,
    closing ``---`` and a blank line.
    """
    missing = "[Needs More Information]"

    origin = multi_grep(data, "curation", "annotations", "origin")
    annotations = "unknown" if origin == missing else origin.replace(" ", "-")

    languages = multi_grep2(data, "overview", "languages", "language_names")
    if not languages:
        # An empty value would leave "languages:" without entries and be
        # reported as multilingual; treat it like a missing value.
        languages = ["unknown"]
    elif isinstance(languages, str):
        # A bare string would otherwise be listed one character per entry.
        languages = [languages]

    license_text = multi_grep(data, "overview", "languages", "license")
    license_id = "unknown" if license_text == missing else license_text.split(":")[0]

    if languages == ["unknown"]:
        multilinguality = "unknown"
    elif len(languages) == 1:
        multilinguality = "monolingual"
    else:
        multilinguality = "multilingual"

    task = multi_grep(data, "overview", "languages", "task")
    task_category = "unknown" if task == missing else "-".join(task.lower().split(" "))

    lines = [
        "---",
        "annotations_creators:",
        f"- {annotations}",
        "language_creators:",
        "- unknown",
        "languages:",
        *(f"- {language}" for language in languages),
        "licenses:",
        f"- {license_id}",
        "multilinguality:",
        f"- {multilinguality}",
        f"pretty_name: {name}",
        "size_categories:",
        "- unknown",
        "source_datasets:",
        "- original",
        "task_categories:",
        f"- {task_category}",
        "task_ids:",
        "- unknown",
        "---",
        # Two empty items make the joined text end with "---\n\n".
        "",
        "",
    ]
    return "\n".join(lines)
|
90 |
+
|
91 |
+
|
92 |
+
|
93 |
+
## Table of Contents
|
94 |
+
# - [Dataset Description](#dataset-description)
|
95 |
+
# - [Dataset Summary](#dataset-summary)
|
96 |
+
# - [Supported Tasks](#supported-tasks-and-leaderboards)
|
97 |
+
# - [Languages](#languages)
|
98 |
+
# - [Dataset Structure](#dataset-structure)
|
99 |
+
# - [Data Instances](#data-instances)
|
100 |
+
# - [Data Fields](#data-fields)
|
101 |
+
# - [Data Splits](#data-splits)
|
102 |
+
# - [Dataset Creation](#dataset-creation)
|
103 |
+
# - [Curation Rationale](#curation-rationale)
|
104 |
+
# - [Source Data](#source-data)
|
105 |
+
# - [Annotations](#annotations)
|
106 |
+
# - [Personal and Sensitive Information](#personal-and-sensitive-information)
|
107 |
+
# - [Considerations for Using the Data](#considerations-for-using-the-data)
|
108 |
+
# - [Social Impact of Dataset](#social-impact-of-dataset)
|
109 |
+
# - [Discussion of Biases](#discussion-of-biases)
|
110 |
+
# - [Other Known Limitations](#other-known-limitations)
|
111 |
+
# - [Additional Information](#additional-information)
|
112 |
+
# - [Dataset Curators](#dataset-curators)
|
113 |
+
# - [Licensing Information](#licensing-information)
|
114 |
+
# - [Citation Information](#citation-information)
|
115 |
+
|
116 |
+
def construct_toc(data):
    """Placeholder for table-of-contents generation; currently a no-op.

    ``data`` is accepted but not used, and the function returns ``None``.
    """
    return None
|
118 |
+
|
119 |
+
def construct_links(data):
    """Build the "Dataset Description" link list for a dataset README.

    Each entry is read from ``data["overview"]["where"]`` with
    ``multi_grep``.  The four URL entries are passed through
    ``sanitize_md_url``; the contact name is used as-is.  Returns the
    section as a string ending with a blank line.
    """
    # (label shown in the README, key under overview.where)
    url_entries = (
        ("Homepage", "website"),
        ("Repository", "data-url"),
        ("Paper", "paper-url"),
        ("Leaderboard", "leaderboard-url"),
    )

    rows = []
    for label, key in url_entries:
        target = sanitize_md_url(multi_grep(data, "overview", "where", key))
        rows.append(f"- **{label}:** {target}\n")

    contact = multi_grep(data, "overview", "where", "contact-name")
    rows.append(f"- **Point of Contact:** {contact}\n\n")

    return "## Dataset Description\n\n" + "".join(rows)
|
139 |
+
|
140 |
+
|
141 |
+
def json_to_markdown(filename, original_json_path):
|
142 |
+
json = load(open(filename))
|
143 |
+
original_json = load(open(original_json_path))
|
144 |
+
dataset_name = pathlib.Path(original_json_path).stem
|
145 |
|
|
|
|
|
|
|
|
|
146 |
|
147 |
+
preamble = construct_preamble(original_json, dataset_name)
|
148 |
+
markdown = preamble
|
149 |
|
150 |
+
markdown += f'# Dataset Card for GEM/{json["name"]}\n\n'
|
151 |
|
152 |
+
# ToC here.
|
153 |
+
|
154 |
+
markdown += construct_links(original_json)
|
155 |
+
|
156 |
+
markdown += "### Link to Main Data Card\n\n"
|
157 |
+
markdown += f'You can find the main data card on the [GEM Website](https://gem-benchmark.com/data_cards/{dataset_name}).\n\n'
|
158 |
|
159 |
+
markdown += "### Dataset Summary \n\n"
|
160 |
markdown += json['summary'] + '\n\n'
|
161 |
|
162 |
for key in json:
|
|
|
166 |
markdown += '\n'.join(section_to_markdown(section) \
|
167 |
for section in json['sections'])
|
168 |
|
169 |
+
readme_path = os.path.join(pathlib.Path(original_json_path).parents[0], "README.md")
|
170 |
+
|
171 |
+
with open(readme_path, 'w') as f:
|
172 |
f.write(markdown)
|
173 |
|
174 |
|
|
|
205 |
return markdown + '\n'
|
206 |
|
207 |
|
208 |
+
# def main():
|
209 |
+
# """Converts JSON output from `reformat_json.py`
|
210 |
+
# to Markdown input for Data Cards Labs."""
|
211 |
+
# args = parse_args()
|
212 |
+
# for filename in args.input:
|
213 |
+
# if filename[-5:] == '.json':
|
214 |
+
# json_to_markdown(filename)
|
215 |
+
|
216 |
+
if __name__ == "__main__":
    # Both locations below are resolved relative to the current working
    # directory.  The root is named once so the two uses cannot drift apart.
    gem_root = "../../../GEMv2"

    for dataset in os.listdir(gem_root):
        data_card_path = f"{gem_root}/{dataset}/{dataset}.json"
        if not os.path.exists(data_card_path):
            print(f"{dataset} has no data card!")
            continue

        # This script assumes you have run reformat_json.py, which is
        # expected to have produced this file.
        new_path = f"datacards/{dataset}.json"
        if not os.path.exists(new_path):
            # Report and move on rather than abort the whole run with an
            # unexplained FileNotFoundError.
            print(f"{dataset} has no reformatted JSON at {new_path}; run reformat_json.py first.")
            continue

        print(f"Now processing {dataset}.")
        # Called for its side effect; the result is not used.
        json_to_markdown(new_path, data_card_path)
|