smostafanejad committed on
Commit
070c7da
·
verified ·
1 Parent(s): 23f0058

Upload know_your_data.ipynb

Browse files
Files changed (1) hide show
  1. know_your_data.ipynb +161 -323
know_your_data.ipynb CHANGED
@@ -1,389 +1,227 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "id": "2493a0c3-4b27-496a-9514-32fb4941c94e",
7
  "metadata": {
8
  "tags": []
9
  },
10
  "outputs": [],
11
  "source": [
12
- "import datasets\n",
13
- "from datasets import ( \n",
14
- " get_dataset_config_info,\n",
15
- " get_dataset_config_names,\n",
16
- " get_dataset_split_names,\n",
17
- " get_dataset_default_config_name, \n",
18
- " load_dataset\n",
19
  ")\n",
20
- "from pprint import pprint"
 
 
 
 
 
 
 
 
 
21
  ]
22
  },
23
  {
24
  "cell_type": "code",
25
- "execution_count": 2,
26
  "id": "c4892171-f79f-4eee-99db-a21d11b09e5c",
27
  "metadata": {
28
  "tags": []
29
  },
30
- "outputs": [
31
- {
32
- "data": {
33
- "application/vnd.jupyter.widget-view+json": {
34
- "model_id": "ba47ec20fe404e01942074c02dce13af",
35
- "version_major": 2,
36
- "version_minor": 0
37
- },
38
- "text/plain": [
39
- "Downloading builder script: 0%| | 0.00/55.7k [00:00<?, ?B/s]"
40
- ]
41
- },
42
- "metadata": {},
43
- "output_type": "display_data"
44
- },
45
- {
46
- "data": {
47
- "application/vnd.jupyter.widget-view+json": {
48
- "model_id": "f65e2b756dd34c0fbf7a64f87431ba7d",
49
- "version_major": 2,
50
- "version_minor": 0
51
- },
52
- "text/plain": [
53
- "Downloading readme: 0%| | 0.00/16.7k [00:00<?, ?B/s]"
54
- ]
55
- },
56
- "metadata": {},
57
- "output_type": "display_data"
58
- }
59
- ],
60
  "source": [
61
- "ds_hub = load_dataset(path=\"molssiai-hub/pubchemqc-b3lyp\",\n",
62
- " name=\"b3lyp_pm6\",\n",
63
- " split=\"train\",\n",
64
- " streaming=True,\n",
65
- " trust_remote_code=True)"
 
 
 
66
  ]
67
  },
68
  {
69
- "cell_type": "code",
70
- "execution_count": 3,
71
- "id": "3334886b-abac-455e-a098-2a281ed3d4d5",
72
- "metadata": {
73
- "tags": []
74
- },
75
- "outputs": [
76
- {
77
- "data": {
78
- "text/plain": [
79
- "['cid',\n",
80
- " 'state',\n",
81
- " 'pubchem-inchi',\n",
82
- " 'pubchem-charge',\n",
83
- " 'pubchem-version',\n",
84
- " 'name',\n",
85
- " 'coordinates',\n",
86
- " 'atomic-numbers',\n",
87
- " 'atom-count',\n",
88
- " 'heavy-atom-count',\n",
89
- " 'core-electrons',\n",
90
- " 'bond-order',\n",
91
- " 'connection-indices',\n",
92
- " 'formula',\n",
93
- " 'version',\n",
94
- " 'obabel-inchi',\n",
95
- " 'pm6-obabel-canonical-smiles',\n",
96
- " 'charge',\n",
97
- " 'energy-beta-gap',\n",
98
- " 'energy-beta-homo',\n",
99
- " 'energy-beta-lumo',\n",
100
- " 'energy-alpha-gap',\n",
101
- " 'energy-alpha-homo',\n",
102
- " 'energy-alpha-lumo',\n",
103
- " 'total-energy',\n",
104
- " 'homos',\n",
105
- " 'orbital-energies',\n",
106
- " 'mo-count',\n",
107
- " 'basis-count',\n",
108
- " 'multiplicity',\n",
109
- " 'molecular-mass',\n",
110
- " 'number-of-atoms',\n",
111
- " 'lowdin-partial-charges',\n",
112
- " 'mulliken-partial-charges',\n",
113
- " 'dipole-moment',\n",
114
- " 'pubchem-multiplicity',\n",
115
- " 'pubchem-obabel-canonical-smiles',\n",
116
- " 'pubchem-isomeric-smiles',\n",
117
- " 'pubchem-molecular-weight',\n",
118
- " 'pubchem-molecular-formula']"
119
- ]
120
- },
121
- "execution_count": 3,
122
- "metadata": {},
123
- "output_type": "execute_result"
124
- }
125
- ],
126
  "source": [
127
- "ds_hub.column_names"
 
 
 
 
 
 
128
  ]
129
  },
130
  {
131
  "cell_type": "code",
132
- "execution_count": 4,
133
- "id": "bd6618bf-6c00-450e-ad8b-6c39bc47383c",
134
- "metadata": {
135
- "tags": []
136
- },
137
- "outputs": [
138
- {
139
- "data": {
140
- "text/plain": [
141
- "{'cid': Value(dtype='int64', id=None),\n",
142
- " 'state': Value(dtype='string', id=None),\n",
143
- " 'pubchem-inchi': Value(dtype='string', id=None),\n",
144
- " 'pubchem-charge': Value(dtype='int64', id=None),\n",
145
- " 'pubchem-version': Value(dtype='string', id=None),\n",
146
- " 'name': Value(dtype='string', id=None),\n",
147
- " 'coordinates': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),\n",
148
- " 'atomic-numbers': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),\n",
149
- " 'atom-count': Value(dtype='int64', id=None),\n",
150
- " 'heavy-atom-count': Value(dtype='int64', id=None),\n",
151
- " 'core-electrons': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),\n",
152
- " 'bond-order': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),\n",
153
- " 'connection-indices': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),\n",
154
- " 'formula': Value(dtype='string', id=None),\n",
155
- " 'version': Value(dtype='string', id=None),\n",
156
- " 'obabel-inchi': Value(dtype='string', id=None),\n",
157
- " 'pm6-obabel-canonical-smiles': Value(dtype='string', id=None),\n",
158
- " 'charge': Value(dtype='int64', id=None),\n",
159
- " 'energy-beta-gap': Value(dtype='float64', id=None),\n",
160
- " 'energy-beta-homo': Value(dtype='float64', id=None),\n",
161
- " 'energy-beta-lumo': Value(dtype='float64', id=None),\n",
162
- " 'energy-alpha-gap': Value(dtype='float64', id=None),\n",
163
- " 'energy-alpha-homo': Value(dtype='float64', id=None),\n",
164
- " 'energy-alpha-lumo': Value(dtype='float64', id=None),\n",
165
- " 'total-energy': Value(dtype='float64', id=None),\n",
166
- " 'homos': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),\n",
167
- " 'orbital-energies': Sequence(feature=Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), length=-1, id=None),\n",
168
- " 'mo-count': Value(dtype='int64', id=None),\n",
169
- " 'basis-count': Value(dtype='int64', id=None),\n",
170
- " 'multiplicity': Value(dtype='int64', id=None),\n",
171
- " 'molecular-mass': Value(dtype='float64', id=None),\n",
172
- " 'number-of-atoms': Value(dtype='int64', id=None),\n",
173
- " 'lowdin-partial-charges': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),\n",
174
- " 'mulliken-partial-charges': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),\n",
175
- " 'dipole-moment': Value(dtype='float64', id=None),\n",
176
- " 'pubchem-multiplicity': Value(dtype='int64', id=None),\n",
177
- " 'pubchem-obabel-canonical-smiles': Value(dtype='string', id=None),\n",
178
- " 'pubchem-isomeric-smiles': Value(dtype='string', id=None),\n",
179
- " 'pubchem-molecular-weight': Value(dtype='float64', id=None),\n",
180
- " 'pubchem-molecular-formula': Value(dtype='string', id=None)}"
181
- ]
182
- },
183
- "execution_count": 4,
184
- "metadata": {},
185
- "output_type": "execute_result"
186
- }
187
- ],
188
  "source": [
189
- "ds_hub.features"
 
 
 
 
 
 
 
 
190
  ]
191
  },
192
  {
193
- "cell_type": "code",
194
- "execution_count": 5,
195
- "id": "359a7334-70ed-4724-88c2-44fd01fa7162",
196
- "metadata": {
197
- "tags": []
198
- },
199
- "outputs": [
200
- {
201
- "name": "stdout",
202
- "output_type": "stream",
203
- "text": [
204
- "('The presented “PubChemQC B3LYP/6-31G*//PM6” data set is composed of the '\n",
205
- " 'electronic properties of 85,938,443 molecules, encompassing a broad spectrum '\n",
206
- " 'of molecules from essential compounds to biomolecules with a molecular '\n",
207
- " 'weight up to 1000. These molecules account for 94.0% of the original PubChem '\n",
208
- " 'Compound catalog as of August 29, 2016. The electronic properties, including '\n",
209
- " 'orbitals, orbital energies, total energies, dipole moments, and other '\n",
210
- " 'pertinent properties, were computed by using the B3LYP/6-31G* and PM6 '\n",
211
- " 'methods.\\n')\n"
212
- ]
213
- }
214
- ],
215
  "source": [
216
- "pprint(ds_hub.description, width=80)"
 
 
 
 
217
  ]
218
  },
219
  {
220
  "cell_type": "code",
221
- "execution_count": 7,
222
- "id": "aafb0ae2-8896-49d4-81e6-99ee39d5720b",
223
- "metadata": {
224
- "tags": []
225
- },
226
  "outputs": [],
227
  "source": [
228
- "path = \"molssiai-hub/pubchemqc-b3lyp\""
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  ]
230
  },
231
  {
232
  "cell_type": "code",
233
- "execution_count": 8,
234
- "id": "29fc1e0e-fc9e-45f6-8cdd-6fa1c2368a29",
235
- "metadata": {
236
- "tags": []
237
- },
238
- "outputs": [
239
- {
240
- "name": "stdout",
241
- "output_type": "stream",
242
- "text": [
243
- "('@article{Nakata2023,\\n'\n",
244
- " 'abstract = {The presented “PubChemQC B3LYP/6-31G*//PM6” data set is composed '\n",
245
- " 'of the electronic properties of 85,938,443 molecules, encompassing a broad '\n",
246
- " 'spectrum of molecules from essential compounds to biomolecules with a '\n",
247
- " 'molecular weight up to 1000. These molecules account for 94.0% of the '\n",
248
- " 'original PubChem Compound catalog as of August 29, 2016. The electronic '\n",
249
- " 'properties, including orbitals, orbital energies, total energies, dipole '\n",
250
- " 'moments, and other pertinent properties, were computed by using the '\n",
251
- " 'B3LYP/6-31G* and PM6 methods. The data set, available in three formats, '\n",
252
- " 'namely, GAMESS quantum chemistry program files, selected JSON output files, '\n",
253
- " 'and a PostgreSQL database, provides researchers with the ability to query '\n",
254
- " 'molecular properties. It is further subdivided into five subdata sets for '\n",
255
- " 'more specific data. The first two subsets encompass molecules with carbon, '\n",
256
- " 'hydrogen, oxygen, and nitrogen with molecular weights under 300 and 500, '\n",
257
- " 'respectively. The third and fourth subsets incorporate molecules with '\n",
258
- " 'carbon, hydrogen, nitrogen, oxygen, phosphorus, sulfur, fluorine, and '\n",
259
- " 'chlorine, with molecular weights under 300 and 500, respectively. The fifth '\n",
260
- " 'subset comprises molecules with carbon, hydrogen, nitrogen, oxygen, '\n",
261
- " 'phosphorus, sulfur, fluorine, chlorine, sodium, potassium, magnesium, and '\n",
262
- " 'calcium, with a molecular weight of under 500. The coefficients of '\n",
263
- " 'determination for the highest occupied molecular orbital-lowest unoccupied '\n",
264
- " 'molecular orbital energy gap range from 0.892 (for CHON500) to 0.803 (for '\n",
265
- " 'the whole data set). These comprehensive results pave the way for '\n",
266
- " 'applications in drug discovery and materials science, among others. The data '\n",
267
- " 'sets can be accessed under the Creative Commons Attribution 4.0 '\n",
268
- " 'International license at the following web address: '\n",
269
- " 'https://nakatamaho.riken.jp/pubchemqc.riken.jp/b3lyp_pm6_datasets.html.},\\n'\n",
270
- " 'author = {Maho Nakata and Toshiyuki Maeda},\\n'\n",
271
- " 'doi = {10.1021/ACS.JCIM.3C00899/ASSET/IMAGES/MEDIUM/CI3C00899_0012.GIF},\\n'\n",
272
- " 'issn = {1549960X},\\n'\n",
273
- " 'issue = {18},\\n'\n",
274
- " 'journal = {Journal of Chemical Information and Modeling},\\n'\n",
275
- " 'month = {9},\\n'\n",
276
- " 'pages = {5734-5754},\\n'\n",
277
- " 'pmid = {37677147},\\n'\n",
278
- " 'publisher = {American Chemical Society},\\n'\n",
279
- " 'title = {PubChemQC B3LYP/6-31G*//PM6 Data Set: The Electronic Structures of '\n",
280
- " '86 Million Molecules Using B3LYP/6-31G* Calculations},\\n'\n",
281
- " 'volume = {63},\\n'\n",
282
- " 'url = {https://pubs.acs.org/doi/abs/10.1021/acs.jcim.3c00899},\\n'\n",
283
- " 'year = {2023},\\n'\n",
284
- " '}\\n')\n"
285
- ]
286
- }
287
- ],
288
  "source": [
289
- "pprint(get_dataset_config_info(path=path).citation, width=80)"
 
 
 
 
 
290
  ]
291
  },
292
  {
293
  "cell_type": "code",
294
- "execution_count": 9,
295
- "id": "64067367-3b8d-49fb-b6b4-047a1d737c25",
296
- "metadata": {
297
- "tags": []
298
- },
299
- "outputs": [
300
- {
301
- "data": {
302
- "text/plain": [
303
- "['b3lyp_pm6',\n",
304
- " 'b3lyp_pm6_chon300nosalt',\n",
305
- " 'b3lyp_pm6_chon500nosalt',\n",
306
- " 'b3lyp_pm6_chnopsfcl300nosalt',\n",
307
- " 'b3lyp_pm6_chnopsfcl500nosalt',\n",
308
- " 'b3lyp_pm6_chnopsfclnakmgca500']"
309
- ]
310
- },
311
- "execution_count": 9,
312
- "metadata": {},
313
- "output_type": "execute_result"
314
- }
315
- ],
316
  "source": [
317
- "get_dataset_config_names(path)"
 
 
 
 
 
318
  ]
319
  },
320
  {
321
  "cell_type": "code",
322
- "execution_count": 10,
323
- "id": "4ce893fb-59a4-4de7-9d28-0b7f575bbdd4",
324
- "metadata": {
325
- "tags": []
326
- },
327
- "outputs": [
328
- {
329
- "data": {
330
- "text/plain": [
331
- "['train']"
332
- ]
333
- },
334
- "execution_count": 10,
335
- "metadata": {},
336
- "output_type": "execute_result"
337
- }
338
- ],
339
  "source": [
340
- "get_dataset_split_names(path)"
341
  ]
342
  },
343
  {
344
  "cell_type": "code",
345
- "execution_count": 11,
346
- "id": "5e0d35c5-df4c-412b-8ca9-af95f69dbf24",
347
- "metadata": {
348
- "tags": []
349
- },
350
- "outputs": [
351
- {
352
- "data": {
353
- "text/plain": [
354
- "'b3lyp_pm6'"
355
- ]
356
- },
357
- "execution_count": 11,
358
- "metadata": {},
359
- "output_type": "execute_result"
360
- }
361
- ],
362
  "source": [
363
- "get_dataset_default_config_name(path)"
 
 
 
 
 
 
 
 
 
 
 
 
364
  ]
365
  },
366
  {
367
  "cell_type": "code",
368
- "execution_count": 12,
369
- "id": "8c38c671-5ac8-442b-a56d-315a8446ec2c",
370
- "metadata": {
371
- "tags": []
372
- },
373
- "outputs": [
374
- {
375
- "data": {
376
- "text/plain": [
377
- "DatasetInfo(description='The presented “PubChemQC B3LYP/6-31G*//PM6” data set is composed of the electronic properties of 85,938,443 molecules, encompassing a broad spectrum of molecules from essential compounds to biomolecules with a molecular weight up to 1000. These molecules account for 94.0% of the original PubChem Compound catalog as of August 29, 2016. The electronic properties, including orbitals, orbital energies, total energies, dipole moments, and other pertinent properties, were computed by using the B3LYP/6-31G* and PM6 methods.\\n', citation='@article{Nakata2023,\\nabstract = {The presented “PubChemQC B3LYP/6-31G*//PM6” data set is composed of the electronic properties of 85,938,443 molecules, encompassing a broad spectrum of molecules from essential compounds to biomolecules with a molecular weight up to 1000. These molecules account for 94.0% of the original PubChem Compound catalog as of August 29, 2016. The electronic properties, including orbitals, orbital energies, total energies, dipole moments, and other pertinent properties, were computed by using the B3LYP/6-31G* and PM6 methods. The data set, available in three formats, namely, GAMESS quantum chemistry program files, selected JSON output files, and a PostgreSQL database, provides researchers with the ability to query molecular properties. It is further subdivided into five subdata sets for more specific data. The first two subsets encompass molecules with carbon, hydrogen, oxygen, and nitrogen with molecular weights under 300 and 500, respectively. The third and fourth subsets incorporate molecules with carbon, hydrogen, nitrogen, oxygen, phosphorus, sulfur, fluorine, and chlorine, with molecular weights under 300 and 500, respectively. The fifth subset comprises molecules with carbon, hydrogen, nitrogen, oxygen, phosphorus, sulfur, fluorine, chlorine, sodium, potassium, magnesium, and calcium, with a molecular weight of under 500. 
The coefficients of determination for the highest occupied molecular orbital-lowest unoccupied molecular orbital energy gap range from 0.892 (for CHON500) to 0.803 (for the whole data set). These comprehensive results pave the way for applications in drug discovery and materials science, among others. The data sets can be accessed under the Creative Commons Attribution 4.0 International license at the following web address: https://nakatamaho.riken.jp/pubchemqc.riken.jp/b3lyp_pm6_datasets.html.},\\nauthor = {Maho Nakata and Toshiyuki Maeda},\\ndoi = {10.1021/ACS.JCIM.3C00899/ASSET/IMAGES/MEDIUM/CI3C00899_0012.GIF},\\nissn = {1549960X},\\nissue = {18},\\njournal = {Journal of Chemical Information and Modeling},\\nmonth = {9},\\npages = {5734-5754},\\npmid = {37677147},\\npublisher = {American Chemical Society},\\ntitle = {PubChemQC B3LYP/6-31G*//PM6 Data Set: The Electronic Structures of 86 Million Molecules Using B3LYP/6-31G* Calculations},\\nvolume = {63},\\nurl = {https://pubs.acs.org/doi/abs/10.1021/acs.jcim.3c00899},\\nyear = {2023},\\n}\\n', homepage='https://nakatamaho.riken.jp/pubchemqc.riken.jp/b3lyp_pm6_datasets.html', license='https://creativecommons.org/licenses/by/4.0', features={'cid': Value(dtype='int64', id=None), 'state': Value(dtype='string', id=None), 'pubchem-inchi': Value(dtype='string', id=None), 'pubchem-charge': Value(dtype='int64', id=None), 'pubchem-version': Value(dtype='string', id=None), 'name': Value(dtype='string', id=None), 'coordinates': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'atomic-numbers': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'atom-count': Value(dtype='int64', id=None), 'heavy-atom-count': Value(dtype='int64', id=None), 'core-electrons': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'bond-order': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'connection-indices': Sequence(feature=Value(dtype='int64', id=None), length=-1, 
id=None), 'formula': Value(dtype='string', id=None), 'version': Value(dtype='string', id=None), 'obabel-inchi': Value(dtype='string', id=None), 'pm6-obabel-canonical-smiles': Value(dtype='string', id=None), 'charge': Value(dtype='int64', id=None), 'energy-beta-gap': Value(dtype='float64', id=None), 'energy-beta-homo': Value(dtype='float64', id=None), 'energy-beta-lumo': Value(dtype='float64', id=None), 'energy-alpha-gap': Value(dtype='float64', id=None), 'energy-alpha-homo': Value(dtype='float64', id=None), 'energy-alpha-lumo': Value(dtype='float64', id=None), 'total-energy': Value(dtype='float64', id=None), 'homos': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'orbital-energies': Sequence(feature=Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), length=-1, id=None), 'mo-count': Value(dtype='int64', id=None), 'basis-count': Value(dtype='int64', id=None), 'multiplicity': Value(dtype='int64', id=None), 'molecular-mass': Value(dtype='float64', id=None), 'number-of-atoms': Value(dtype='int64', id=None), 'lowdin-partial-charges': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'mulliken-partial-charges': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'dipole-moment': Value(dtype='float64', id=None), 'pubchem-multiplicity': Value(dtype='int64', id=None), 'pubchem-obabel-canonical-smiles': Value(dtype='string', id=None), 'pubchem-isomeric-smiles': Value(dtype='string', id=None), 'pubchem-molecular-weight': Value(dtype='float64', id=None), 'pubchem-molecular-formula': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name='pubchemqc-b3lyp', dataset_name='pubchemqc-b3lyp', config_name='b3lyp_pm6', version=0.0.0, splits={'train': {'name': 'train', 'dataset_name': 'molssiai-hub/pubchemqc-b3lyp'}}, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)"
378
- ]
379
- },
380
- "execution_count": 12,
381
- "metadata": {},
382
- "output_type": "execute_result"
383
- }
384
- ],
385
  "source": [
386
- "get_dataset_config_info(path)"
 
 
 
387
  ]
388
  }
389
  ],
@@ -403,7 +241,7 @@
403
  "name": "python",
404
  "nbconvert_exporter": "python",
405
  "pygments_lexer": "ipython3",
406
- "version": "3.9.5"
407
  }
408
  },
409
  "nbformat": 4,
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "4b572b87",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Working with Metadata"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "a11900d1",
14
+ "metadata": {},
15
+ "source": [
16
+ "This demo focuses on getting to know the data we are going to work with before\n",
17
+ "downloading it and starting to process it. Here, we are going to use the\n",
18
+ "[PubChemQC-B3LYP/6-31G*//PM6\n",
19
+ "Dataset](https://huggingface.co/datasets/molssiai-hub/pubchemqc-b3lyp) (PubChemQC-B3LYP for short)\n",
20
+ "from the [MolSSI AI Hub](https://huggingface.co/molssiai-hub).\n",
21
+ "\n",
22
+ "\n",
23
+ "In order to be able to load and interact with the data, we need to import the necessary libraries."
24
+ ]
25
+ },
26
  {
27
  "cell_type": "code",
28
+ "execution_count": null,
29
  "id": "2493a0c3-4b27-496a-9514-32fb4941c94e",
30
  "metadata": {
31
  "tags": []
32
  },
33
  "outputs": [],
34
  "source": [
35
+ "import datasets # Hugging Face datasets library\n",
36
+ "from datasets import (\n",
37
+ " get_dataset_config_info, # Get information about a dataset configuration/subset\n",
38
+ " get_dataset_config_names, # Get the list of names of all dataset configurations/subsets\n",
39
+ " get_dataset_split_names, # Get the list of names of all dataset splits\n",
40
+ " get_dataset_default_config_name # Get the default configuration name of a dataset\n",
 
41
  ")\n",
42
+ "from pprint import pprint # Pretty print"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "markdown",
47
+ "id": "b362b39e",
48
+ "metadata": {},
49
+ "source": [
50
+ "After importing the modules, we set a few variables that will be used throughout\n",
51
+ "this demo."
52
  ]
53
  },
54
  {
55
  "cell_type": "code",
56
+ "execution_count": null,
57
  "id": "c4892171-f79f-4eee-99db-a21d11b09e5c",
58
  "metadata": {
59
  "tags": []
60
  },
61
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  "source": [
63
+ "# path to the dataset repository on the Hugging Face Hub\n",
64
+ "path = \"molssiai-hub/pubchemqc-b3lyp\"\n",
65
+ "\n",
66
+ "# set the dataset configuration/subset name\n",
67
+ "name = \"b3lyp_pm6\"\n",
68
+ "\n",
69
+ "# set the dataset split\n",
70
+ "split = \"train\""
71
  ]
72
  },
73
  {
74
+ "cell_type": "markdown",
75
+ "id": "affb8bf2",
76
+ "metadata": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  "source": [
78
+ "The modules and functions we imported allow us to inspect our dataset for a wide\n",
79
+ "range of metadata and information without actually downloading it to disk. For\n",
80
+ "example, we can access the list of all available configurations/subsets, splits,\n",
81
+ "and the default configuration name in our dataset.\n",
82
+ "\n",
83
+ "The `get_dataset_config_info` function returns a `datasets.info.DatasetInfo` \n",
84
+ "object which contains the metadata of our dataset configuration all in one place."
85
  ]
86
  },
87
  {
88
  "cell_type": "code",
89
+ "execution_count": null,
90
+ "id": "b0b1b1cf",
91
+ "metadata": {},
92
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  "source": [
94
+ "# get the information about the PubChemQC-B3LYP dataset configuration/subset\n",
95
+ "config_info = get_dataset_config_info(path, name)\n",
96
+ "\n",
97
+ "# print the retrieved information about the PubChemQC-B3LYP dataset\n",
98
+ "print(\"Information about the PubChemQC-B3LYP dataset configuration/subset:\")\n",
99
+ "pprint(config_info,\n",
100
+ " indent=4,\n",
101
+ " width=100,\n",
102
+ " compact=True)"
103
  ]
104
  },
105
  {
106
+ "cell_type": "markdown",
107
+ "id": "c01684ca",
108
+ "metadata": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  "source": [
110
+ "Processing a lengthy output is not always convenient. In order to make the\n",
111
+ "metadata inspection easier, we can access specific attributes of the\n",
112
+ "`datasets.info.DatasetInfo` instance directly. For example, the `description`\n",
113
+ "attribute can provide access to the content of the *description* field in the\n",
114
+ "dataset configuration as shown below:"
115
  ]
116
  },
117
  {
118
  "cell_type": "code",
119
+ "execution_count": null,
120
+ "id": "d7ca2dff",
121
+ "metadata": {},
 
 
122
  "outputs": [],
123
  "source": [
124
+ "pprint(config_info.description,\n",
125
+ " indent=4,\n",
126
+ " width=100,\n",
127
+ " compact=True)"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "markdown",
132
+ "id": "7564c7f6",
133
+ "metadata": {},
134
+ "source": [
135
+ "We can use other imported `get_dataset_*` helper functions to directly access\n",
136
+ "the metadata and circumvent the creation of a `datasets.info.DatasetInfo` object\n",
137
+ "as shown below:"
138
  ]
139
  },
140
  {
141
  "cell_type": "code",
142
+ "execution_count": null,
143
+ "id": "f50da911",
144
+ "metadata": {},
145
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  "source": [
147
+ "# get the list of all available dataset configurations/subsets in the PubChemQC-B3LYP dataset\n",
148
+ "config_names = get_dataset_config_names(path)\n",
149
+ "\n",
150
+ "# print the retrieved information about the PubChemQC-B3LYP dataset\n",
151
+ "print(\"List of available dataset configurations/subsets in the PubChemQC-B3LYP dataset:\")\n",
152
+ "config_names"
153
  ]
154
  },
155
  {
156
  "cell_type": "code",
157
+ "execution_count": null,
158
+ "id": "0a925405",
159
+ "metadata": {},
160
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  "source": [
162
+ "# get the list of all available dataset splits in the PubChemQC-B3LYP dataset\n",
163
+ "split_names = get_dataset_split_names(path, name)\n",
164
+ "\n",
165
+ "# print the retrieved information about the PubChemQC-B3LYP dataset\n",
166
+ "print(f\"List of available dataset splits in the PubChemQC-B3LYP dataset:\")\n",
167
+ "split_names"
168
  ]
169
  },
170
  {
171
  "cell_type": "code",
172
+ "execution_count": null,
173
+ "id": "2534c098",
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "# get the default configuration name of the PubChemQC-B3LYP dataset\n",
178
+ "default_config_name = get_dataset_default_config_name(path)\n",
179
+ "\n",
180
+ "# print the retrieved information about the PubChemQC-B3LYP dataset\n",
181
+ "print(f\"Default configuration name of the PubChemQC-B3LYP dataset: {default_config_name}\")"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "markdown",
186
+ "id": "5e477ae9",
187
+ "metadata": {},
 
188
  "source": [
189
+ "We can also list the available features in our dataset:"
190
  ]
191
  },
192
  {
193
  "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "edefb732",
196
+ "metadata": {},
197
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  "source": [
199
+ "config_info.features"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "markdown",
204
+ "id": "a6ad4a42",
205
+ "metadata": {},
206
+ "source": [
207
+ "The command above returns a dictionary of features alongside their corresponding\n",
208
+ "data types. Wrapping it in `list()` returns only the feature names.\n",
209
+ "\n",
210
+ "We can also access the citation information using the `citation` attribute\n",
211
+ "as shown below:"
212
  ]
213
  },
214
  {
215
  "cell_type": "code",
216
+ "execution_count": null,
217
+ "id": "2749b3e7",
218
+ "metadata": {},
219
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  "source": [
221
+ "pprint(config_info.citation,\n",
222
+ " indent=4,\n",
223
+ " width=100,\n",
224
+ " compact=True)"
225
  ]
226
  }
227
  ],
 
241
  "name": "python",
242
  "nbconvert_exporter": "python",
243
  "pygments_lexer": "ipython3",
244
+ "version": "3.10.13"
245
  }
246
  },
247
  "nbformat": 4,