{ "builder_name": "parquet", "citation": "", "config_name": "default", "dataset_name": "astro_paper_corpus", "dataset_size": 4128813829, "description": "", "download_checksums": { "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00000-of-00009.parquet": { "num_bytes": 240072323, "checksum": null }, "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00001-of-00009.parquet": { "num_bytes": 235851056, "checksum": null }, "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00002-of-00009.parquet": { "num_bytes": 236413937, "checksum": null }, "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00003-of-00009.parquet": { "num_bytes": 237728419, "checksum": null }, "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00004-of-00009.parquet": { "num_bytes": 236710419, "checksum": null }, "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00005-of-00009.parquet": { "num_bytes": 239567004, "checksum": null }, "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00006-of-00009.parquet": { "num_bytes": 234863979, "checksum": null }, "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00007-of-00009.parquet": { "num_bytes": 232662046, "checksum": null }, "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00008-of-00009.parquet": { "num_bytes": 237444927, "checksum": null } }, "download_size": 2131314110, "features": { "id": { "dtype": "string", "_type": "Value" }, "author": { "feature": { "dtype": "string", "_type": "Value" }, "_type": "Sequence" }, "bibcode": { "dtype": "string", "_type": "Value" }, "title": { "feature": { "dtype": "string", "_type": "Value" }, "_type": "Sequence" }, "citation_count": { "dtype": "int64", "_type": "Value" }, "aff": { "feature": { "dtype": "string", "_type": "Value" }, "_type": "Sequence" }, "citation": { "feature": { "dtype": "string", "_type": "Value" }, "_type": "Sequence" }, "database": { "feature": { "dtype": "string", "_type": "Value" }, "_type": "Sequence" }, "read_count": { "dtype": "int64", "_type": "Value" }, "keyword": { "feature": { "dtype": "string", "_type": "Value" }, "_type": "Sequence" }, "reference": { "feature": { "dtype": "string", "_type": "Value" }, "_type": "Sequence" }, "doi": { "feature": { "dtype": "string", "_type": "Value" }, "_type": "Sequence" }, "subfolder": { "dtype": "string", "_type": "Value" }, "filename": { "dtype": "string", "_type": "Value" }, "introduction": { "dtype": "string", "_type": "Value" }, "conclusions": { "dtype": "string", "_type": "Value" }, "year": { "dtype": "int64", "_type": "Value" }, "month": { "dtype": "int64", "_type": "Value" }, "arxiv_id": { "dtype": "string", "_type": "Value" }, "abstract": { "dtype": "string", "_type": "Value" }, "failed_ids": { "dtype": "bool", "_type": "Value" }, "keyword_search": { "feature": { "dtype": "string", "_type": "Value" }, "_type": "Sequence" }, "umap_x": { "dtype": "float32", "_type": "Value" }, "umap_y": { "dtype": "float32", "_type": "Value" }, "clust_id": { "dtype": "int64", "_type": "Value" } }, "homepage": "", "license": "", "size_in_bytes": 6260127939, "splits": { "train": { "name": "train", "num_bytes": 4128813829, "num_examples": 271544, "shard_lengths": [ 33172, 33172, 33172, 33172, 33172, 33171, 34171, 34171, 4171 ], "dataset_name": "astro_paper_corpus" } }, "version": { "version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0 } }