kiyer committed on
Commit
888f3cd
·
verified ·
1 Parent(s): ed3bbb8

Upload 13 files

Browse files
arxiv_corpus/dataset_dict.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"splits": ["train"]}
arxiv_corpus/train/data-00000-of-00009.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:592316b955cd556a7f857b34ae6d3cbc2d41607ebc7edf0f4936dcbc72d9b210
3
+ size 466235848
arxiv_corpus/train/data-00001-of-00009.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8dc80f91831e6ec8d887220fa58bd7fc9a24a45f3f8f700386ea5f51465566c
3
+ size 457426648
arxiv_corpus/train/data-00002-of-00009.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5c83ff443d42f500930beff99b7ebcb99c6b13dbd4a2706d511871b846d5743
3
+ size 458475864
arxiv_corpus/train/data-00003-of-00009.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a57e350cef8e17b0fc039d731803470f6984161edfff0ef563268dd8ab34238
3
+ size 459603008
arxiv_corpus/train/data-00004-of-00009.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:290e397fbf9dc69f69bdc253b639f141c55923def25de26634a5bf32979b64cb
3
+ size 458884624
arxiv_corpus/train/data-00005-of-00009.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:532dc21466633c66ddf9f836211ecf5d8418facf9770e697567f4f86185df6e7
3
+ size 465391392
arxiv_corpus/train/data-00006-of-00009.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea8418f4d85e2c51dab08103274b7129a0e18577f85101aab89f6eaae8a388ce
3
+ size 454939008
arxiv_corpus/train/data-00007-of-00009.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e3b772eb9d0190efcac9852ee98f381d015f231bffcc45704415d347fc5624c
3
+ size 448746432
arxiv_corpus/train/data-00008-of-00009.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cd1cabc942ca520de26ad97f87bb9e491225d548096414eb9387da8b7e2c1fc
3
+ size 459532024
arxiv_corpus/train/dataset_info.json ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_name": "parquet",
3
+ "citation": "",
4
+ "config_name": "default",
5
+ "dataset_name": "astro_paper_corpus",
6
+ "dataset_size": 4128813829,
7
+ "description": "",
8
+ "download_checksums": {
9
+ "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00000-of-00009.parquet": {
10
+ "num_bytes": 240072323,
11
+ "checksum": null
12
+ },
13
+ "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00001-of-00009.parquet": {
14
+ "num_bytes": 235851056,
15
+ "checksum": null
16
+ },
17
+ "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00002-of-00009.parquet": {
18
+ "num_bytes": 236413937,
19
+ "checksum": null
20
+ },
21
+ "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00003-of-00009.parquet": {
22
+ "num_bytes": 237728419,
23
+ "checksum": null
24
+ },
25
+ "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00004-of-00009.parquet": {
26
+ "num_bytes": 236710419,
27
+ "checksum": null
28
+ },
29
+ "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00005-of-00009.parquet": {
30
+ "num_bytes": 239567004,
31
+ "checksum": null
32
+ },
33
+ "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00006-of-00009.parquet": {
34
+ "num_bytes": 234863979,
35
+ "checksum": null
36
+ },
37
+ "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00007-of-00009.parquet": {
38
+ "num_bytes": 232662046,
39
+ "checksum": null
40
+ },
41
+ "hf://datasets/JSALT2024-Astro-LLMs/astro_paper_corpus@b957a28700badb3b5f5c7af06ea77a2560ab6e46/data/train-00008-of-00009.parquet": {
42
+ "num_bytes": 237444927,
43
+ "checksum": null
44
+ }
45
+ },
46
+ "download_size": 2131314110,
47
+ "features": {
48
+ "id": {
49
+ "dtype": "string",
50
+ "_type": "Value"
51
+ },
52
+ "author": {
53
+ "feature": {
54
+ "dtype": "string",
55
+ "_type": "Value"
56
+ },
57
+ "_type": "Sequence"
58
+ },
59
+ "bibcode": {
60
+ "dtype": "string",
61
+ "_type": "Value"
62
+ },
63
+ "title": {
64
+ "feature": {
65
+ "dtype": "string",
66
+ "_type": "Value"
67
+ },
68
+ "_type": "Sequence"
69
+ },
70
+ "citation_count": {
71
+ "dtype": "int64",
72
+ "_type": "Value"
73
+ },
74
+ "aff": {
75
+ "feature": {
76
+ "dtype": "string",
77
+ "_type": "Value"
78
+ },
79
+ "_type": "Sequence"
80
+ },
81
+ "citation": {
82
+ "feature": {
83
+ "dtype": "string",
84
+ "_type": "Value"
85
+ },
86
+ "_type": "Sequence"
87
+ },
88
+ "database": {
89
+ "feature": {
90
+ "dtype": "string",
91
+ "_type": "Value"
92
+ },
93
+ "_type": "Sequence"
94
+ },
95
+ "read_count": {
96
+ "dtype": "int64",
97
+ "_type": "Value"
98
+ },
99
+ "keyword": {
100
+ "feature": {
101
+ "dtype": "string",
102
+ "_type": "Value"
103
+ },
104
+ "_type": "Sequence"
105
+ },
106
+ "reference": {
107
+ "feature": {
108
+ "dtype": "string",
109
+ "_type": "Value"
110
+ },
111
+ "_type": "Sequence"
112
+ },
113
+ "doi": {
114
+ "feature": {
115
+ "dtype": "string",
116
+ "_type": "Value"
117
+ },
118
+ "_type": "Sequence"
119
+ },
120
+ "subfolder": {
121
+ "dtype": "string",
122
+ "_type": "Value"
123
+ },
124
+ "filename": {
125
+ "dtype": "string",
126
+ "_type": "Value"
127
+ },
128
+ "introduction": {
129
+ "dtype": "string",
130
+ "_type": "Value"
131
+ },
132
+ "conclusions": {
133
+ "dtype": "string",
134
+ "_type": "Value"
135
+ },
136
+ "year": {
137
+ "dtype": "int64",
138
+ "_type": "Value"
139
+ },
140
+ "month": {
141
+ "dtype": "int64",
142
+ "_type": "Value"
143
+ },
144
+ "arxiv_id": {
145
+ "dtype": "string",
146
+ "_type": "Value"
147
+ },
148
+ "abstract": {
149
+ "dtype": "string",
150
+ "_type": "Value"
151
+ },
152
+ "failed_ids": {
153
+ "dtype": "bool",
154
+ "_type": "Value"
155
+ },
156
+ "keyword_search": {
157
+ "feature": {
158
+ "dtype": "string",
159
+ "_type": "Value"
160
+ },
161
+ "_type": "Sequence"
162
+ },
163
+ "umap_x": {
164
+ "dtype": "float32",
165
+ "_type": "Value"
166
+ },
167
+ "umap_y": {
168
+ "dtype": "float32",
169
+ "_type": "Value"
170
+ },
171
+ "clust_id": {
172
+ "dtype": "int64",
173
+ "_type": "Value"
174
+ }
175
+ },
176
+ "homepage": "",
177
+ "license": "",
178
+ "size_in_bytes": 6260127939,
179
+ "splits": {
180
+ "train": {
181
+ "name": "train",
182
+ "num_bytes": 4128813829,
183
+ "num_examples": 271544,
184
+ "shard_lengths": [
185
+ 33172,
186
+ 33172,
187
+ 33172,
188
+ 33172,
189
+ 33172,
190
+ 33171,
191
+ 34171,
192
+ 34171,
193
+ 4171
194
+ ],
195
+ "dataset_name": "astro_paper_corpus"
196
+ }
197
+ },
198
+ "version": {
199
+ "version_str": "0.0.0",
200
+ "major": 0,
201
+ "minor": 0,
202
+ "patch": 0
203
+ }
204
+ }
arxiv_corpus/train/state.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00009.arrow"
5
+ },
6
+ {
7
+ "filename": "data-00001-of-00009.arrow"
8
+ },
9
+ {
10
+ "filename": "data-00002-of-00009.arrow"
11
+ },
12
+ {
13
+ "filename": "data-00003-of-00009.arrow"
14
+ },
15
+ {
16
+ "filename": "data-00004-of-00009.arrow"
17
+ },
18
+ {
19
+ "filename": "data-00005-of-00009.arrow"
20
+ },
21
+ {
22
+ "filename": "data-00006-of-00009.arrow"
23
+ },
24
+ {
25
+ "filename": "data-00007-of-00009.arrow"
26
+ },
27
+ {
28
+ "filename": "data-00008-of-00009.arrow"
29
+ }
30
+ ],
31
+ "_fingerprint": "b9db3ec46232aa87",
32
+ "_format_columns": null,
33
+ "_format_kwargs": {},
34
+ "_format_type": null,
35
+ "_output_all_columns": false,
36
+ "_split": "train"
37
+ }
requirements.txt CHANGED
@@ -9,6 +9,7 @@ langchain_openai
9
  langchain_community
10
  langchain_core
11
  openai
 
12
  feedparser
13
  tiktoken
14
  chromadb
 
9
  langchain_community
10
  langchain_core
11
  openai
12
+ anthropic
13
  feedparser
14
  tiktoken
15
  chromadb