KomeijiForce and nielsr (HF staff) committed
Commit a9ba4c0 (verified), 1 Parent(s): d956247

Add pipeline tag, library name, link to paper (#1)


- Add pipeline tag, library name, link to paper (564c4b6c624282cf74e0ef0f745252a00484e4e4)


Co-authored-by: Niels Rogge <[email protected]>

Files changed (1)
  1. README.md +174 -1
README.md CHANGED
@@ -1,9 +1,13 @@
  ---
  license: mit
+ library_name: transformers
+ pipeline_tag: question-answering
  ---

  # Cuckoo 🐦 [[Github]](https://github.com/KomeijiForce/Cuckoo)

+ This repository contains the model of the paper [Cuckoo: An IE Free Rider Hatched by Massive Nutrition in LLM's Nest](https://huggingface.co/papers/2502.11275).
+
  Cuckoo is a small (300M) information extraction (IE) model that imitates the next token prediction paradigm of large language models. Instead of retrieving from the vocabulary, Cuckoo predicts the next tokens by tagging them in the given input context as shown below:

  ![cuckoo](https://github.com/user-attachments/assets/d000f275-82a7-4939-aca8-341c61a774dc)
@@ -155,4 +159,173 @@ sea ['blue']
  fire ['red']
  night []
  ```
- which shows Cuckoo is not extracting any plausible spans but has the knowledge to understand the context.
+ which shows Cuckoo is not extracting any plausible spans but has the knowledge to understand the context.
+
+ # File information
+
+ The repository contains the following file information:
+
+ Filename: special_tokens_map.json
+ Content: {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "cls_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "sep_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
+
+ Filename: tokenizer_config.json
+ Content: {
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "50264": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "errors": "replace",
+ "mask_token": "<mask>",
+ "max_length": 512,
+ "model_max_length": 512,
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "stride": 0,
+ "tokenizer_class": "RobertaTokenizer",
+ "trim_offsets": true,
+ "truncation_side": "right",
+ "truncation_strategy": "longest_first",
+ "unk_token": "<unk>"
+ }
+
+ Filename: merges.txt
+ Content: "Content of the file is larger than 50 KB, too long to display."
+
+ Filename: vocab.json
+ Content: "Content of the file is larger than 50 KB, too long to display."
+
+ Filename: config.json
+ Content: {
+ "_name_or_path": "models/ptr-large-c4-stage9",
+ "architectures": [
+ "RobertaForTokenClassification"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "bos_token_id": 0,
+ "classifier_dropout": null,
+ "eos_token_id": 2,
+ "finetuning_task": "ner",
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 1024,
+ "id2label": {
+ "0": "B",
+ "1": "I",
+ "2": "O"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": 4096,
+ "label2id": {
+ "B": 0,
+ "I": 1,
+ "O": 2
+ },
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 514,
+ "model_type": "roberta",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ "pad_token_id": 1,
+ "position_embedding_type": "absolute",
+ "torch_dtype": "float32",
+ "transformers_version": "4.45.2",
+ "type_vocab_size": 1,
+ "use_cache": true,
+ "vocab_size": 50265
+ }
+
+ Filename: tokenizer.json
+ Content: "Content of the file is larger than 50 KB, too long to display."
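
Based on the README excerpt and config.json above, the checkpoint is a RoBERTa token classifier whose three labels (B, I, O) mark the extracted span directly in the input context rather than generating it from the vocabulary. The following is a minimal sketch of driving such a checkpoint with the transformers library; the repository ID `KomeijiForce/Cuckoo-C4` is a placeholder, and the prompt layout and span decoding are assumptions, not the official Cuckoo recipe (see the linked GitHub repository and paper for that).

```python
# Minimal sketch: extractive tagging with a B/I/O token classifier, as implied by
# config.json (RobertaForTokenClassification, id2label {0: B, 1: I, 2: O}).
# "KomeijiForce/Cuckoo-C4" is a placeholder model ID; the prompt format below is
# an assumption, not the template used in the Cuckoo paper.
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_id = "KomeijiForce/Cuckoo-C4"  # placeholder; replace with the actual repo ID
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

def extract(context: str, prompt: str) -> list[str]:
    """Tag tokens in `context` that answer `prompt`, decoding B/I/O labels into spans."""
    text = f"{context}\n\n{prompt}"  # assumed prompt layout
    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**enc).logits[0]                      # (seq_len, 3)
    labels = [model.config.id2label[i] for i in logits.argmax(-1).tolist()]
    tokens = enc["input_ids"][0].tolist()

    spans, current = [], []
    for tok, lab in zip(tokens, labels):
        if lab == "B":                                       # start of a new predicted span
            if current:
                spans.append(tokenizer.decode(current).strip())
            current = [tok]
        elif lab == "I" and current:                         # continuation of the open span
            current.append(tok)
        else:                                                # "O" closes any open span
            if current:
                spans.append(tokenizer.decode(current).strip())
            current = []
    if current:
        spans.append(tokenizer.decode(current).strip())
    return spans

print(extract("The sky is blue and the grass is green.", "What color is the sky?"))
```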
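The tokenizer_config.json and config.json dumps also pin down the preprocessing and architecture: a RobertaTokenizer capped at 512 tokens with right-side truncation, feeding a 24-layer, 1024-hidden encoder with the 3-way B/I/O head. A short sketch to confirm these settings after download, again with a placeholder model ID:

```python
# Inspect the downloaded tokenizer and config; values should match the dumps above.
# "KomeijiForce/Cuckoo-C4" is a placeholder ID, not necessarily this repository's name.
from transformers import AutoConfig, AutoTokenizer

repo = "KomeijiForce/Cuckoo-C4"  # placeholder
tok = AutoTokenizer.from_pretrained(repo)
cfg = AutoConfig.from_pretrained(repo)

print(tok.__class__.__name__, tok.model_max_length)  # RoBERTa tokenizer, 512
print(cfg.architectures)                             # ['RobertaForTokenClassification']
print(cfg.num_hidden_layers, cfg.hidden_size)        # 24 1024
print(cfg.id2label)                                  # {0: 'B', 1: 'I', 2: 'O'}
```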