Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- checkpoint-2990/config.json +48 -0
- checkpoint-2990/model.safetensors +3 -0
- checkpoint-2990/optimizer.pt +3 -0
- checkpoint-2990/rng_state.pth +3 -0
- checkpoint-2990/scheduler.pt +3 -0
- checkpoint-2990/trainer_state.json +104 -0
- checkpoint-2990/training_args.bin +3 -0
- config.json +48 -0
- model.safetensors +3 -0
- model_card.md +140 -0
- runs/May12_16-32-17_chirchiq/events.out.tfevents.1747060338.chirchiq.3136082.2 +3 -0
- special_tokens_map.json +51 -0
- tokenizer.json +3 -0
- tokenizer_config.json +55 -0
- training_args.bin +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
checkpoint-2990/config.json
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"XLMRobertaForSequenceClassification"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"eos_token_id": 2,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 1024,
|
12 |
+
"id2label": {
|
13 |
+
"0": "business_rnd_innovation",
|
14 |
+
"1": "fellowships_scholarships",
|
15 |
+
"2": "institutional_funding",
|
16 |
+
"3": "networking_collaborative",
|
17 |
+
"4": "other_research_funding",
|
18 |
+
"5": "out_of_scope",
|
19 |
+
"6": "project_grants_public",
|
20 |
+
"7": "research_infrastructure"
|
21 |
+
},
|
22 |
+
"initializer_range": 0.02,
|
23 |
+
"intermediate_size": 4096,
|
24 |
+
"label2id": {
|
25 |
+
"business_rnd_innovation": 0,
|
26 |
+
"fellowships_scholarships": 1,
|
27 |
+
"institutional_funding": 2,
|
28 |
+
"networking_collaborative": 3,
|
29 |
+
"other_research_funding": 4,
|
30 |
+
"out_of_scope": 5,
|
31 |
+
"project_grants_public": 6,
|
32 |
+
"research_infrastructure": 7
|
33 |
+
},
|
34 |
+
"layer_norm_eps": 1e-05,
|
35 |
+
"max_position_embeddings": 514,
|
36 |
+
"model_type": "xlm-roberta",
|
37 |
+
"num_attention_heads": 16,
|
38 |
+
"num_hidden_layers": 24,
|
39 |
+
"output_past": true,
|
40 |
+
"pad_token_id": 1,
|
41 |
+
"position_embedding_type": "absolute",
|
42 |
+
"problem_type": "single_label_classification",
|
43 |
+
"torch_dtype": "float32",
|
44 |
+
"transformers_version": "4.51.3",
|
45 |
+
"type_vocab_size": 1,
|
46 |
+
"use_cache": true,
|
47 |
+
"vocab_size": 250002
|
48 |
+
}
|
checkpoint-2990/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e36bdc2610d08516d7ddbae7c87e2c9b52bc342b2cc82623abd7dd313210ab3f
|
3 |
+
size 2239643272
|
checkpoint-2990/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f9f03d46db8f129af1cc6185169cab8b00b82037be4816b92b119031854a32cb
|
3 |
+
size 4479527522
|
checkpoint-2990/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:66f009a1bddfd412720740b742f276612d37f4bcd6227cf1588f71352d564ca8
|
3 |
+
size 14645
|
checkpoint-2990/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:18439f5f83692594c2ba03ec3af20dbaafd6b5f04eb26b9417057d357694ece2
|
3 |
+
size 1465
|
checkpoint-2990/trainer_state.json
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_global_step": null,
|
3 |
+
"best_metric": null,
|
4 |
+
"best_model_checkpoint": null,
|
5 |
+
"epoch": 10.0,
|
6 |
+
"eval_steps": 500,
|
7 |
+
"global_step": 2990,
|
8 |
+
"is_hyper_param_search": false,
|
9 |
+
"is_local_process_zero": true,
|
10 |
+
"is_world_process_zero": true,
|
11 |
+
"log_history": [
|
12 |
+
{
|
13 |
+
"epoch": 1.0,
|
14 |
+
"grad_norm": 118.27540588378906,
|
15 |
+
"learning_rate": 1.8006688963210704e-05,
|
16 |
+
"loss": 1.149,
|
17 |
+
"step": 299
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"epoch": 2.0,
|
21 |
+
"grad_norm": 1.5980421304702759,
|
22 |
+
"learning_rate": 1.6006688963210703e-05,
|
23 |
+
"loss": 0.5784,
|
24 |
+
"step": 598
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"epoch": 3.0,
|
28 |
+
"grad_norm": 0.44770318269729614,
|
29 |
+
"learning_rate": 1.4006688963210704e-05,
|
30 |
+
"loss": 0.3926,
|
31 |
+
"step": 897
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"epoch": 4.0,
|
35 |
+
"grad_norm": 0.40715572237968445,
|
36 |
+
"learning_rate": 1.2006688963210704e-05,
|
37 |
+
"loss": 0.2783,
|
38 |
+
"step": 1196
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"epoch": 5.0,
|
42 |
+
"grad_norm": 22.37042236328125,
|
43 |
+
"learning_rate": 1.0006688963210703e-05,
|
44 |
+
"loss": 0.1735,
|
45 |
+
"step": 1495
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"epoch": 6.0,
|
49 |
+
"grad_norm": 0.030705075711011887,
|
50 |
+
"learning_rate": 8.006688963210702e-06,
|
51 |
+
"loss": 0.1021,
|
52 |
+
"step": 1794
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"epoch": 7.0,
|
56 |
+
"grad_norm": 0.011579714715480804,
|
57 |
+
"learning_rate": 6.006688963210703e-06,
|
58 |
+
"loss": 0.0736,
|
59 |
+
"step": 2093
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"epoch": 8.0,
|
63 |
+
"grad_norm": 0.012278182432055473,
|
64 |
+
"learning_rate": 4.006688963210703e-06,
|
65 |
+
"loss": 0.0391,
|
66 |
+
"step": 2392
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"epoch": 9.0,
|
70 |
+
"grad_norm": 0.006934181787073612,
|
71 |
+
"learning_rate": 2.0066889632107025e-06,
|
72 |
+
"loss": 0.0145,
|
73 |
+
"step": 2691
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"epoch": 10.0,
|
77 |
+
"grad_norm": 0.021311219781637192,
|
78 |
+
"learning_rate": 6.688963210702342e-09,
|
79 |
+
"loss": 0.0131,
|
80 |
+
"step": 2990
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"logging_steps": 500,
|
84 |
+
"max_steps": 2990,
|
85 |
+
"num_input_tokens_seen": 0,
|
86 |
+
"num_train_epochs": 10,
|
87 |
+
"save_steps": 500,
|
88 |
+
"stateful_callbacks": {
|
89 |
+
"TrainerControl": {
|
90 |
+
"args": {
|
91 |
+
"should_epoch_stop": false,
|
92 |
+
"should_evaluate": false,
|
93 |
+
"should_log": false,
|
94 |
+
"should_save": true,
|
95 |
+
"should_training_stop": true
|
96 |
+
},
|
97 |
+
"attributes": {}
|
98 |
+
}
|
99 |
+
},
|
100 |
+
"total_flos": 2.223633311121408e+16,
|
101 |
+
"train_batch_size": 8,
|
102 |
+
"trial_name": null,
|
103 |
+
"trial_params": null
|
104 |
+
}
|
checkpoint-2990/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f4d201db73713b21d5af27fb0dff2b381cec5a9982385f1d35f1c8eac55fec23
|
3 |
+
size 5777
|
config.json
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"XLMRobertaForSequenceClassification"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"eos_token_id": 2,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 1024,
|
12 |
+
"id2label": {
|
13 |
+
"0": "business_rnd_innovation",
|
14 |
+
"1": "fellowships_scholarships",
|
15 |
+
"2": "institutional_funding",
|
16 |
+
"3": "networking_collaborative",
|
17 |
+
"4": "other_research_funding",
|
18 |
+
"5": "out_of_scope",
|
19 |
+
"6": "project_grants_public",
|
20 |
+
"7": "research_infrastructure"
|
21 |
+
},
|
22 |
+
"initializer_range": 0.02,
|
23 |
+
"intermediate_size": 4096,
|
24 |
+
"label2id": {
|
25 |
+
"business_rnd_innovation": 0,
|
26 |
+
"fellowships_scholarships": 1,
|
27 |
+
"institutional_funding": 2,
|
28 |
+
"networking_collaborative": 3,
|
29 |
+
"other_research_funding": 4,
|
30 |
+
"out_of_scope": 5,
|
31 |
+
"project_grants_public": 6,
|
32 |
+
"research_infrastructure": 7
|
33 |
+
},
|
34 |
+
"layer_norm_eps": 1e-05,
|
35 |
+
"max_position_embeddings": 514,
|
36 |
+
"model_type": "xlm-roberta",
|
37 |
+
"num_attention_heads": 16,
|
38 |
+
"num_hidden_layers": 24,
|
39 |
+
"output_past": true,
|
40 |
+
"pad_token_id": 1,
|
41 |
+
"position_embedding_type": "absolute",
|
42 |
+
"problem_type": "single_label_classification",
|
43 |
+
"torch_dtype": "float32",
|
44 |
+
"transformers_version": "4.51.3",
|
45 |
+
"type_vocab_size": 1,
|
46 |
+
"use_cache": true,
|
47 |
+
"vocab_size": 250002
|
48 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e36bdc2610d08516d7ddbae7c87e2c9b52bc342b2cc82623abd7dd313210ab3f
|
3 |
+
size 2239643272
|
model_card.md
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language:
|
3 |
+
- multilingual
|
4 |
+
license: mit
|
5 |
+
tags:
|
6 |
+
- grant-classification
|
7 |
+
- research-funding
|
8 |
+
- oecd
|
9 |
+
- multilingual
|
10 |
+
- text-classification
|
11 |
+
datasets:
|
12 |
+
- your-username/grant-classification-dataset
|
13 |
+
base_model: intfloat/multilingual-e5-large
|
14 |
+
model-index:
|
15 |
+
- name: Grant Classification Model
|
16 |
+
results:
|
17 |
+
- task:
|
18 |
+
type: text-classification
|
19 |
+
name: Research Grant Classification
|
20 |
+
---
|
21 |
+
|
22 |
+
|
23 |
+
# Grant Classification Model
|
24 |
+
|
25 |
+
This model classifies research grants according to a custom taxonomy based on OECD's categorization of science, technology, and innovation (STI) policy instruments.
|
26 |
+
|
27 |
+
## Model Description
|
28 |
+
|
29 |
+
- **Model architecture**: Fine-tuned version of [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large)
|
30 |
+
- **Language(s)**: Multilingual
|
31 |
+
- **License**: MIT
|
32 |
+
- **Limitations**: The model is specialized for grant classification and may not perform well on other text classification tasks
|
33 |
+
|
34 |
+
## Usage
|
35 |
+
|
36 |
+
### Basic usage
|
37 |
+
|
38 |
+
```python
|
39 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
40 |
+
|
41 |
+
# Load model and tokenizer
|
42 |
+
model_name = "your-username/grant-classification-model"
|
43 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
44 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
45 |
+
|
46 |
+
# Create classification pipeline
|
47 |
+
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
|
48 |
+
|
49 |
+
# Example grant text
|
50 |
+
grant_text = """
|
51 |
+
Title: Advancing Quantum Computing Applications in Drug Discovery
|
52 |
+
Abstract: This project aims to develop novel quantum algorithms for simulating molecular interactions to accelerate the drug discovery process. The research will focus on overcoming current limitations in quantum hardware by developing error-mitigation techniques specific to chemistry applications.
|
53 |
+
Funder: National Science Foundation
|
54 |
+
Funding Scheme: Quantum Leap Challenge Institutes
|
55 |
+
Beneficiary: University of California, Berkeley
|
56 |
+
"""
|
57 |
+
|
58 |
+
# Get prediction
|
59 |
+
# Truncate to the 512-token limit the tokenizer/model were configured with;
# longer grant texts would otherwise exceed the model's maximum input length.
result = classifier(grant_text, truncation=True, max_length=512)
|
60 |
+
print(f"Predicted category: {result[0]['label']}")
|
61 |
+
print(f"Confidence: {result[0]['score']:.4f}")
|
62 |
+
```
|
63 |
+
|
64 |
+
### Batch processing for multiple grants
|
65 |
+
|
66 |
+
```python
|
67 |
+
import pandas as pd
|
68 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
|
69 |
+
|
70 |
+
# Load model and tokenizer
|
71 |
+
model_name = "your-username/grant-classification-model"
|
72 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
73 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
74 |
+
|
75 |
+
# Create classification pipeline
|
76 |
+
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
|
77 |
+
|
78 |
+
# Function to prepare grant text
|
79 |
+
def prepare_grant_text(row):
    """Build the single text string passed to the classifier for one grant.

    Args:
        row: A mapping-like record exposing ``.get`` (for example a
            ``pandas.Series`` produced by ``DataFrame.apply(..., axis=1)`` or
            a plain ``dict``) that may contain the keys ``title``,
            ``abstract``, ``funder``, ``funding_scheme`` and ``beneficiary``.

    Returns:
        The available fields formatted as ``"<Label>: <value>"`` lines,
        joined with newlines in the fixed order listed above. Fields that are
        absent, ``None``, NaN or otherwise empty are skipped; an empty string
        is returned when no field is available.
    """
    labelled_fields = (
        ("Title", "title"),
        ("Abstract", "abstract"),
        ("Funder", "funder"),
        ("Funding Scheme", "funding_scheme"),
        ("Beneficiary", "beneficiary"),
    )
    parts = []
    for label, key in labelled_fields:
        value = row.get(key)
        # Empty CSV cells are loaded as NaN, which is truthy: a plain
        # truthiness check would emit lines such as "Title: nan".
        if value is None or pd.isna(value):
            continue
        # Preserve the original behaviour of skipping empty strings.
        if not value:
            continue
        parts.append(f"{label}: {value}")
    return "\n".join(parts)
|
92 |
+
|
93 |
+
# Example data
|
94 |
+
grants_df = pd.read_csv("grants.csv")
|
95 |
+
grants_df['text_for_model'] = grants_df.apply(prepare_grant_text, axis=1)
|
96 |
+
|
97 |
+
# Classify grants
|
98 |
+
# Truncate to the 512-token limit the tokenizer/model were configured with;
# longer grant texts would otherwise exceed the model's maximum input length.
results = classifier(grants_df['text_for_model'].tolist(), truncation=True, max_length=512)
|
99 |
+
|
100 |
+
# Add results to dataframe
|
101 |
+
grants_df['predicted_category'] = [r['label'] for r in results]
|
102 |
+
grants_df['confidence'] = [r['score'] for r in results]
|
103 |
+
```
|
104 |
+
|
105 |
+
## Classification Categories
|
106 |
+
|
107 |
+
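The model classifies grants into the following categories (listed in label-id order; the ids in `config.json` run from 0 to 7, so item 1 below is label id 0):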
|
108 |
+
|
109 |
+
1. **business_rnd_innovation**: Direct allocation of funding to private firms for R&D and innovation activities with commercial applications
|
110 |
+
2. **fellowships_scholarships**: Financial support for individual researchers or higher education students
|
111 |
+
3. **institutional_funding**: Core funding for higher education institutions and public research institutes
|
112 |
+
4. **networking_collaborative**: Tools to bring together various actors within the innovation system
|
113 |
+
5. **other_research_funding**: Alternative funding mechanisms for R&D or higher education
|
114 |
+
6. **out_of_scope**: Grants unrelated to research, development, or innovation
|
115 |
+
7. **project_grants_public**: Direct funding for specific research projects in public institutions
|
116 |
+
8. **research_infrastructure**: Funding for research facilities, equipment, and resources
|
117 |
+
|
118 |
+
## Training
|
119 |
+
|
120 |
+
This model was fine-tuned on a dataset of grant documents with annotations derived from a consensus of multiple LLM predictions (Gemma, Mistral, Qwen) and human validation.
|
121 |
+
The training process included:
|
122 |
+
|
123 |
+
- Base model: [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large)
|
124 |
+
- Training approach: Fine-tuning with early stopping (the uploaded run completed all 10 epochs / 2,990 steps, so early stopping did not trigger)
|
125 |
+
- Optimization: AdamW optimizer with weight decay
|
126 |
+
- Sequence length: 512 tokens
|
127 |
+
- Batch size: 8
|
128 |
+
- Learning rate: 2e-5
|
129 |
+
|
130 |
+
## Citation and References
|
131 |
+
|
132 |
+
This model is based on a custom taxonomy derived from the OECD's categorization of science, technology, and innovation (STI) policy instruments.
|
133 |
+
For more information, see:
|
134 |
+
|
135 |
+
EC/OECD (2023), STIP Survey, https://stip.oecd.org
|
136 |
+
|
137 |
+
## Acknowledgements
|
138 |
+
|
139 |
+
- The model builds upon [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large)
|
140 |
+
|
runs/May12_16-32-17_chirchiq/events.out.tfevents.1747060338.chirchiq.3136082.2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:99c79982739b55c6bfbde3571c3b715cc6e4a7d9065a12d22a3daa8ba2201a55
|
3 |
+
size 8124
|
special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"cls_token": {
|
10 |
+
"content": "<s>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"eos_token": {
|
17 |
+
"content": "</s>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"mask_token": {
|
24 |
+
"content": "<mask>",
|
25 |
+
"lstrip": true,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"pad_token": {
|
31 |
+
"content": "<pad>",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
},
|
37 |
+
"sep_token": {
|
38 |
+
"content": "</s>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": false,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false
|
43 |
+
},
|
44 |
+
"unk_token": {
|
45 |
+
"content": "<unk>",
|
46 |
+
"lstrip": false,
|
47 |
+
"normalized": false,
|
48 |
+
"rstrip": false,
|
49 |
+
"single_word": false
|
50 |
+
}
|
51 |
+
}
|
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3ffb37461c391f096759f4a9bbbc329da0f36952f88bab061fcf84940c022e98
|
3 |
+
size 17082999
|
tokenizer_config.json
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "<s>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "<pad>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"2": {
|
20 |
+
"content": "</s>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"3": {
|
28 |
+
"content": "<unk>",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"250001": {
|
36 |
+
"content": "<mask>",
|
37 |
+
"lstrip": true,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"bos_token": "<s>",
|
45 |
+
"clean_up_tokenization_spaces": true,
|
46 |
+
"cls_token": "<s>",
|
47 |
+
"eos_token": "</s>",
|
48 |
+
"extra_special_tokens": {},
|
49 |
+
"mask_token": "<mask>",
|
50 |
+
"model_max_length": 512,
|
51 |
+
"pad_token": "<pad>",
|
52 |
+
"sep_token": "</s>",
|
53 |
+
"tokenizer_class": "XLMRobertaTokenizer",
|
54 |
+
"unk_token": "<unk>"
|
55 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f4d201db73713b21d5af27fb0dff2b381cec5a9982385f1d35f1c8eac55fec23
|
3 |
+
size 5777
|