PabloAccuosto committed · verified
Commit 97f9351 · 1 Parent(s): 2840e94

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint-2990/config.json ADDED
@@ -0,0 +1,48 @@
+{
+  "architectures": [
+    "XLMRobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "business_rnd_innovation",
+    "1": "fellowships_scholarships",
+    "2": "institutional_funding",
+    "3": "networking_collaborative",
+    "4": "other_research_funding",
+    "5": "out_of_scope",
+    "6": "project_grants_public",
+    "7": "research_infrastructure"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "label2id": {
+    "business_rnd_innovation": 0,
+    "fellowships_scholarships": 1,
+    "institutional_funding": 2,
+    "networking_collaborative": 3,
+    "other_research_funding": 4,
+    "out_of_scope": 5,
+    "project_grants_public": 6,
+    "research_infrastructure": 7
+  },
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
checkpoint-2990/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e36bdc2610d08516d7ddbae7c87e2c9b52bc342b2cc82623abd7dd313210ab3f
+size 2239643272
checkpoint-2990/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9f03d46db8f129af1cc6185169cab8b00b82037be4816b92b119031854a32cb
+size 4479527522
checkpoint-2990/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66f009a1bddfd412720740b742f276612d37f4bcd6227cf1588f71352d564ca8
+size 14645
checkpoint-2990/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18439f5f83692594c2ba03ec3af20dbaafd6b5f04eb26b9417057d357694ece2
+size 1465
checkpoint-2990/trainer_state.json ADDED
@@ -0,0 +1,104 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 10.0,
+  "eval_steps": 500,
+  "global_step": 2990,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.0,
+      "grad_norm": 118.27540588378906,
+      "learning_rate": 1.8006688963210704e-05,
+      "loss": 1.149,
+      "step": 299
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.5980421304702759,
+      "learning_rate": 1.6006688963210703e-05,
+      "loss": 0.5784,
+      "step": 598
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.44770318269729614,
+      "learning_rate": 1.4006688963210704e-05,
+      "loss": 0.3926,
+      "step": 897
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.40715572237968445,
+      "learning_rate": 1.2006688963210704e-05,
+      "loss": 0.2783,
+      "step": 1196
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 22.37042236328125,
+      "learning_rate": 1.0006688963210703e-05,
+      "loss": 0.1735,
+      "step": 1495
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.030705075711011887,
+      "learning_rate": 8.006688963210702e-06,
+      "loss": 0.1021,
+      "step": 1794
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 0.011579714715480804,
+      "learning_rate": 6.006688963210703e-06,
+      "loss": 0.0736,
+      "step": 2093
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 0.012278182432055473,
+      "learning_rate": 4.006688963210703e-06,
+      "loss": 0.0391,
+      "step": 2392
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 0.006934181787073612,
+      "learning_rate": 2.0066889632107025e-06,
+      "loss": 0.0145,
+      "step": 2691
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.021311219781637192,
+      "learning_rate": 6.688963210702342e-09,
+      "loss": 0.0131,
+      "step": 2990
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 2990,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.223633311121408e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
checkpoint-2990/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4d201db73713b21d5af27fb0dff2b381cec5a9982385f1d35f1c8eac55fec23
+size 5777
config.json ADDED
@@ -0,0 +1,48 @@
+{
+  "architectures": [
+    "XLMRobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "business_rnd_innovation",
+    "1": "fellowships_scholarships",
+    "2": "institutional_funding",
+    "3": "networking_collaborative",
+    "4": "other_research_funding",
+    "5": "out_of_scope",
+    "6": "project_grants_public",
+    "7": "research_infrastructure"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "label2id": {
+    "business_rnd_innovation": 0,
+    "fellowships_scholarships": 1,
+    "institutional_funding": 2,
+    "networking_collaborative": 3,
+    "other_research_funding": 4,
+    "out_of_scope": 5,
+    "project_grants_public": 6,
+    "research_infrastructure": 7
+  },
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e36bdc2610d08516d7ddbae7c87e2c9b52bc342b2cc82623abd7dd313210ab3f
+size 2239643272
model_card.md ADDED
@@ -0,0 +1,140 @@
+---
+language:
+- multilingual
+license: mit
+tags:
+- grant-classification
+- research-funding
+- oecd
+- multilingual
+- text-classification
+datasets:
+- your-username/grant-classification-dataset
+base_model: intfloat/multilingual-e5-large
+model-index:
+- name: Grant Classification Model
+  results:
+  - task:
+      type: text-classification
+      name: Research Grant Classification
+---
+
+# Grant Classification Model
+
+This model classifies research grants according to a custom taxonomy based on the OECD's categorization of science, technology, and innovation (STI) policy instruments.
+
+## Model Description
+
+- **Model architecture**: Fine-tuned version of [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large)
+- **Language(s)**: Multilingual
+- **License**: MIT
+- **Limitations**: The model is specialized for grant classification and may not perform well on other text classification tasks.
+
+## Usage
+
+### Basic usage
+
+```python
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+
+# Load the model and tokenizer
+model_name = "your-username/grant-classification-model"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+# Create a classification pipeline
+classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
+
+# Example grant text
+grant_text = """
+Title: Advancing Quantum Computing Applications in Drug Discovery
+Abstract: This project aims to develop novel quantum algorithms for simulating molecular interactions to accelerate the drug discovery process. The research will focus on overcoming current limitations in quantum hardware by developing error-mitigation techniques specific to chemistry applications.
+Funder: National Science Foundation
+Funding Scheme: Quantum Leap Challenge Institutes
+Beneficiary: University of California, Berkeley
+"""
+
+# Get the prediction, truncating inputs to the model's 512-token limit
+result = classifier(grant_text, truncation=True)
+print(f"Predicted category: {result[0]['label']}")
+print(f"Confidence: {result[0]['score']:.4f}")
+```
+
+### Batch processing for multiple grants
+
+```python
+import pandas as pd
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
+
+# Load the model and tokenizer
+model_name = "your-username/grant-classification-model"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+# Create a classification pipeline
+classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
+
+# Assemble the structured input text for one grant record
+def prepare_grant_text(row):
+    parts = []
+    if row.get('title'):
+        parts.append(f"Title: {row['title']}")
+    if row.get('abstract'):
+        parts.append(f"Abstract: {row['abstract']}")
+    if row.get('funder'):
+        parts.append(f"Funder: {row['funder']}")
+    if row.get('funding_scheme'):
+        parts.append(f"Funding Scheme: {row['funding_scheme']}")
+    if row.get('beneficiary'):
+        parts.append(f"Beneficiary: {row['beneficiary']}")
+    return "\n".join(parts)
+
+# Example data
+grants_df = pd.read_csv("grants.csv")
+grants_df['text_for_model'] = grants_df.apply(prepare_grant_text, axis=1)
+
+# Classify the grants, truncating inputs to the model's 512-token limit
+results = classifier(grants_df['text_for_model'].tolist(), truncation=True)
+
+# Add the results to the dataframe
+grants_df['predicted_category'] = [r['label'] for r in results]
+grants_df['confidence'] = [r['score'] for r in results]
+```
+
+## Classification Categories
+
+The model classifies grants into the following categories (the sketch after the list shows how they map to model outputs):
+
+1. **business_rnd_innovation**: Direct allocation of funding to private firms for R&D and innovation activities with commercial applications
+2. **fellowships_scholarships**: Financial support for individual researchers or higher education students
+3. **institutional_funding**: Core funding for higher education institutions and public research institutes
+4. **networking_collaborative**: Tools to bring together various actors within the innovation system
+5. **other_research_funding**: Alternative funding mechanisms for R&D or higher education
+6. **out_of_scope**: Grants unrelated to research, development, or innovation
+7. **project_grants_public**: Direct funding for specific research projects in public institutions
+8. **research_infrastructure**: Funding for research facilities, equipment, and resources
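+
+These names match the `id2label` mapping in the `config.json` shown earlier, so the pipeline's `label` field is always one of these eight strings. A minimal sketch for reading the mapping programmatically (using the same placeholder repository id as the usage examples above):
+
+```python
+from transformers import AutoConfig
+
+# Placeholder repo id, as in the usage examples above
+config = AutoConfig.from_pretrained("your-username/grant-classification-model")
+print(config.id2label)
+# {0: 'business_rnd_innovation', ..., 7: 'research_infrastructure'}
+```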
+
+## Training
+
+This model was fine-tuned on a dataset of grant documents whose annotations were derived from a consensus of multiple LLM predictions (Gemma, Mistral, Qwen), followed by human validation.
+The training process included (a configuration sketch follows the list):
+
+- Base model: [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large)
+- Training approach: Fine-tuning with early stopping
+- Optimization: AdamW optimizer with weight decay
+- Sequence length: 512 tokens
+- Batch size: 8
+- Learning rate: 2e-5
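+
+A minimal Trainer sketch of this setup (the epoch count and batch size match `trainer_state.json` in this commit; the output path, weight-decay value, evaluation strategy, and early-stopping patience are illustrative assumptions, not the recorded configuration):
+
+```python
+from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="grant-classifier",       # assumed output path
+    num_train_epochs=10,                 # matches trainer_state.json
+    per_device_train_batch_size=8,
+    learning_rate=2e-5,
+    weight_decay=0.01,                   # assumed value for "AdamW with weight decay"
+    eval_strategy="epoch",               # assumed; early stopping needs periodic evaluation
+    save_strategy="epoch",               # must match eval_strategy for load_best_model_at_end
+    load_best_model_at_end=True,
+)
+
+trainer = Trainer(
+    model=model,                         # the sequence-classification model loaded as above
+    args=training_args,
+    train_dataset=train_dataset,         # assumed: tokenized train/validation splits
+    eval_dataset=eval_dataset,
+    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # assumed patience
+)
+trainer.train()
+```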
+
+## Citation and References
+
+This model is based on a custom taxonomy derived from the OECD's categorization of science, technology, and innovation (STI) policy instruments.
+For more information, see:
+
+EC/OECD (2023), STIP Survey, https://stip.oecd.org
+
+## Acknowledgements
+
+- The model builds upon [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large)
runs/May12_16-32-17_chirchiq/events.out.tfevents.1747060338.chirchiq.3136082.2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99c79982739b55c6bfbde3571c3b715cc6e4a7d9065a12d22a3daa8ba2201a55
+size 8124
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ffb37461c391f096759f4a9bbbc329da0f36952f88bab061fcf84940c022e98
+size 17082999
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}
training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4d201db73713b21d5af27fb0dff2b381cec5a9982385f1d35f1c8eac55fec23
+size 5777