PabloAccuosto committed · verified
Commit 97f9351 · 1 Parent(s): 2840e94

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint-2990/config.json ADDED
@@ -0,0 +1,48 @@
+{
+  "architectures": [
+    "XLMRobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "business_rnd_innovation",
+    "1": "fellowships_scholarships",
+    "2": "institutional_funding",
+    "3": "networking_collaborative",
+    "4": "other_research_funding",
+    "5": "out_of_scope",
+    "6": "project_grants_public",
+    "7": "research_infrastructure"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "label2id": {
+    "business_rnd_innovation": 0,
+    "fellowships_scholarships": 1,
+    "institutional_funding": 2,
+    "networking_collaborative": 3,
+    "other_research_funding": 4,
+    "out_of_scope": 5,
+    "project_grants_public": 6,
+    "research_infrastructure": 7
+  },
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
checkpoint-2990/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e36bdc2610d08516d7ddbae7c87e2c9b52bc342b2cc82623abd7dd313210ab3f
+size 2239643272
checkpoint-2990/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9f03d46db8f129af1cc6185169cab8b00b82037be4816b92b119031854a32cb
+size 4479527522
checkpoint-2990/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66f009a1bddfd412720740b742f276612d37f4bcd6227cf1588f71352d564ca8
+size 14645
checkpoint-2990/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18439f5f83692594c2ba03ec3af20dbaafd6b5f04eb26b9417057d357694ece2
+size 1465
checkpoint-2990/trainer_state.json ADDED
@@ -0,0 +1,104 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 10.0,
+  "eval_steps": 500,
+  "global_step": 2990,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.0,
+      "grad_norm": 118.27540588378906,
+      "learning_rate": 1.8006688963210704e-05,
+      "loss": 1.149,
+      "step": 299
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.5980421304702759,
+      "learning_rate": 1.6006688963210703e-05,
+      "loss": 0.5784,
+      "step": 598
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.44770318269729614,
+      "learning_rate": 1.4006688963210704e-05,
+      "loss": 0.3926,
+      "step": 897
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.40715572237968445,
+      "learning_rate": 1.2006688963210704e-05,
+      "loss": 0.2783,
+      "step": 1196
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 22.37042236328125,
+      "learning_rate": 1.0006688963210703e-05,
+      "loss": 0.1735,
+      "step": 1495
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.030705075711011887,
+      "learning_rate": 8.006688963210702e-06,
+      "loss": 0.1021,
+      "step": 1794
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 0.011579714715480804,
+      "learning_rate": 6.006688963210703e-06,
+      "loss": 0.0736,
+      "step": 2093
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 0.012278182432055473,
+      "learning_rate": 4.006688963210703e-06,
+      "loss": 0.0391,
+      "step": 2392
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 0.006934181787073612,
+      "learning_rate": 2.0066889632107025e-06,
+      "loss": 0.0145,
+      "step": 2691
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.021311219781637192,
+      "learning_rate": 6.688963210702342e-09,
+      "loss": 0.0131,
+      "step": 2990
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 2990,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.223633311121408e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
checkpoint-2990/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4d201db73713b21d5af27fb0dff2b381cec5a9982385f1d35f1c8eac55fec23
+size 5777
config.json ADDED
@@ -0,0 +1,48 @@
+{
+  "architectures": [
+    "XLMRobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "business_rnd_innovation",
+    "1": "fellowships_scholarships",
+    "2": "institutional_funding",
+    "3": "networking_collaborative",
+    "4": "other_research_funding",
+    "5": "out_of_scope",
+    "6": "project_grants_public",
+    "7": "research_infrastructure"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "label2id": {
+    "business_rnd_innovation": 0,
+    "fellowships_scholarships": 1,
+    "institutional_funding": 2,
+    "networking_collaborative": 3,
+    "other_research_funding": 4,
+    "out_of_scope": 5,
+    "project_grants_public": 6,
+    "research_infrastructure": 7
+  },
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e36bdc2610d08516d7ddbae7c87e2c9b52bc342b2cc82623abd7dd313210ab3f
+size 2239643272
model_card.md ADDED
@@ -0,0 +1,140 @@
+---
+language:
+- multilingual
+license: mit
+tags:
+- grant-classification
+- research-funding
+- oecd
+- multilingual
+- text-classification
+datasets:
+- your-username/grant-classification-dataset
+base_model: intfloat/multilingual-e5-large
+model-index:
+- name: Grant Classification Model
+  results:
+  - task:
+      type: text-classification
+      name: Research Grant Classification
+---
+
+# Grant Classification Model
+
+This model classifies research grants according to a custom taxonomy based on the OECD's categorization of science, technology, and innovation (STI) policy instruments.
+
+## Model Description
+
+- **Model architecture**: Fine-tuned version of [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large)
+- **Language(s)**: Multilingual
+- **License**: MIT
+- **Limitations**: The model is specialized for grant classification and may not perform well on other text classification tasks.
+
+## Usage
+
+### Basic usage
+
+```python
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+
+# Load the model and tokenizer
+model_name = "your-username/grant-classification-model"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+# Create a classification pipeline
+classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
+
+# Example grant text
+grant_text = """
+Title: Advancing Quantum Computing Applications in Drug Discovery
+Abstract: This project aims to develop novel quantum algorithms for simulating molecular interactions to accelerate the drug discovery process. The research will focus on overcoming current limitations in quantum hardware by developing error-mitigation techniques specific to chemistry applications.
+Funder: National Science Foundation
+Funding Scheme: Quantum Leap Challenge Institutes
+Beneficiary: University of California, Berkeley
+"""
+
+# Get the prediction, truncating inputs to the model's 512-token limit
+result = classifier(grant_text, truncation=True)
+print(f"Predicted category: {result[0]['label']}")
+print(f"Confidence: {result[0]['score']:.4f}")
+```
+
+### Batch processing for multiple grants
+
+```python
+import pandas as pd
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
+
+# Load the model and tokenizer
+model_name = "your-username/grant-classification-model"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+# Create a classification pipeline
+classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
+
+# Assemble the structured input text for one grant record
+def prepare_grant_text(row):
+    parts = []
+    if row.get('title'):
+        parts.append(f"Title: {row['title']}")
+    if row.get('abstract'):
+        parts.append(f"Abstract: {row['abstract']}")
+    if row.get('funder'):
+        parts.append(f"Funder: {row['funder']}")
+    if row.get('funding_scheme'):
+        parts.append(f"Funding Scheme: {row['funding_scheme']}")
+    if row.get('beneficiary'):
+        parts.append(f"Beneficiary: {row['beneficiary']}")
+    return "\n".join(parts)
+
+# Example data
+grants_df = pd.read_csv("grants.csv")
+grants_df['text_for_model'] = grants_df.apply(prepare_grant_text, axis=1)
+
+# Classify the grants, truncating inputs to the model's 512-token limit
+results = classifier(grants_df['text_for_model'].tolist(), truncation=True)
+
+# Add the results to the dataframe
+grants_df['predicted_category'] = [r['label'] for r in results]
+grants_df['confidence'] = [r['score'] for r in results]
+```
+
+## Classification Categories
+
+The model classifies grants into the following categories (the sketch after the list shows how they map to model outputs):
+
+1. **business_rnd_innovation**: Direct allocation of funding to private firms for R&D and innovation activities with commercial applications
+2. **fellowships_scholarships**: Financial support for individual researchers or higher education students
+3. **institutional_funding**: Core funding for higher education institutions and public research institutes
+4. **networking_collaborative**: Tools to bring together various actors within the innovation system
+5. **other_research_funding**: Alternative funding mechanisms for R&D or higher education
+6. **out_of_scope**: Grants unrelated to research, development, or innovation
+7. **project_grants_public**: Direct funding for specific research projects in public institutions
+8. **research_infrastructure**: Funding for research facilities, equipment, and resources
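+
+These names match the `id2label` mapping in the `config.json` shown earlier, so the pipeline's `label` field is always one of these eight strings. A minimal sketch for reading the mapping programmatically (using the same placeholder repository id as the usage examples above):
+
+```python
+from transformers import AutoConfig
+
+# Placeholder repo id, as in the usage examples above
+config = AutoConfig.from_pretrained("your-username/grant-classification-model")
+print(config.id2label)
+# {0: 'business_rnd_innovation', ..., 7: 'research_infrastructure'}
+```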
+
+## Training
+
+This model was fine-tuned on a dataset of grant documents whose annotations were derived from a consensus of multiple LLM predictions (Gemma, Mistral, Qwen), followed by human validation.
+The training process included (a configuration sketch follows the list):
+
+- Base model: [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large)
+- Training approach: Fine-tuning with early stopping
+- Optimization: AdamW optimizer with weight decay
+- Sequence length: 512 tokens
+- Batch size: 8
+- Learning rate: 2e-5
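+
+A minimal Trainer sketch of this setup (the epoch count and batch size match `trainer_state.json` in this commit; the output path, weight-decay value, evaluation strategy, and early-stopping patience are illustrative assumptions, not the recorded configuration):
+
+```python
+from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="grant-classifier",       # assumed output path
+    num_train_epochs=10,                 # matches trainer_state.json
+    per_device_train_batch_size=8,
+    learning_rate=2e-5,
+    weight_decay=0.01,                   # assumed value for "AdamW with weight decay"
+    eval_strategy="epoch",               # assumed; early stopping needs periodic evaluation
+    save_strategy="epoch",               # must match eval_strategy for load_best_model_at_end
+    load_best_model_at_end=True,
+)
+
+trainer = Trainer(
+    model=model,                         # the sequence-classification model loaded as above
+    args=training_args,
+    train_dataset=train_dataset,         # assumed: tokenized train/validation splits
+    eval_dataset=eval_dataset,
+    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # assumed patience
+)
+trainer.train()
+```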
+
+## Citation and References
+
+This model is based on a custom taxonomy derived from the OECD's categorization of science, technology, and innovation (STI) policy instruments.
+For more information, see:
+
+EC/OECD (2023), STIP Survey, https://stip.oecd.org
+
+## Acknowledgements
+
+- The model builds upon [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large)
runs/May12_16-32-17_chirchiq/events.out.tfevents.1747060338.chirchiq.3136082.2 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99c79982739b55c6bfbde3571c3b715cc6e4a7d9065a12d22a3daa8ba2201a55
+size 8124
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ffb37461c391f096759f4a9bbbc329da0f36952f88bab061fcf84940c022e98
+size 17082999
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}
training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4d201db73713b21d5af27fb0dff2b381cec5a9982385f1d35f1c8eac55fec23
+size 5777