stevenluo committed on Aug 7, 2024

Commit

ea171c2

verified ·

1 Parent(s): 57a03dd

Upload folder using huggingface_hub

Browse files

Files changed (30) hide show

1_Pooling/config.json +10 -0
README.md +57 -0
checkpoint-50/1_Pooling/config.json +10 -0
checkpoint-50/README.md +57 -0
checkpoint-50/config.json +40 -0
checkpoint-50/config_sentence_transformers.json +9 -0
checkpoint-50/model.safetensors +3 -0
checkpoint-50/modules.json +20 -0
checkpoint-50/optimizer.pt +3 -0
checkpoint-50/rng_state.pth +3 -0
checkpoint-50/scheduler.pt +3 -0
checkpoint-50/sentence_bert_config.json +4 -0
checkpoint-50/special_tokens_map.json +37 -0
checkpoint-50/tokenizer.json +0 -0
checkpoint-50/tokenizer_config.json +57 -0
checkpoint-50/trainer_state.json +91 -0
checkpoint-50/training_args.bin +3 -0
checkpoint-50/vocab.txt +0 -0
config.json +40 -0
config_sentence_transformers.json +9 -0
finetune_bge_embedding_v4.sh +55 -0
model.safetensors +3 -0
modules.json +20 -0
runs/Jul14_11-50-34_big-megatron/events.out.tfevents.1720929038.big-megatron.1203838.0 +3 -0
sentence_bert_config.json +4 -0
special_tokens_map.json +37 -0
tokenizer.json +0 -0
tokenizer_config.json +57 -0
training_args.bin +3 -0
vocab.txt +0 -0

1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": true,
+  "pooling_mode_mean_tokens": false,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false,
+  "include_prompt": true
+}

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+library_name: sentence-transformers
+pipeline_tag: sentence-similarity
+tags:
+- sentence-transformers
+- feature-extraction
+- sentence-similarity
+---
+# {MODEL_NAME}
+This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+<!--- Describe your model here -->
+## Usage (Sentence-Transformers)
+Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+```
+pip install -U sentence-transformers
+```
+Then you can use the model like this:
+```python
+from sentence_transformers import SentenceTransformer
+sentences = ["This is an example sentence", "Each sentence is converted"]
+model = SentenceTransformer('{MODEL_NAME}')
+embeddings = model.encode(sentences)
+print(embeddings)
+```
+## Evaluation Results
+<!--- Describe how your model was evaluated -->
+For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
+## Full Model Architecture
+```
+SentenceTransformer(
+  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel
+  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+  (2): Normalize()
+)
+```
+## Citing & Authors
+<!--- Describe where people can find more information -->

checkpoint-50/1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": true,
+  "pooling_mode_mean_tokens": false,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false,
+  "include_prompt": true
+}

checkpoint-50/README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+library_name: sentence-transformers
+pipeline_tag: sentence-similarity
+tags:
+- sentence-transformers
+- feature-extraction
+- sentence-similarity
+---
+# {MODEL_NAME}
+This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+<!--- Describe your model here -->
+## Usage (Sentence-Transformers)
+Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+```
+pip install -U sentence-transformers
+```
+Then you can use the model like this:
+```python
+from sentence_transformers import SentenceTransformer
+sentences = ["This is an example sentence", "Each sentence is converted"]
+model = SentenceTransformer('{MODEL_NAME}')
+embeddings = model.encode(sentences)
+print(embeddings)
+```
+## Evaluation Results
+<!--- Describe how your model was evaluated -->
+For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
+## Full Model Architecture
+```
+SentenceTransformer(
+  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel
+  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+  (2): Normalize()
+)
+```
+## Citing & Authors
+<!--- Describe where people can find more information -->

checkpoint-50/config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_name_or_path": "experiments/embedding/finetune/ft_v4_bge_large_epoch_1_bz_64_trgrp_8_20240714_1150/checkpoint-50",
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 21128
+}

checkpoint-50/config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "__version__": {
+    "sentence_transformers": "2.7.0",
+    "transformers": "4.40.2",
+    "pytorch": "2.1.2+cu121"
+  },
+  "prompts": {},
+  "default_prompt_name": null
+}

checkpoint-50/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fc2eb407f866f9f9b7c2eb88c381a9a56ffdc0d5a6ab0566b05bd524bcca617
+size 1302134568

checkpoint-50/modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

checkpoint-50/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31bcf898ead0b5e34e92243945337548a37786799f2008b76f18718164fffbf7
+size 2596108193

checkpoint-50/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:35c518d94e00dcc3752dacfc3d8723e85d702e6d337300b76c978a21c5c57348
+size 14244

checkpoint-50/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a79f7e5f2f6ce656e9db9e2afb4191d61d7b12d20d152789e301822b4e98d7ee
+size 1064

checkpoint-50/sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 512,
+  "do_lower_case": false
+}

checkpoint-50/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-50/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-50/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

checkpoint-50/trainer_state.json ADDED Viewed

	@@ -0,0 +1,91 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.8852005532503457,
+  "eval_steps": 500,
+  "global_step": 50,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.08852005532503458,
+      "grad_norm": 3.8299479484558105,
+      "learning_rate": 9.964905480067585e-06,
+      "loss": 0.0808,
+      "step": 5
+    },
+    {
+      "epoch": 0.17704011065006917,
+      "grad_norm": 3.019829511642456,
+      "learning_rate": 9.575728086215093e-06,
+      "loss": 0.0219,
+      "step": 10
+    },
+    {
+      "epoch": 0.26556016597510373,
+      "grad_norm": 0.32648399472236633,
+      "learning_rate": 8.787556210808101e-06,
+      "loss": 0.0036,
+      "step": 15
+    },
+    {
+      "epoch": 0.35408022130013833,
+      "grad_norm": 4.698158264160156,
+      "learning_rate": 7.669116889823955e-06,
+      "loss": 0.0295,
+      "step": 20
+    },
+    {
+      "epoch": 0.4426002766251729,
+      "grad_norm": 0.010427301749587059,
+      "learning_rate": 6.3179358303453386e-06,
+      "loss": 0.0034,
+      "step": 25
+    },
+    {
+      "epoch": 0.5311203319502075,
+      "grad_norm": 3.742246627807617,
+      "learning_rate": 4.8518333608872015e-06,
+      "loss": 0.0122,
+      "step": 30
+    },
+    {
+      "epoch": 0.6196403872752421,
+      "grad_norm": 0.05885884910821915,
+      "learning_rate": 3.398650730685813e-06,
+      "loss": 0.0102,
+      "step": 35
+    },
+    {
+      "epoch": 0.7081604426002767,
+      "grad_norm": 0.04803966358304024,
+      "learning_rate": 2.0851026044276405e-06,
+      "loss": 0.0001,
+      "step": 40
+    },
+    {
+      "epoch": 0.7966804979253111,
+      "grad_norm": 3.7399864196777344,
+      "learning_rate": 1.0257277929332332e-06,
+      "loss": 0.0162,
+      "step": 45
+    },
+    {
+      "epoch": 0.8852005532503457,
+      "grad_norm": 0.3424227237701416,
+      "learning_rate": 3.1290169432939556e-07,
+      "loss": 0.0024,
+      "step": 50
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 56,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-50/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:07fdc575a441f95028bd3dbc57932f2dbcd8acc2eefd72019a2e86cfd2950989
+size 5368

checkpoint-50/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_name_or_path": "experiments/embedding/finetune/ft_v4_bge_large_epoch_1_bz_64_trgrp_8_20240714_1150",
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 21128
+}

config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "__version__": {
+    "sentence_transformers": "2.7.0",
+    "transformers": "4.40.2",
+    "pytorch": "2.1.2+cu121"
+  },
+  "prompts": {},
+  "default_prompt_name": null
+}

finetune_bge_embedding_v4.sh ADDED Viewed

	@@ -0,0 +1,55 @@

+#!/bin/bash
+SCRIP_DIR=$(echo `cd $(dirname $0); pwd`)
+export PATH=/work/cache/env/miniconda3/bin:$PATH
+# export TRAIN_DATASET=/work/home/Projects/IntellijenceCustomerService/检索优化_新抽取QA/data/qa_all_samples_1213_neg_count_7.jsonl
+export TRAIN_DATASET=outputs/v1_20240713/emb_samples_qd_v2.jsonl
+export N_EPOCH=1
+export TRAIN_GROUP_SIZE=8
+export GRADIENT_ACCUMULATION_STEPS=64
+export PER_DEVICE_TRAIN_BATCH_SIZE=1
+export N_NODES=1
+export BATCH_SIZE=`expr ${GRADIENT_ACCUMULATION_STEPS} \* ${PER_DEVICE_TRAIN_BATCH_SIZE} \* ${N_NODES}`
+export VERSION=ft_v4_bge_large_epoch_${N_EPOCH}_bz_${BATCH_SIZE}_trgrp_${TRAIN_GROUP_SIZE}_$(date +"%Y%m%d_%H%M")
+export WANDB_PROJECT=RAG-From-Scratch-Embedding-Finetune
+export WANDB_API_KEY=60bb147be9aaaffdbb80e9021bedb55d57c57b02
+export WANDB_NAME=${VERSION}
+export OUTPUT_DIR=experiments/embedding/finetune/${VERSION}
+if [ ! -d "${OUTPUT_DIR}" ]; then
+    mkdir -p "${OUTPUT_DIR}"
+fi
+torchrun --nproc_per_node ${N_NODES} \
+-m FlagEmbedding.baai_general_embedding.finetune.run \
+--output_dir ${OUTPUT_DIR} \
+--model_name_or_path /DataScience/HuggingFace/Models/BAAI/bge-large-zh-v1.5 \
+--train_data ${TRAIN_DATASET} \
+--learning_rate 1e-5 \
+--fp16 \
+--num_train_epochs ${N_EPOCH} \
+--per_device_train_batch_size ${PER_DEVICE_TRAIN_BATCH_SIZE} \
+--gradient_accumulation_steps ${GRADIENT_ACCUMULATION_STEPS} \
+--dataloader_drop_last True \
+--normlized True \
+--temperature 0.02 \
+--query_max_len 64 \
+--passage_max_len 512 \
+--train_group_size ${TRAIN_GROUP_SIZE} \
+--negatives_cross_device \
+--logging_steps 5 \
+--save_steps 50 \
+--save_total_limit 10 \
+--warmup_ratio 0.05 \
+--lr_scheduler_type cosine \
+--query_instruction_for_retrieval ""
+cp "$SCRIP_DIR/$0" ${OUTPUT_DIR}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:23a3dfa8659644745046979c864d218a2b247ea4873b6968ab8be4256e35eb7c
+size 1302134568

modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

runs/Jul14_11-50-34_big-megatron/events.out.tfevents.1720929038.big-megatron.1203838.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0d0daa137e6a042970209f470062a318a94b136ba916b828225a5d4726e78aa0
+size 8027

sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 512,
+  "do_lower_case": false
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:07fdc575a441f95028bd3dbc57932f2dbcd8acc2eefd72019a2e86cfd2950989
+size 5368

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff