Subiendo modelo inicial
Browse files
- 1_Pooling/config.json +10 -0
- README.md +485 -0
- config.json +31 -0
- config_sentence_transformers.json +10 -0
- model.safetensors +3 -0
- modules.json +20 -0
- optimizer.pt +3 -0
- rng_state.pth +3 -0
- scheduler.pt +3 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +58 -0
- trainer_state.json +105 -0
- training_args.bin +3 -0
- vocab.txt +0 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 384,
|
3 |
+
"pooling_mode_cls_token": true,
|
4 |
+
"pooling_mode_mean_tokens": false,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
7 |
+
"pooling_mode_weightedmean_tokens": false,
|
8 |
+
"pooling_mode_lasttoken": false,
|
9 |
+
"include_prompt": true
|
10 |
+
}
|
README.md
ADDED
@@ -0,0 +1,485 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
tags:
|
3 |
+
- sentence-transformers
|
4 |
+
- sentence-similarity
|
5 |
+
- feature-extraction
|
6 |
+
- generated_from_trainer
|
7 |
+
- dataset_size:6552
|
8 |
+
- loss:MultipleNegativesRankingLoss
|
9 |
+
base_model: BAAI/bge-small-en-v1.5
|
10 |
+
widget:
|
11 |
+
- source_sentence: What property is denoted as the M→M property in the queueing network
|
12 |
+
literature?
|
13 |
+
sentences:
|
14 |
+
- 'The LOFAR system introduces two additional levels in the beam hierarchy: the
|
15 |
+
compound (tile) beam and the station beam.'
|
16 |
+
- The desired pseudonoise sequence in a CDMA system has the characteristics that
|
17 |
+
the fraction of 0's and 1's is almost half-and-half over the period, and the shifted
|
18 |
+
versions of the pseudonoise sequence are nearly orthogonal to each other. If the
|
19 |
+
shift of the pseudonoise sequence is randomized, it becomes a random process.
|
20 |
+
- The M→M property in the queueing network literature denotes the independence of
|
21 |
+
individual queues in the long term.
|
22 |
+
- source_sentence: Which type of channel condition has better path loss exponent (PLE)
|
23 |
+
in terms of AA (air to air) and AG (air to ground) propagation channels?
|
24 |
+
sentences:
|
25 |
+
- The goal of the Fixed Access Information API is to provide access network related
|
26 |
+
information for the multitude of fixed access technologies.
|
27 |
+
- Error mitigation is a technique to reduce the impact of errors in near-term quantum
|
28 |
+
systems without requiring full fault-tolerant quantum codes.
|
29 |
+
- From the document, it is mentioned that the AA channel has better conditions than
|
30 |
+
the AG channel in terms of path loss exponent (PLE).
|
31 |
+
- source_sentence: What is the goal of a functionality extraction attack?
|
32 |
+
sentences:
|
33 |
+
- Deep learning can automatically extract high-level features from data, reducing
|
34 |
+
the need for manual feature engineering.
|
35 |
+
- The goal of a functionality extraction attack is to create knock-off models that
|
36 |
+
mimic the behavior of an existing machine learning model.
|
37 |
+
- The main advantage of using wind turbine towers for communication is that they
|
38 |
+
already have a reliable power grid connection.
|
39 |
+
- source_sentence: What is MTU?
|
40 |
+
sentences:
|
41 |
+
- The worst-case complexity of average consensus is exponential in the number of
|
42 |
+
nodes, but it can be reduced to linear if an upper bound on the total number of
|
43 |
+
nodes is known.
|
44 |
+
- In a normally clad fiber, at long wavelengths, the MFD is large compared to the
|
45 |
+
core diameter and the electric field extends far into the cladding region.
|
46 |
+
- MTU (Maximum Transmission Unit) represents the largest size of a data packet that
|
47 |
+
can be sent over a network without fragmentation.
|
48 |
+
- source_sentence: What should the AP or PCP do if it is not decentralized AP or PCP
|
49 |
+
clustering capable or a decentralized AP or PCP cluster is not present?
|
50 |
+
sentences:
|
51 |
+
- When a Data, Management or Extension frame is received, a STA inserts it in an
|
52 |
+
appropriate cache.
|
53 |
+
- If the AP or PCP is not decentralized AP or PCP clustering capable or a decentralized
|
54 |
+
AP or PCP cluster is not present, it should set its Cluster Member Role to 0 (not
|
55 |
+
currently participating in a cluster) and remain unclustered.
|
56 |
+
- Analog beamforming based on slowly-varying second order statistics of the CSI
|
57 |
+
reduces the dimension of the effective instantaneous CSI for digital beamforming
|
58 |
+
within each coherent fading block, which helps to relieve the signaling overhead.
|
59 |
+
datasets:
|
60 |
+
- dinho1597/Telecom-QA-MultipleChoice
|
61 |
+
pipeline_tag: sentence-similarity
|
62 |
+
library_name: sentence-transformers
|
63 |
+
metrics:
|
64 |
+
- cosine_accuracy@1
|
65 |
+
- cosine_accuracy@3
|
66 |
+
- cosine_accuracy@5
|
67 |
+
- cosine_accuracy@10
|
68 |
+
- cosine_precision@1
|
69 |
+
- cosine_recall@1
|
70 |
+
- cosine_ndcg@10
|
71 |
+
- cosine_mrr@10
|
72 |
+
- cosine_map@100
|
73 |
+
model-index:
|
74 |
+
- name: SentenceTransformer based on BAAI/bge-small-en-v1.5
|
75 |
+
results:
|
76 |
+
- task:
|
77 |
+
type: information-retrieval
|
78 |
+
name: Information Retrieval
|
79 |
+
dataset:
|
80 |
+
name: telecom ir eval
|
81 |
+
type: telecom-ir-eval
|
82 |
+
metrics:
|
83 |
+
- type: cosine_accuracy@1
|
84 |
+
value: 0.965675057208238
|
85 |
+
name: Cosine Accuracy@1
|
86 |
+
- type: cosine_accuracy@3
|
87 |
+
value: 0.992372234935164
|
88 |
+
name: Cosine Accuracy@3
|
89 |
+
- type: cosine_accuracy@5
|
90 |
+
value: 0.9931350114416476
|
91 |
+
name: Cosine Accuracy@5
|
92 |
+
- type: cosine_accuracy@10
|
93 |
+
value: 0.9938977879481312
|
94 |
+
name: Cosine Accuracy@10
|
95 |
+
- type: cosine_precision@1
|
96 |
+
value: 0.965675057208238
|
97 |
+
name: Cosine Precision@1
|
98 |
+
- type: cosine_recall@1
|
99 |
+
value: 0.965675057208238
|
100 |
+
name: Cosine Recall@1
|
101 |
+
- type: cosine_ndcg@10
|
102 |
+
value: 0.9824027787882591
|
103 |
+
name: Cosine Ndcg@10
|
104 |
+
- type: cosine_mrr@10
|
105 |
+
value: 0.9784334023464457
|
106 |
+
name: Cosine Mrr@10
|
107 |
+
- type: cosine_map@100
|
108 |
+
value: 0.9786169716375667
|
109 |
+
name: Cosine Map@100
|
110 |
+
---
|
111 |
+
|
112 |
+
# SentenceTransformer based on BAAI/bge-small-en-v1.5
|
113 |
+
|
114 |
+
This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) on the [telecom-qa-multiple_choice](https://huggingface.co/datasets/dinho1597/Telecom-QA-MultipleChoice) dataset. It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
|
115 |
+
|
116 |
+
## Model Details
|
117 |
+
|
118 |
+
### Model Description
|
119 |
+
- **Model Type:** Sentence Transformer
|
120 |
+
- **Base model:** [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) <!-- at revision 5c38ec7c405ec4b44b94cc5a9bb96e735b38267a -->
|
121 |
+
- **Maximum Sequence Length:** 512 tokens
|
122 |
+
- **Output Dimensionality:** 384 dimensions
|
123 |
+
- **Similarity Function:** Cosine Similarity
|
124 |
+
- **Training Dataset:**
|
125 |
+
- [telecom-qa-multiple_choice](https://huggingface.co/datasets/dinho1597/Telecom-QA-MultipleChoice)
|
126 |
+
<!-- - **Language:** Unknown -->
|
127 |
+
<!-- - **License:** Unknown -->
|
128 |
+
|
129 |
+
### Model Sources
|
130 |
+
|
131 |
+
- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
|
132 |
+
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
|
133 |
+
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
|
134 |
+
|
135 |
+
### Full Model Architecture
|
136 |
+
|
137 |
+
```
|
138 |
+
SentenceTransformer(
|
139 |
+
(0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel
|
140 |
+
(1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
141 |
+
(2): Normalize()
|
142 |
+
)
|
143 |
+
```
|
144 |
+
|
145 |
+
## Usage
|
146 |
+
|
147 |
+
### Direct Usage (Sentence Transformers)
|
148 |
+
|
149 |
+
First install the Sentence Transformers library:
|
150 |
+
|
151 |
+
```bash
|
152 |
+
pip install -U sentence-transformers
|
153 |
+
```
|
154 |
+
|
155 |
+
Then you can load this model and run inference.
|
156 |
+
```python
|
157 |
+
from sentence_transformers import SentenceTransformer
|
158 |
+
|
159 |
+
# Download from the 🤗 Hub
|
160 |
+
model = SentenceTransformer("sentence_transformers_model_id")
|
161 |
+
# Run inference
|
162 |
+
sentences = [
|
163 |
+
'What should the AP or PCP do if it is not decentralized AP or PCP clustering capable or a decentralized AP or PCP cluster is not present?',
|
164 |
+
'If the AP or PCP is not decentralized AP or PCP clustering capable or a decentralized AP or PCP cluster is not present, it should set its Cluster Member Role to 0 (not currently participating in a cluster) and remain unclustered.',
|
165 |
+
'When a Data, Management or Extension frame is received, a STA inserts it in an appropriate cache.',
|
166 |
+
]
|
167 |
+
embeddings = model.encode(sentences)
|
168 |
+
print(embeddings.shape)
|
169 |
+
# [3, 384]
|
170 |
+
|
171 |
+
# Get the similarity scores for the embeddings
|
172 |
+
similarities = model.similarity(embeddings, embeddings)
|
173 |
+
print(similarities.shape)
|
174 |
+
# [3, 3]
|
175 |
+
```
|
176 |
+
|
177 |
+
<!--
|
178 |
+
### Direct Usage (Transformers)
|
179 |
+
|
180 |
+
<details><summary>Click to see the direct usage in Transformers</summary>
|
181 |
+
|
182 |
+
</details>
|
183 |
+
-->
|
184 |
+
|
185 |
+
<!--
|
186 |
+
### Downstream Usage (Sentence Transformers)
|
187 |
+
|
188 |
+
You can finetune this model on your own dataset.
|
189 |
+
|
190 |
+
<details><summary>Click to expand</summary>
|
191 |
+
|
192 |
+
</details>
|
193 |
+
-->
|
194 |
+
|
195 |
+
<!--
|
196 |
+
### Out-of-Scope Use
|
197 |
+
|
198 |
+
*List how the model may foreseeably be misused and address what users ought not to do with the model.*
|
199 |
+
-->
|
200 |
+
|
201 |
+
## Evaluation
|
202 |
+
|
203 |
+
### Metrics
|
204 |
+
|
205 |
+
#### Information Retrieval
|
206 |
+
|
207 |
+
* Dataset: `telecom-ir-eval`
|
208 |
+
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)
|
209 |
+
|
210 |
+
| Metric | Value |
|
211 |
+
|:-------------------|:-----------|
|
212 |
+
| cosine_accuracy@1 | 0.9657 |
|
213 |
+
| cosine_accuracy@3 | 0.9924 |
|
214 |
+
| cosine_accuracy@5 | 0.9931 |
|
215 |
+
| cosine_accuracy@10 | 0.9939 |
|
216 |
+
| cosine_precision@1 | 0.9657 |
|
217 |
+
| cosine_recall@1 | 0.9657 |
|
218 |
+
| **cosine_ndcg@10** | **0.9824** |
|
219 |
+
| cosine_mrr@10 | 0.9784 |
|
220 |
+
| cosine_map@100 | 0.9786 |
|
221 |
+
|
222 |
+
<!--
|
223 |
+
## Bias, Risks and Limitations
|
224 |
+
|
225 |
+
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
|
226 |
+
-->
|
227 |
+
|
228 |
+
<!--
|
229 |
+
### Recommendations
|
230 |
+
|
231 |
+
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
232 |
+
-->
|
233 |
+
|
234 |
+
## Training Details
|
235 |
+
|
236 |
+
### Training Dataset
|
237 |
+
|
238 |
+
#### telecom-qa-multiple_choice
|
239 |
+
|
240 |
+
* Dataset: [telecom-qa-multiple_choice](https://huggingface.co/datasets/dinho1597/Telecom-QA-MultipleChoice) at [73aebbb](https://huggingface.co/datasets/dinho1597/Telecom-QA-MultipleChoice/tree/73aebbb16651212e4b1947ac0d64fc80a6bc9398)
|
241 |
+
* Size: 6,552 training samples
|
242 |
+
* Columns: <code>anchor</code> and <code>positive</code>
|
243 |
+
* Approximate statistics based on the first 1000 samples:
|
244 |
+
| | anchor | positive |
|
245 |
+
|:--------|:----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|
|
246 |
+
| type | string | string |
|
247 |
+
| details | <ul><li>min: 4 tokens</li><li>mean: 18.95 tokens</li><li>max: 49 tokens</li></ul> | <ul><li>min: 9 tokens</li><li>mean: 29.33 tokens</li><li>max: 112 tokens</li></ul> |
|
248 |
+
* Samples:
|
249 |
+
| anchor | positive |
|
250 |
+
|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
251 |
+
| <code>What is the goal of a jammer in a mobile edge caching system?</code> | <code>The goal of a jammer in a mobile edge caching system is to interrupt ongoing radio transmissions of the edge node with cached chunks or caching users and prevent access to cached content. Additionally, jammers aim to deplete the resources of edge nodes, caching users, and sensors during failed communication attempts.</code> |
|
252 |
+
| <code>Which type of DRL uses DNNs (Deep Neural Networks) to fit action values and employs experience replay and target networks to ensure stable training convergence?</code> | <code>Value-based DRL, such as Deep Q-Learning (DQL), uses DNNs to fit action values and employs experience replay and target networks to ensure stable training convergence.</code> |
|
253 |
+
| <code>What is the relationship between the curvature of the decision boundary and the robustness of a network?</code> | <code>The lower the curvature of the decision boundaries, the more robust the network.</code> |
|
254 |
+
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
|
255 |
+
```json
|
256 |
+
{
|
257 |
+
"scale": 20.0,
|
258 |
+
"similarity_fct": "cos_sim"
|
259 |
+
}
|
260 |
+
```
|
261 |
+
|
262 |
+
### Evaluation Dataset
|
263 |
+
|
264 |
+
#### telecom-qa-multiple_choice
|
265 |
+
|
266 |
+
* Dataset: [telecom-qa-multiple_choice](https://huggingface.co/datasets/dinho1597/Telecom-QA-MultipleChoice) at [73aebbb](https://huggingface.co/datasets/dinho1597/Telecom-QA-MultipleChoice/tree/73aebbb16651212e4b1947ac0d64fc80a6bc9398)
|
267 |
+
* Size: 1,311 evaluation samples
|
268 |
+
* Columns: <code>anchor</code> and <code>positive</code>
|
269 |
+
* Approximate statistics based on the first 1000 samples:
|
270 |
+
| | anchor | positive |
|
271 |
+
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------|
|
272 |
+
| type | string | string |
|
273 |
+
| details | <ul><li>min: 4 tokens</li><li>mean: 18.87 tokens</li><li>max: 56 tokens</li></ul> | <ul><li>min: 8 tokens</li><li>mean: 29.45 tokens</li><li>max: 91 tokens</li></ul> |
|
274 |
+
* Samples:
|
275 |
+
| anchor | positive |
|
276 |
+
|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
277 |
+
| <code>Which forward error correction (FEC) codes are available for the THz single carrier mode?</code> | <code>The THz single carrier mode (THz-SC PHY) in the IEEE 802.15.3d standard supports two low-density parity-check (LDPC) codes: 14/15 LDPC (1440,1344) and 11/15 LDPC (1440,1056).</code> |
|
278 |
+
| <code>Which multiple access technique allows users to access the channel simultaneously using the same frequency and time resources, with different power levels?</code> | <code>Non-Orthogonal Multiple Access (NOMA) allows users to access the channel simultaneously using the same frequency and time resources, but with different power levels.</code> |
|
279 |
+
| <code>What is the power gain when doubling the number of antennas?</code> | <code>Doubling the number of antennas yields a 3-dB power gain.</code> |
|
280 |
+
* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
|
281 |
+
```json
|
282 |
+
{
|
283 |
+
"scale": 20.0,
|
284 |
+
"similarity_fct": "cos_sim"
|
285 |
+
}
|
286 |
+
```
|
287 |
+
|
288 |
+
### Training Hyperparameters
|
289 |
+
#### Non-Default Hyperparameters
|
290 |
+
|
291 |
+
- `eval_strategy`: steps
|
292 |
+
- `per_device_train_batch_size`: 512
|
293 |
+
- `per_device_eval_batch_size`: 512
|
294 |
+
- `weight_decay`: 0.01
|
295 |
+
- `num_train_epochs`: 15
|
296 |
+
- `lr_scheduler_type`: cosine_with_restarts
|
297 |
+
- `warmup_ratio`: 0.1
|
298 |
+
- `fp16`: True
|
299 |
+
- `load_best_model_at_end`: True
|
300 |
+
- `batch_sampler`: no_duplicates
|
301 |
+
|
302 |
+
#### All Hyperparameters
|
303 |
+
<details><summary>Click to expand</summary>
|
304 |
+
|
305 |
+
- `overwrite_output_dir`: False
|
306 |
+
- `do_predict`: False
|
307 |
+
- `eval_strategy`: steps
|
308 |
+
- `prediction_loss_only`: True
|
309 |
+
- `per_device_train_batch_size`: 512
|
310 |
+
- `per_device_eval_batch_size`: 512
|
311 |
+
- `per_gpu_train_batch_size`: None
|
312 |
+
- `per_gpu_eval_batch_size`: None
|
313 |
+
- `gradient_accumulation_steps`: 1
|
314 |
+
- `eval_accumulation_steps`: None
|
315 |
+
- `torch_empty_cache_steps`: None
|
316 |
+
- `learning_rate`: 5e-05
|
317 |
+
- `weight_decay`: 0.01
|
318 |
+
- `adam_beta1`: 0.9
|
319 |
+
- `adam_beta2`: 0.999
|
320 |
+
- `adam_epsilon`: 1e-08
|
321 |
+
- `max_grad_norm`: 1.0
|
322 |
+
- `num_train_epochs`: 15
|
323 |
+
- `max_steps`: -1
|
324 |
+
- `lr_scheduler_type`: cosine_with_restarts
|
325 |
+
- `lr_scheduler_kwargs`: {}
|
326 |
+
- `warmup_ratio`: 0.1
|
327 |
+
- `warmup_steps`: 0
|
328 |
+
- `log_level`: passive
|
329 |
+
- `log_level_replica`: warning
|
330 |
+
- `log_on_each_node`: True
|
331 |
+
- `logging_nan_inf_filter`: True
|
332 |
+
- `save_safetensors`: True
|
333 |
+
- `save_on_each_node`: False
|
334 |
+
- `save_only_model`: False
|
335 |
+
- `restore_callback_states_from_checkpoint`: False
|
336 |
+
- `no_cuda`: False
|
337 |
+
- `use_cpu`: False
|
338 |
+
- `use_mps_device`: False
|
339 |
+
- `seed`: 42
|
340 |
+
- `data_seed`: None
|
341 |
+
- `jit_mode_eval`: False
|
342 |
+
- `use_ipex`: False
|
343 |
+
- `bf16`: False
|
344 |
+
- `fp16`: True
|
345 |
+
- `fp16_opt_level`: O1
|
346 |
+
- `half_precision_backend`: auto
|
347 |
+
- `bf16_full_eval`: False
|
348 |
+
- `fp16_full_eval`: False
|
349 |
+
- `tf32`: None
|
350 |
+
- `local_rank`: 0
|
351 |
+
- `ddp_backend`: None
|
352 |
+
- `tpu_num_cores`: None
|
353 |
+
- `tpu_metrics_debug`: False
|
354 |
+
- `debug`: []
|
355 |
+
- `dataloader_drop_last`: False
|
356 |
+
- `dataloader_num_workers`: 0
|
357 |
+
- `dataloader_prefetch_factor`: None
|
358 |
+
- `past_index`: -1
|
359 |
+
- `disable_tqdm`: False
|
360 |
+
- `remove_unused_columns`: True
|
361 |
+
- `label_names`: None
|
362 |
+
- `load_best_model_at_end`: True
|
363 |
+
- `ignore_data_skip`: False
|
364 |
+
- `fsdp`: []
|
365 |
+
- `fsdp_min_num_params`: 0
|
366 |
+
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
|
367 |
+
- `fsdp_transformer_layer_cls_to_wrap`: None
|
368 |
+
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
|
369 |
+
- `deepspeed`: None
|
370 |
+
- `label_smoothing_factor`: 0.0
|
371 |
+
- `optim`: adamw_torch
|
372 |
+
- `optim_args`: None
|
373 |
+
- `adafactor`: False
|
374 |
+
- `group_by_length`: False
|
375 |
+
- `length_column_name`: length
|
376 |
+
- `ddp_find_unused_parameters`: None
|
377 |
+
- `ddp_bucket_cap_mb`: None
|
378 |
+
- `ddp_broadcast_buffers`: False
|
379 |
+
- `dataloader_pin_memory`: True
|
380 |
+
- `dataloader_persistent_workers`: False
|
381 |
+
- `skip_memory_metrics`: True
|
382 |
+
- `use_legacy_prediction_loop`: False
|
383 |
+
- `push_to_hub`: False
|
384 |
+
- `resume_from_checkpoint`: None
|
385 |
+
- `hub_model_id`: None
|
386 |
+
- `hub_strategy`: every_save
|
387 |
+
- `hub_private_repo`: None
|
388 |
+
- `hub_always_push`: False
|
389 |
+
- `gradient_checkpointing`: False
|
390 |
+
- `gradient_checkpointing_kwargs`: None
|
391 |
+
- `include_inputs_for_metrics`: False
|
392 |
+
- `include_for_metrics`: []
|
393 |
+
- `eval_do_concat_batches`: True
|
394 |
+
- `fp16_backend`: auto
|
395 |
+
- `push_to_hub_model_id`: None
|
396 |
+
- `push_to_hub_organization`: None
|
397 |
+
- `mp_parameters`:
|
398 |
+
- `auto_find_batch_size`: False
|
399 |
+
- `full_determinism`: False
|
400 |
+
- `torchdynamo`: None
|
401 |
+
- `ray_scope`: last
|
402 |
+
- `ddp_timeout`: 1800
|
403 |
+
- `torch_compile`: False
|
404 |
+
- `torch_compile_backend`: None
|
405 |
+
- `torch_compile_mode`: None
|
406 |
+
- `dispatch_batches`: None
|
407 |
+
- `split_batches`: None
|
408 |
+
- `include_tokens_per_second`: False
|
409 |
+
- `include_num_input_tokens_seen`: False
|
410 |
+
- `neftune_noise_alpha`: None
|
411 |
+
- `optim_target_modules`: None
|
412 |
+
- `batch_eval_metrics`: False
|
413 |
+
- `eval_on_start`: False
|
414 |
+
- `use_liger_kernel`: False
|
415 |
+
- `eval_use_gather_object`: False
|
416 |
+
- `average_tokens_across_devices`: False
|
417 |
+
- `prompts`: None
|
418 |
+
- `batch_sampler`: no_duplicates
|
419 |
+
- `multi_dataset_batch_sampler`: proportional
|
420 |
+
|
421 |
+
</details>
|
422 |
+
|
423 |
+
### Training Logs
|
424 |
+
| Epoch | Step | Training Loss | Validation Loss | telecom-ir-eval_cosine_ndcg@10 |
|
425 |
+
|:------:|:----:|:-------------:|:---------------:|:------------------------------:|
|
426 |
+
| 1.2727 | 15 | 1.0332 | 0.0968 | 0.9725 |
|
427 |
+
| 2.5455 | 30 | 0.2091 | 0.0518 | 0.9808 |
|
428 |
+
| 3.8182 | 45 | 0.0997 | 0.0470 | 0.9824 |
|
429 |
+
|
430 |
+
|
431 |
+
### Framework Versions
|
432 |
+
- Python: 3.10.12
|
433 |
+
- Sentence Transformers: 3.3.1
|
434 |
+
- Transformers: 4.47.1
|
435 |
+
- PyTorch: 2.5.1+cu121
|
436 |
+
- Accelerate: 1.2.1
|
437 |
+
- Datasets: 3.2.0
|
438 |
+
- Tokenizers: 0.21.0
|
439 |
+
|
440 |
+
## Citation
|
441 |
+
|
442 |
+
### BibTeX
|
443 |
+
|
444 |
+
#### Sentence Transformers
|
445 |
+
```bibtex
|
446 |
+
@inproceedings{reimers-2019-sentence-bert,
|
447 |
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
448 |
+
author = "Reimers, Nils and Gurevych, Iryna",
|
449 |
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
450 |
+
month = "11",
|
451 |
+
year = "2019",
|
452 |
+
publisher = "Association for Computational Linguistics",
|
453 |
+
url = "https://arxiv.org/abs/1908.10084",
|
454 |
+
}
|
455 |
+
```
|
456 |
+
|
457 |
+
#### MultipleNegativesRankingLoss
|
458 |
+
```bibtex
|
459 |
+
@misc{henderson2017efficient,
|
460 |
+
title={Efficient Natural Language Response Suggestion for Smart Reply},
|
461 |
+
author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
|
462 |
+
year={2017},
|
463 |
+
eprint={1705.00652},
|
464 |
+
archivePrefix={arXiv},
|
465 |
+
primaryClass={cs.CL}
|
466 |
+
}
|
467 |
+
```
|
468 |
+
|
469 |
+
<!--
|
470 |
+
## Glossary
|
471 |
+
|
472 |
+
*Clearly define terms in order to be accessible across audiences.*
|
473 |
+
-->
|
474 |
+
|
475 |
+
<!--
|
476 |
+
## Model Card Authors
|
477 |
+
|
478 |
+
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
|
479 |
+
-->
|
480 |
+
|
481 |
+
<!--
|
482 |
+
## Model Card Contact
|
483 |
+
|
484 |
+
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
|
485 |
+
-->
|
config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "BAAI/bge-small-en-v1.5",
|
3 |
+
"architectures": [
|
4 |
+
"BertModel"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 384,
|
11 |
+
"id2label": {
|
12 |
+
"0": "LABEL_0"
|
13 |
+
},
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 1536,
|
16 |
+
"label2id": {
|
17 |
+
"LABEL_0": 0
|
18 |
+
},
|
19 |
+
"layer_norm_eps": 1e-12,
|
20 |
+
"max_position_embeddings": 512,
|
21 |
+
"model_type": "bert",
|
22 |
+
"num_attention_heads": 12,
|
23 |
+
"num_hidden_layers": 12,
|
24 |
+
"pad_token_id": 0,
|
25 |
+
"position_embedding_type": "absolute",
|
26 |
+
"torch_dtype": "float32",
|
27 |
+
"transformers_version": "4.47.1",
|
28 |
+
"type_vocab_size": 2,
|
29 |
+
"use_cache": true,
|
30 |
+
"vocab_size": 30522
|
31 |
+
}
|
config_sentence_transformers.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"__version__": {
|
3 |
+
"sentence_transformers": "3.3.1",
|
4 |
+
"transformers": "4.47.1",
|
5 |
+
"pytorch": "2.5.1+cu121"
|
6 |
+
},
|
7 |
+
"prompts": {},
|
8 |
+
"default_prompt_name": null,
|
9 |
+
"similarity_fn_name": "cosine"
|
10 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b6a17ff064cf7dd59cd00dd009c029e97aa67812ae08b28555b86cae53de9346
|
3 |
+
size 133462128
|
modules.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.models.Transformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_Pooling",
|
12 |
+
"type": "sentence_transformers.models.Pooling"
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"idx": 2,
|
16 |
+
"name": "2",
|
17 |
+
"path": "2_Normalize",
|
18 |
+
"type": "sentence_transformers.models.Normalize"
|
19 |
+
}
|
20 |
+
]
|
optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8b346d73e533eeed9898071d47878cf25c882b0a4865130ec649224ad1001cbc
|
3 |
+
size 265862074
|
rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1b453c3c68d6d9de2e7549bd1ddfa70e40f298095123abbca773deb02b1581f6
|
3 |
+
size 14244
|
scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eeabf30df830deb924a29cbbe61710ac215a8a05461da8f799581c65f9d2e510
|
3 |
+
size 1064
|
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 512,
|
3 |
+
"do_lower_case": true
|
4 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": {
|
3 |
+
"content": "[CLS]",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"mask_token": {
|
10 |
+
"content": "[MASK]",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "[PAD]",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"sep_token": {
|
24 |
+
"content": "[SEP]",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"unk_token": {
|
31 |
+
"content": "[UNK]",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
}
|
37 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"100": {
|
12 |
+
"content": "[UNK]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"101": {
|
20 |
+
"content": "[CLS]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"102": {
|
28 |
+
"content": "[SEP]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"103": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"clean_up_tokenization_spaces": true,
|
45 |
+
"cls_token": "[CLS]",
|
46 |
+
"do_basic_tokenize": true,
|
47 |
+
"do_lower_case": true,
|
48 |
+
"extra_special_tokens": {},
|
49 |
+
"mask_token": "[MASK]",
|
50 |
+
"model_max_length": 512,
|
51 |
+
"never_split": null,
|
52 |
+
"pad_token": "[PAD]",
|
53 |
+
"sep_token": "[SEP]",
|
54 |
+
"strip_accents": null,
|
55 |
+
"tokenize_chinese_chars": true,
|
56 |
+
"tokenizer_class": "BertTokenizer",
|
57 |
+
"unk_token": "[UNK]"
|
58 |
+
}
|
trainer_state.json
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.965675057208238,
|
3 |
+
"best_model_checkpoint": "/content/drive/MyDrive/Papers/RAG_3GPP/models/checkpoints/embedding/bge-small-telecom_15e_512bs/checkpoint-45",
|
4 |
+
"epoch": 3.8181818181818183,
|
5 |
+
"eval_steps": 15,
|
6 |
+
"global_step": 45,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.2727272727272727,
|
13 |
+
"grad_norm": 1.7355202436447144,
|
14 |
+
"learning_rate": 4.411764705882353e-05,
|
15 |
+
"loss": 1.0332,
|
16 |
+
"step": 15
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 1.2727272727272727,
|
20 |
+
"eval_loss": 0.0967816412448883,
|
21 |
+
"eval_runtime": 3.2637,
|
22 |
+
"eval_samples_per_second": 401.687,
|
23 |
+
"eval_steps_per_second": 0.919,
|
24 |
+
"eval_telecom-ir-eval_cosine_accuracy@1": 0.9450800915331807,
|
25 |
+
"eval_telecom-ir-eval_cosine_accuracy@10": 0.9931350114416476,
|
26 |
+
"eval_telecom-ir-eval_cosine_accuracy@3": 0.9870327993897788,
|
27 |
+
"eval_telecom-ir-eval_cosine_accuracy@5": 0.988558352402746,
|
28 |
+
"eval_telecom-ir-eval_cosine_map@100": 0.96574625373806,
|
29 |
+
"eval_telecom-ir-eval_cosine_mrr@10": 0.9655954499776009,
|
30 |
+
"eval_telecom-ir-eval_cosine_ndcg@10": 0.9725340084355174,
|
31 |
+
"eval_telecom-ir-eval_cosine_precision@1": 0.9450800915331807,
|
32 |
+
"eval_telecom-ir-eval_cosine_recall@1": 0.9450800915331807,
|
33 |
+
"step": 15
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"epoch": 2.5454545454545454,
|
37 |
+
"grad_norm": 0.8801536560058594,
|
38 |
+
"learning_rate": 4.9054165035221236e-05,
|
39 |
+
"loss": 0.2091,
|
40 |
+
"step": 30
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"epoch": 2.5454545454545454,
|
44 |
+
"eval_loss": 0.05175875499844551,
|
45 |
+
"eval_runtime": 3.3397,
|
46 |
+
"eval_samples_per_second": 392.552,
|
47 |
+
"eval_steps_per_second": 0.898,
|
48 |
+
"eval_telecom-ir-eval_cosine_accuracy@1": 0.96186117467582,
|
49 |
+
"eval_telecom-ir-eval_cosine_accuracy@10": 0.9938977879481312,
|
50 |
+
"eval_telecom-ir-eval_cosine_accuracy@3": 0.9908466819221968,
|
51 |
+
"eval_telecom-ir-eval_cosine_accuracy@5": 0.992372234935164,
|
52 |
+
"eval_telecom-ir-eval_cosine_map@100": 0.976456964386964,
|
53 |
+
"eval_telecom-ir-eval_cosine_mrr@10": 0.9762734130011017,
|
54 |
+
"eval_telecom-ir-eval_cosine_ndcg@10": 0.9807745601952235,
|
55 |
+
"eval_telecom-ir-eval_cosine_precision@1": 0.96186117467582,
|
56 |
+
"eval_telecom-ir-eval_cosine_recall@1": 0.96186117467582,
|
57 |
+
"step": 30
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"epoch": 3.8181818181818183,
|
61 |
+
"grad_norm": 0.5518485307693481,
|
62 |
+
"learning_rate": 4.571274123109606e-05,
|
63 |
+
"loss": 0.0997,
|
64 |
+
"step": 45
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"epoch": 3.8181818181818183,
|
68 |
+
"eval_loss": 0.046976786106824875,
|
69 |
+
"eval_runtime": 3.4455,
|
70 |
+
"eval_samples_per_second": 380.5,
|
71 |
+
"eval_steps_per_second": 0.871,
|
72 |
+
"eval_telecom-ir-eval_cosine_accuracy@1": 0.965675057208238,
|
73 |
+
"eval_telecom-ir-eval_cosine_accuracy@10": 0.9938977879481312,
|
74 |
+
"eval_telecom-ir-eval_cosine_accuracy@3": 0.992372234935164,
|
75 |
+
"eval_telecom-ir-eval_cosine_accuracy@5": 0.9931350114416476,
|
76 |
+
"eval_telecom-ir-eval_cosine_map@100": 0.9786169716375667,
|
77 |
+
"eval_telecom-ir-eval_cosine_mrr@10": 0.9784334023464457,
|
78 |
+
"eval_telecom-ir-eval_cosine_ndcg@10": 0.9824027787882591,
|
79 |
+
"eval_telecom-ir-eval_cosine_precision@1": 0.965675057208238,
|
80 |
+
"eval_telecom-ir-eval_cosine_recall@1": 0.965675057208238,
|
81 |
+
"step": 45
|
82 |
+
}
|
83 |
+
],
|
84 |
+
"logging_steps": 15,
|
85 |
+
"max_steps": 165,
|
86 |
+
"num_input_tokens_seen": 0,
|
87 |
+
"num_train_epochs": 15,
|
88 |
+
"save_steps": 15,
|
89 |
+
"stateful_callbacks": {
|
90 |
+
"TrainerControl": {
|
91 |
+
"args": {
|
92 |
+
"should_epoch_stop": false,
|
93 |
+
"should_evaluate": false,
|
94 |
+
"should_log": false,
|
95 |
+
"should_save": true,
|
96 |
+
"should_training_stop": false
|
97 |
+
},
|
98 |
+
"attributes": {}
|
99 |
+
}
|
100 |
+
},
|
101 |
+
"total_flos": 0.0,
|
102 |
+
"train_batch_size": 512,
|
103 |
+
"trial_name": null,
|
104 |
+
"trial_params": null
|
105 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:917e4ea8b0b76de6ad48dbacba13d419d280dc7819d91087c6a3cc0040b413b7
|
3 |
+
size 5816
|
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|