---
library_name: transformers
license: apache-2.0
datasets:
- jaeyong2/Ko-emb-PreView
language:
- ko
base_model:
- Alibaba-NLP/gte-multilingual-base
---

# Model Card for jaeyong2/gte-multilingual-base-Ko-embedding

Korean sentence-embedding model fine-tuned from Alibaba-NLP/gte-multilingual-base on the jaeyong2/Ko-emb-PreView dataset with a triplet-loss objective: the document context is the anchor, the real title is the positive, and a fake title is the negative.



## Model Details

- Base model: Alibaba-NLP/gte-multilingual-base
- Fine-tuning data: jaeyong2/Ko-emb-PreView (Korean)
- Objective: triplet loss over [CLS] embeddings (context vs. Title vs. Fake Title)

## Training

- H/W: Colab A100 40GB
- Data: jaeyong2/Ko-emb-PreView

```
import datasets
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

model_name = "Alibaba-NLP/gte-multilingual-base"
dataset = datasets.load_dataset("jaeyong2/Ko-emb-PreView")
train_dataloader = DataLoader(dataset['train'], batch_size=8, shuffle=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# trust_remote_code is required because gte-multilingual-base ships custom modeling code
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(torch.bfloat16)
# TripletLoss and batch_to_device are project helpers that are not defined in this card;
# a compatible sketch of both is shown after this block.
triplet_loss = TripletLoss(margin=1.0)

optimizer = AdamW(model.parameters(), lr=5e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):  # iterate over epochs
    model.train()
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        loss = None
        for index in range(len(batch["context"])):
            # anchor = document context, positive = real title, negative = fake title
            anchor_encodings = tokenizer([batch["context"][index]], truncation=True, padding="max_length", max_length=4096, return_tensors="pt")
            positive_encodings = tokenizer([batch["Title"][index]], truncation=True, padding="max_length", max_length=256, return_tensors="pt")
            negative_encodings = tokenizer([batch["Fake Title"][index]], truncation=True, padding="max_length", max_length=256, return_tensors="pt")

            anchor_encodings = batch_to_device(anchor_encodings, device)
            positive_encodings = batch_to_device(positive_encodings, device)
            negative_encodings = batch_to_device(negative_encodings, device)

            # Model outputs: use the [CLS] token vector as the embedding
            anchor_output = model(**anchor_encodings)[0][:, 0, :]
            positive_output = model(**positive_encodings)[0][:, 0, :]
            negative_output = model(**negative_encodings)[0][:, 0, :]
            # Accumulate the triplet loss over the samples in the batch
            if loss is None:
                loss = triplet_loss(anchor_output, positive_output, negative_output)
            else:
                loss += triplet_loss(anchor_output, positive_output, negative_output)
        loss /= len(batch["context"])
        loss.backward()
        optimizer.step()
```
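
The loop above calls `TripletLoss` and `batch_to_device`, neither of which is defined in this card. A minimal sketch of compatible helpers, assuming a standard margin-based triplet loss over Euclidean distances (equivalent to `torch.nn.TripletMarginLoss`) and a simple utility that moves a tokenizer output onto the training device:

```
import torch
import torch.nn as nn
import torch.nn.functional as F


class TripletLoss(nn.Module):
    """Margin-based triplet loss: mean(max(d(a, p) - d(a, n) + margin, 0))."""
    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        positive_distance = F.pairwise_distance(anchor, positive)
        negative_distance = F.pairwise_distance(anchor, negative)
        return torch.relu(positive_distance - negative_distance + self.margin).mean()


def batch_to_device(encodings, device):
    """Move every tensor in a tokenizer output onto the given device."""
    return {key: value.to(device) for key, value in encodings.items()}
```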

## Evaluation

Code:
```
import datasets
import torch
from sklearn.metrics import pairwise_distances
from tqdm import tqdm


dataset = datasets.load_dataset("jaeyong2/Ko-emb-PreView")
validation_dataset = dataset["test"].select(range(1000))

model.eval()

def evaluate(validation_dataset):
    correct_count = 0

    for item in tqdm(validation_dataset):
        # get_embedding is a project helper that is not defined in this card;
        # a compatible sketch is shown after this block.
        query_embedding = get_embedding(item["context"], model, tokenizer)
        document_embedding = get_embedding(item["Title"], model, tokenizer)
        negative_embedding = get_embedding(item["Fake Title"], model, tokenizer)

        # Cosine distance between the context embedding and the real / fake title embeddings
        positive_distances = pairwise_distances(query_embedding.detach().cpu().float().numpy(), document_embedding.detach().cpu().float().numpy(), metric="cosine")
        negative_distances = pairwise_distances(query_embedding.detach().cpu().float().numpy(), negative_embedding.detach().cpu().float().numpy(), metric="cosine")

        # Count the pair as correct when the real title is closer than the fake one
        if positive_distances[0][0] < negative_distances[0][0]:
            correct_count += 1

    accuracy = correct_count / len(validation_dataset)
    return accuracy

results = evaluate(validation_dataset)
print(f"Validation Results: {results}")
```
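
`get_embedding` is also a project helper that does not appear in the card. A minimal sketch, assuming it mirrors the training-time [CLS] pooling:

```
import torch


def get_embedding(text, model, tokenizer, max_length=4096):
    """Return the [CLS] embedding of a single text, mirroring the training-time pooling."""
    encodings = tokenizer([text], truncation=True, max_length=max_length, return_tensors="pt")
    encodings = {key: value.to(model.device) for key, value in encodings.items()}
    with torch.no_grad():
        return model(**encodings)[0][:, 0, :]  # shape: (1, hidden_size)
```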

Accuracy (fraction of test pairs where the context embedding is closer, by cosine distance, to the real title than to the fake title):
- Alibaba-NLP/gte-multilingual-base : 0.974
- jaeyong2/gte-multilingual-base-Ko-embedding : 0.981
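
For reference, a minimal sketch of embedding Korean text with the fine-tuned checkpoint (the repository id is taken from the comparison above; `trust_remote_code=True` is assumed because the gte-multilingual architecture relies on custom modeling code):

```
import torch
from transformers import AutoModel, AutoTokenizer

model_id = "jaeyong2/gte-multilingual-base-Ko-embedding"  # id assumed from the accuracy list above
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True).eval()

sentences = ["한국어 임베딩 모델입니다.", "이 모델은 한국어 검색을 위해 학습되었습니다."]
encodings = tokenizer(sentences, padding=True, truncation=True, max_length=4096, return_tensors="pt")
with torch.no_grad():
    embeddings = model(**encodings)[0][:, 0, :]  # [CLS] pooling, as in training

# Cosine similarity between the two sentences
print(torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=0).item())
```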


### License
- Alibaba-NLP/gte-multilingual-base : https://choosealicense.com/licenses/apache-2.0/