Spaces:
Sleeping
Sleeping
Merge pull request #6 from Vriti29/patch-3
Browse files
sentence-transformers/all-mpnet-base-v2
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""Untitled8.ipynb
|
3 |
+
Automatically generated by Colab.
|
4 |
+
Original file is located at
|
5 |
+
https://colab.research.google.com/drive/1JMKmuuP0equrOr6l6oQVQbpbBnGTGvcc
|
6 |
+
"""
|
7 |
+
# Install the embedding library with the running interpreter's pip. The
# notebook export used the IPython shell escape (`!pip install ...`), which is
# a syntax error when this file is executed as a plain Python script.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "sentence-transformers"]
)

from google.colab import files
import pandas as pd
import random

# Ask for a file upload and read the first uploaded entry as CSV.
uploaded = files.upload()
file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name)

# Preview
print("π Preview of training data:")
print(df.head())
# The message is a single-line literal; a string split across two physical
# lines does not parse.
print(f"\n✅ Loaded {len(df)} training pairs.")
|
19 |
+
from sentence_transformers import InputExample


def _row_to_example(row):
    """Build one labelled sentence pair from a single DataFrame row."""
    pair = [row["text1"], row["text2"]]
    return InputExample(texts=pair, label=float(row["score"]))


# One training example per row of the uploaded CSV.
train_examples = [_row_to_example(row) for _, row in df.iterrows()]
|
24 |
+
import math

from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader

# Start from the pretrained checkpoint and fine-tune it on the sentence pairs.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

num_epochs = 1  # Increase to 3–5 for better results
# Warm up over 10% of the training steps rather than a fixed 10, so the
# warm-up length follows the dataset size instead of assuming ~100 steps.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path="fine-tuned-mpnet-model",
)
|
35 |
+
import os

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.util import cos_sim
from torch.utils.data import DataLoader

# Load the saved checkpoint from disk once. The notebook cells reloaded the
# same directory four times under different names; every one of those names
# is kept and bound to the single loaded instance.
fine_tuned_model = SentenceTransformer("fine-tuned-mpnet-model")
model = fine_tuned_model
ft_model = fine_tuned_model

# Untouched pretrained checkpoint, kept for the side-by-side evaluation.
base_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Sanity check: encode one sentence and report the embedding shape. The
# sentence is encoded once; the printed lines match the original cell output.
sentence = "This is a test sentence."
embedding = fine_tuned_model.encode(sentence)
print(embedding.shape)

# Example usage
print("π’ Embedding shape:", embedding.shape)

# Show which files were written to the output directory.
print(os.listdir("fine-tuned-mpnet-model"))

print("π’ Embedding shape:", embedding.shape)
|
59 |
+
|
60 |
+
"""You can now use the `fine_tuned_model` to generate embeddings for any text data. For example, you can use these embeddings for tasks like semantic search, clustering, or classification."""
|
61 |
+
|
62 |
+
from tqdm import tqdm
|
63 |
+
import numpy as np
|
64 |
+
from sentence_transformers.util import cos_sim
|
65 |
+
from sklearn.metrics import mean_squared_error
|
66 |
+
from scipy.stats import spearmanr
|
67 |
+
def evaluate_model(model, name):
    """Compare a model's sentence-pair similarities with the reference scores.

    Encodes the module-level ``sentences1`` and ``sentences2`` lists with
    ``model``, takes the cosine similarity of each aligned pair, and prints
    the mean squared error and Spearman correlation against the module-level
    ``true_scores``.

    Args:
        model: Object exposing ``encode(sentences, convert_to_tensor=True)``.
        name: Label printed in the report header.

    Returns:
        NumPy array holding one cosine similarity per sentence pair.
    """
    # Imported locally so the block does not depend on edits elsewhere.
    from sentence_transformers.util import pairwise_cos_sim

    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)

    # Row-wise similarity only: cos_sim(...).diagonal() would build the full
    # N x N matrix just to keep its N diagonal entries.
    similarities = pairwise_cos_sim(embeddings1, embeddings2).cpu().numpy()

    mse = mean_squared_error(true_scores, similarities)
    spearman_corr, _ = spearmanr(true_scores, similarities)

    print(f"\nπ Evaluation: {name}")
    print("π CosineSim vs Human Scores: ")
    print(f" β’ MSE: {mse:.4f}")
    print(f" β’ Spearman R: {spearman_corr:.4f}")
    return similarities
|
78 |
+
# Pull the two text columns and the score column out of the DataFrame as lists.
sentences1, sentences2, true_scores = (
    df[column].tolist() for column in ("text1", "text2", "score")
)

# Run the same evaluation on the pretrained and the fine-tuned model, in that order.
for candidate, label in (
    (base_model, "Base MPNET"),
    (ft_model, "Fine-Tuned MPNET"),
):
    _ = evaluate_model(candidate, label)
|