aditizyy committed on
Commit
a069479
·
2 Parent(s): 17fdce9 fc668c7

Merge pull request #6 from Vriti29/patch-3

Browse files
sentence-transformers/all-mpnet-base-v2 ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Untitled8.ipynb
3
+ Automatically generated by Colab.
4
+ Original file is located at
5
+ https://colab.research.google.com/drive/1JMKmuuP0equrOr6l6oQVQbpbBnGTGvcc
6
+ """
7
# Install the training dependency. The notebook used the IPython shell escape
# `!pip install sentence-transformers`, which is a SyntaxError in a plain .py
# file; running pip through the current interpreter is the portable equivalent.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "sentence-transformers"]
)

from google.colab import files
import pandas as pd
import random

# Prompt for the training CSV through Colab's upload widget; the result is
# keyed by uploaded file name.
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded; a CSV of training pairs is required.")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))
df = pd.read_csv(file_name)

# The rest of the script reads exactly these three columns, so check for them
# here rather than failing with a KeyError in the middle of training.
missing_columns = {"text1", "text2", "score"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{file_name} is missing required column(s): {sorted(missing_columns)}"
    )

# Preview
print("📄 Preview of training data:")
print(df.head())
print(f"\n✅ Loaded {len(df)} training pairs.")
19
from sentence_transformers import InputExample

# Build one InputExample per CSV row: `text1`/`text2` form the sentence pair
# and `score` (cast to float) is the label. Walking the three columns in
# lockstep avoids DataFrame.iterrows(), which constructs a Series for every row.
train_examples = [
    InputExample(texts=[text1, text2], label=float(score))
    for text1, text2, score in zip(df["text1"], df["text2"], df["score"])
]
24
import math

from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader

# Fine-tune the pretrained all-mpnet-base-v2 checkpoint on the sentence pairs
# with a cosine-similarity regression loss.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

num_epochs = 1  # Increase to 3–5 for better results

# Warm up over 10% of the total optimisation steps. A fixed value of 10 can
# exceed the whole run on a small dataset, so derive it from the data size.
warmup_steps = math.ceil(0.1 * len(train_dataloader) * num_epochs)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path="fine-tuned-mpnet-model",
)
35
import os

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.util import cos_sim
from torch.utils.data import DataLoader

# Load the saved fine-tuned checkpoint from disk once. The notebook cells
# reloaded it under several names; they are kept as aliases of one instance so
# later code that refers to any of them keeps working.
fine_tuned_model = SentenceTransformer("fine-tuned-mpnet-model")
model = fine_tuned_model
ft_model = fine_tuned_model

# Pretrained baseline used for the before/after comparison.
base_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Smoke test: the reloaded model must be able to encode a sentence.
sentence = "This is a test sentence."
embedding = fine_tuned_model.encode(sentence)
print(embedding.shape)

# Example usage
print("🔢 Embedding shape:", embedding.shape)

# Show which files the training run wrote to the output directory.
print(os.listdir("fine-tuned-mpnet-model"))
59
+
60
+ """You can now use the `fine_tuned_model` to generate embeddings for any text data. For example, you can use these embeddings for tasks like semantic search, clustering, or classification."""
61
+
62
+ from tqdm import tqdm
63
+ import numpy as np
64
+ from sentence_transformers.util import cos_sim
65
+ from sklearn.metrics import mean_squared_error
66
+ from scipy.stats import spearmanr
67
def evaluate_model(model, name):
    """Compare a model's pairwise cosine similarities with the human scores.

    Encodes the module-level ``sentences1`` and ``sentences2`` lists, computes
    the cosine similarity of each aligned pair, and prints the mean squared
    error and Spearman correlation against the module-level ``true_scores``.

    Args:
        model: Object exposing ``encode(sentences, convert_to_tensor=True)``.
        name: Label printed in the report heading.

    Returns:
        NumPy array with one cosine similarity per sentence pair.
    """
    # Function-scope import: the file only imports ``torch.utils.data`` at
    # module level.
    import torch.nn.functional as F

    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)

    # Only aligned pairs (row i against row i) are needed. Computing them
    # row-wise avoids building the full N x N similarity matrix just to read
    # its diagonal.
    similarities = (
        F.cosine_similarity(embeddings1, embeddings2, dim=1).cpu().numpy()
    )

    mse = mean_squared_error(true_scores, similarities)
    spearman_corr, _ = spearmanr(true_scores, similarities)

    print(f"\n📋 Evaluation: {name}")
    print("📏 CosineSim vs Human Scores: ")
    print(f" • MSE: {mse:.4f}")
    print(f" • Spearman R: {spearman_corr:.4f}")
    return similarities
78
# Extract sentences and scores from the DataFrame. evaluate_model() reads
# these three module-level lists.
sentences1 = df["text1"].tolist()
sentences2 = df["text2"].tolist()
true_scores = df["score"].tolist()

# Evaluate both models
# NOTE(review): `df` is the same data the model was fine-tuned on, so the
# fine-tuned numbers are likely optimistic — consider a held-out split.
_ = evaluate_model(base_model, "Base MPNET")
_ = evaluate_model(ft_model, "Fine-Tuned MPNET")