<a href="https://colab.research.google.com/github/rsr2425/Simplify/blob/main/finetune_embedding_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qU langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters

In [2]:
!pip install -qU faiss-cpu python-pptx==1.0.2 nltk==3.9.1 pymupdf beautifulsoup4 lxml

In [3]:
!pip install datasets==3.2.0



In [4]:
import nest_asyncio

# Apply nest_asyncio's patch to asyncio. Presumably needed so that async calls
# work inside the notebook's already-running event loop (later cells use
# top-level `await`) — confirm it is still required in the target runtime.
nest_asyncio.apply()

In [5]:
import os
import getpass

# Prompt for the OpenAI API key with getpass (input is not echoed) and expose it
# to this process through the environment, so the key is never hardcoded here.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")

Enter Your OpenAI API Key: ··········


In [6]:
!mkdir static/
!mkdir static/training_data
!curl https://python.langchain.com/docs/tutorials/rag/ -o static/training_data/langchain_rag_tutorial.html

mkdir: cannot create directory ‘static/’: File exists
mkdir: cannot create directory ‘static/training_data’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  340k  100  340k    0     0  1649k      0 --:--:-- --:--:-- --:--:-- 1652k


In [7]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import BSHTMLLoader

# Load every file matching *.html in the training-data folder, parsing each one
# with BSHTMLLoader.
path = "static/training_data/"
text_loader = DirectoryLoader(
    path,
    glob="*.html",
    loader_cls=BSHTMLLoader,
)
docs = text_loader.load()

# Number of loaded documents (displayed as the cell output).
len(docs)

1

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split on characters into chunks of at most 750 characters, with 20 characters
# of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=750,
    chunk_overlap=20,
    length_function=len,
)

# Reuse the documents already loaded into `docs` instead of calling
# text_loader.load() a second time, which would re-read and re-parse every file.
training_documents = text_splitter.split_documents(docs)

# Number of chunks produced (displayed as the cell output).
len(training_documents)

81

In [9]:
import uuid

# Give every chunk a unique string id in its metadata; the ids are later used as
# dictionary keys for the questions, contexts and corpus.
id_set = set()

for document in training_documents:
  # `doc_id` rather than `id`, so the builtin id() is not shadowed.
  doc_id = str(uuid.uuid4())
  while doc_id in id_set:
    # Regenerate on collision. The retry must also be converted to str:
    # a raw UUID object is not JSON-serialisable and would never compare equal
    # to the string ids already stored in id_set.
    doc_id = str(uuid.uuid4())
  id_set.add(doc_id)
  document.metadata["id"] = doc_id

In [10]:
# Break up the chunks into training (80%), validation (10%) and test (10%) sets.
import random

# Set seed for reproducibility.
random.seed(42)

# Shuffle a copy rather than `training_documents` itself. Shuffling in place
# meant that re-running this cell shuffled an already-shuffled list and so
# produced different splits each time; with a copy the cell is idempotent.
# The permutation on a first run is the same as before (same seed, same length).
shuffled_documents = list(training_documents)
random.shuffle(shuffled_documents)

# Compute each boundary once so the three slices cannot drift apart.
train_end = int(0.8 * len(shuffled_documents))
val_end = int(0.9 * len(shuffled_documents))

training_split_documents = shuffled_documents[:train_end]
val_split_documents = shuffled_documents[train_end:val_end]
test_split_documents = shuffled_documents[val_end:]

In [11]:
from langchain_openai import ChatOpenAI

# Chat model that writes the synthetic questions; temperature is fixed at 0.
qa_chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

from langchain_core.prompts import ChatPromptTemplate

# Prompt template for synthetic question generation. It has two placeholders:
#   {n_questions} - how many questions to ask for
#   {context}     - the chunk text the questions must be based on
# The model is asked to answer as a numbered list ("1. ...", "2. ...").
qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

# Build the chat prompt from the template string, then pipe it into the chat
# model so that invoking the chain with {"context", "n_questions"} returns the
# model's response message.
qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)
question_generation_chain = qa_prompt_template | qa_chat_model

In [12]:
import tqdm

async def create_questions(documents, n_questions):
  """Generate questions for each document with the question-generation chain.

  Documents are processed one at a time, awaiting each chain call in turn.

  Args:
    documents: iterable of documents exposing `page_content` and a
      `metadata["id"]` entry.
    n_questions: number of questions requested per document (passed to the
      prompt as `n_questions`).

  Returns:
    A `(questions, contexts)` tuple of dicts, both keyed by document id:
    `questions` maps to the raw chain response for that document, and
    `contexts` maps to a one-element list containing the document's own id.
  """
  questions, contexts = {}, {}
  for doc in documents:
    prompt_inputs = {"context": doc.page_content, "n_questions": n_questions}
    response = await question_generation_chain.ainvoke(prompt_inputs)
    doc_id = doc.metadata["id"]
    questions[doc_id] = response
    contexts[doc_id] = [doc_id]
  return questions, contexts

In [13]:
# Generate questions for each split, asking for 2 questions per chunk. Each call
# returns (questions, relevant_contexts), both keyed by chunk id. The three
# splits are processed one after another.
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

In [14]:
import json

def _build_and_save_dataset(split_documents, questions, relevant_contexts, output_path):
    """Assemble one split's dataset dict and write it to `output_path` as JSON.

    Args:
        split_documents: documents of the split; each needs `page_content` and
            a `metadata["id"]` entry.
        questions: dict mapping id -> chain response message; only the
            message's `.content` string is kept so the result is serialisable.
        relevant_contexts: dict mapping id -> list of relevant document ids.
        output_path: destination file. Note that despite the `.jsonl` suffix
            used by the callers, the file holds a single JSON object.

    Returns:
        The dataset dict with keys "questions", "relevant_contexts", "corpus".
    """
    corpus = {doc.metadata["id"]: doc.page_content for doc in split_documents}

    # Convert response message objects to their string content.
    questions_serializable = {
        doc_id: message.content for doc_id, message in questions.items()
    }

    dataset = {
        "questions": questions_serializable,
        "relevant_contexts": relevant_contexts,
        "corpus": corpus,
    }

    # Explicit encoding so the file does not depend on the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f)

    return dataset


# The three splits previously used three copy-pasted stanzas; they now share
# the helper above. All module-level names from the original cell are kept.
train_dataset = _build_and_save_dataset(
    training_split_documents,
    training_questions,
    training_relevant_contexts,
    "static/training_data/training_dataset.jsonl",
)
training_corpus = train_dataset["corpus"]
training_questions_serializable = train_dataset["questions"]

val_dataset = _build_and_save_dataset(
    val_split_documents,
    val_questions,
    val_relevant_contexts,
    "static/training_data/val_dataset.jsonl",
)
val_corpus = val_dataset["corpus"]
val_questions_serializable = val_dataset["questions"]

test_dataset = _build_and_save_dataset(
    test_split_documents,
    test_questions,
    test_relevant_contexts,
    "static/training_data/test_dataset.jsonl",
)
test_corpus = test_dataset["corpus"]
test_questions_serializable = test_dataset["questions"]

In [15]:
from sentence_transformers import SentenceTransformer

# Identifier of the base embedding model to fine-tune (presumably resolved from
# the Hugging Face Hub by SentenceTransformer — confirm if running offline).
model_id = "Snowflake/snowflake-arctic-embed-l"
# `model` is the object that is trained, evaluated and pushed in later cells.
model = SentenceTransformer(model_id)

In [16]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

# Training-split lookups, all keyed by chunk id: chunk text, generated question
# text, and the list of relevant chunk ids.
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

# One (query, passage) pair per query; only the first relevant chunk id of each
# query is used to look up the passage.
examples = [
    InputExample(texts=[query_text, corpus[relevant_docs[query_key][0]]])
    for query_key, query_text in queries.items()
]

BATCH_SIZE = 16

# Batches of BATCH_SIZE examples, in list order.
loader = DataLoader(examples, batch_size=BATCH_SIZE)

from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Truncation sizes at which the Matryoshka loss is evaluated. The hard-coded
# list tops out at 768, which suits a 768-wide model but leaves the full-width
# embedding of a wider model without its own loss term
# (snowflake-arctic-embed-l is documented as 1024-wide — confirm on the model
# card). Read the width from the loaded model so the full size is always
# included and no candidate exceeds it.
base_matryoshka_dims = [768, 512, 256, 128, 64]
full_embedding_dim = model.get_sentence_embedding_dimension()
if full_embedding_dim is None:
    # Width unknown: fall back to the original fixed list.
    matryoshka_dimensions = base_matryoshka_dims
else:
    # Full width first, then every smaller candidate, in descending order.
    # For a 768-wide model this is exactly the original list.
    matryoshka_dimensions = [full_embedding_dim] + [
        dim for dim in base_matryoshka_dims if dim < full_embedding_dim
    ]

# In-batch-negatives ranking loss, wrapped so it is applied at each of the
# truncation sizes above.
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [17]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Re-point the lookup names at the validation split (they held the training
# split in the previous cell) and build the retrieval evaluator from them.
relevant_docs = val_dataset['relevant_contexts']
queries = val_dataset['questions']
corpus = val_dataset['corpus']

evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
)

In [18]:
# Number of training epochs; used by the warm-up calculation and by model.fit
# in the training cell.
EPOCHS = 10

In [19]:
import wandb
# Initialise Weights & Biases in "disabled" mode — presumably so that training
# neither logs to nor prompts for a W&B account (confirm against wandb docs).
wandb.init(mode="disabled")

In [20]:
import datasets

# Use the first 10% of all training steps (batches per epoch x epochs) as
# warm-up.
total_training_steps = len(loader) * EPOCHS
warmup_steps = int(total_training_steps * 0.1)

# Fine-tune with the Matryoshka-wrapped loss, evaluating with the validation
# retrieval evaluator and writing the result to 'finetuned_arctic_ft'.
model.fit(
    train_objectives=[(loader, train_loss)],
    evaluator=evaluator,
    epochs=EPOCHS,
    evaluation_steps=50,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic_ft',
    show_progress_bar=True,
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
4,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
12,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
16,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
20,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
24,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
28,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
32,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
36,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
40,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget. Presumably the token entered here is what
# authorises the push_to_hub call in the next cell — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
# Upload the in-memory fine-tuned model to the Hub under the user's namespace.
# The call's return value is displayed as the cell output.
hf_username = "Rsr2425"
hub_repo_id = f"{hf_username}/simplify-ft-arctic-embed-l"
model.push_to_hub(hub_repo_id)

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

'https://huggingface.co/Rsr2425/simplify-ft-arctic-embed-l/commit/37aabf4121bb61afd4bb2ae6d48d09e695b947dd'