Vira21 committed on
Commit 4adc3ad · verified · 1 Parent(s): 05b43ab

Delete myutils/finetuning.py

Files changed (1)
  1. myutils/finetuning.py +0 -410
myutils/finetuning.py DELETED
@@ -1,410 +0,0 @@
"""
finetuning_pipeline.py

Collects classes that streamline fine-tuning of embedding models.


#### Fine-tuning Steps

1. Prepare train, val and test data
    - If needed, chunk the data to get a list of LangChain Documents.
    - Split the list into train, val and test sub-groups.
    - For each sub-group, use an LLM to generate a list of POSITIVE (question, context) pairs.
      This is done by passing each context to the LLM along with a prompt asking for
      `n_questions` questions; the questions are extracted from the LLM output and paired
      with the underlying context. Note that each context is paired with more than one question.
    - Write the (question, context) pairs for the train, val and test sub-groups to jsonl
      files for future reference.
    - The train sub-group is loaded into a HF Dataset object for use in training.
2. Data loader
    - Set up the data loader.
    - This wraps the training data along with batch-size information.
3. Load the model to be fine-tuned
    - Use a HF model name to load the model.
4. Set up the loss function
    - Inner loss: MultipleNegativesRankingLoss.
    - Wrap the inner loss in an overall loss: MatryoshkaLoss.
5. Set up the fine-tuning pipeline
    - This includes data, model, loss and hyperparameters.
    - Hyperparameters include the number of epochs, warmup steps, etc.
6. Run the fine-tuning pipeline and get modified model embeddings
    - Save these embeddings.
    - See if they can be pushed to the HF Hub.
    - See if they can be downloaded from the HF Hub.
7. Validation loss
    - Run an assessment on the val sub-group.

A usage sketch appears at the end of this module.
"""

# imports
import uuid
import random
import re
import json

import tqdm

from torch.utils.data import DataLoader

from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.evaluation import InformationRetrievalEvaluator

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

class GenerateQuestionsForContexts:
    def __init__(self,
                 qa_chat_model_name="gpt-4o-mini",
                 n_questions=3):

        self.qa_chat_model_name = qa_chat_model_name
        # regex pattern used to extract questions from the LLM response:
        # first group is the question number - an integer - followed by a
        # literal period; second group is the question text that follows
        self.regex_pattern = r'^(\d+)\.(.+)'
        self.n_questions = n_questions

        self.set_up_chat_model()
        self.set_up_question_generation_chain()
        return

    def get_unique_id(self, id_set):
        """
        Generate a unique id not present in the input set of ids.
        Input
            a set of unique identifiers
        Returns
            a new unique id not in the input set
            the updated input set of ids, incl. the newly generated id
        """
        new_id = str(uuid.uuid4())
        while new_id in id_set:
            new_id = str(uuid.uuid4())
        id_set.add(new_id)
        return new_id, id_set

    def set_up_chat_model(self):
        self.qa_chat_model = ChatOpenAI(
            model=self.qa_chat_model_name,
            temperature=0
        )
        return self

    def set_up_question_generation_chain(self):
        qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""
        qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)
        self.question_generation_chain = qa_prompt_template | self.qa_chat_model
        return self

    def create_questions(self, documents, n_questions):
        questions = {}
        relevant_docs = {}

        q_id_set = set()
        for document in tqdm.tqdm(documents):  # note tqdm.tqdm (NOT just tqdm as in original notebook)
            this_question_set = \
                self.question_generation_chain.invoke(
                    {
                        'context': document.page_content,
                        'n_questions': n_questions
                    }
                )
            for question in this_question_set.content.split("\n"):
                if len(question) > 0:
                    try:
                        matched_pattern = re.search(self.regex_pattern, question)  # regex search for 'n. <question>'
                        if matched_pattern is not None and len(matched_pattern.group(2)) > 0:
                            q_id, q_id_set = self.get_unique_id(q_id_set)
                            questions[q_id] = matched_pattern.group(2).strip()  # extract the question string
                            relevant_docs[q_id] = [document.metadata["id"]]
                    except Exception:
                        continue
        return questions, relevant_docs

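
# A minimal sketch (not part of the original file) of how the regex above is
# meant to pull numbered questions out of a raw LLM response. The function
# name and the sample response text are hypothetical.
def _demo_question_extraction():
    sample_response = "1. What is chunking?\n2. Why split documents?"
    extracted = []
    for line in sample_response.split("\n"):
        matched = re.search(r'^(\d+)\.(.+)', line)
        if matched is not None:
            extracted.append(matched.group(2).strip())
    return extracted  # ['What is chunking?', 'Why split documents?']
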

class PrepareDataForFinetuning(GenerateQuestionsForContexts):
    def __init__(self,
                 chunk_size=None, chunk_overlap=None, len_function=None,
                 lcdocuments=None, run_optional_text_splitter=False,
                 all_splits=None, train_val_test_size=[10, 5, 5],
                 train_val_test_split_type='random',
                 random_seed=69, qa_chat_model_name="gpt-4o-mini",
                 n_questions=2, batch_size=5):

        super().__init__(qa_chat_model_name=qa_chat_model_name,
                         n_questions=n_questions)

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.len_function = len_function

        self.lcdocuments = lcdocuments
        self.run_optional_text_splitter = run_optional_text_splitter

        self.all_doc_splits = all_splits

        self.train_val_test_size = train_val_test_size
        self.n_train = self.train_val_test_size[0]
        self.n_val = self.train_val_test_size[1]
        self.n_test = self.train_val_test_size[2]
        self.train_val_test_split_type = train_val_test_split_type

        self.random_seed = random_seed
        self.batch_size = batch_size
        return

    def optional_text_splitter(self):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=self.len_function
        )
        self.all_doc_splits = text_splitter.split_documents(self.lcdocuments.load())
        return self

    def attach_unique_ids_to_docs(self):
        id_set = set()
        for docsplit in self.all_doc_splits:
            doc_id, id_set = self.get_unique_id(id_set)
            docsplit.metadata["id"] = doc_id
        return self

    def simple_train_val_test_splits(self):
        self.training_splits = self.all_doc_splits[:self.n_train]
        self.val_splits = self.all_doc_splits[self.n_train:self.n_train + self.n_val]
        self.test_splits = self.all_doc_splits[self.n_train + self.n_val:]
        return self

    def randomized_train_val_test_splits(self):
        # set a fixed seed so the result of the random shuffle
        # below can be replicated
        random.seed(self.random_seed)

        # randomly reorder the elements of the list of document splits
        randomly_ordered_documents = self.all_doc_splits.copy()
        random.shuffle(randomly_ordered_documents)

        # assign slices to training, val and test
        self.training_splits = randomly_ordered_documents[:self.n_train]
        self.val_splits = randomly_ordered_documents[self.n_train:self.n_train + self.n_val]
        self.test_splits = randomly_ordered_documents[self.n_train + self.n_val:]
        return self

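    # Illustrative note (not part of the original file): both splitters above
    # hand the *remainder* to the test slice, so the test split only has
    # exactly n_test elements when len(all_doc_splits) == n_train + n_val + n_test.
    # For example, with 9 docs and train_val_test_size=[4, 2, 2]:
    #   docs[:4] -> 4 train docs, docs[4:6] -> 2 val docs, docs[6:] -> 3 test docs.
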
    def get_all_questions(self):
        self.training_questions, self.training_relevant_contexts = \
            self.create_questions(documents=self.training_splits, n_questions=self.n_questions)
        self.val_questions, self.val_relevant_contexts = \
            self.create_questions(documents=self.val_splits, n_questions=self.n_questions)
        self.test_questions, self.test_relevant_contexts = \
            self.create_questions(documents=self.test_splits, n_questions=self.n_questions)
        return self

    def save_dataset_to_jsonl(self, splits, questions, relevant_contexts, jsonl_filename):
        """
        NOTE: Each `jsonl` file has a single line! It is a nested JSON structure.
        The top-level keys in each file are `questions`, `relevant_contexts` and `corpus`.
        1. Each `questions` element maps a unique question id to the
           corresponding question string.
        2. Each `relevant_contexts` element maps a question id to a
           single-element list containing the unique id of the underlying context.
        3. Each `corpus` element maps a unique context id to the context string.
        """
        corpus = {item.metadata["id"]: item.page_content for item in splits}
        dataset_dict = {
            "questions": questions,
            "relevant_contexts": relevant_contexts,
            "corpus": corpus
        }
        with open(jsonl_filename, "w") as f:
            json.dump(dataset_dict, f)
        return dataset_dict

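    # Sketch (not part of the original file) of the on-disk structure that
    # save_dataset_to_jsonl writes, with hypothetical shortened ids:
    # {
    #   "questions":         {"q-1": "What does chunking do?", "q-2": "..."},
    #   "relevant_contexts": {"q-1": ["ctx-7"], "q-2": ["ctx-7"]},
    #   "corpus":            {"ctx-7": "the context string ..."}
    # }
    # Because json.dump writes the whole object on a single line, it can be
    # read back with a plain json.load rather than a line-by-line jsonl reader.
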
    def save_train_val_test_dataset_to_jsonl(self):
        self.train_dataset = \
            self.save_dataset_to_jsonl(self.training_splits,
                                       self.training_questions,
                                       self.training_relevant_contexts,
                                       jsonl_filename='./data/finetuning_data/training_dataset.jsonl')

        self.val_dataset = \
            self.save_dataset_to_jsonl(self.val_splits,
                                       self.val_questions,
                                       self.val_relevant_contexts,
                                       jsonl_filename='./data/finetuning_data/val_dataset.jsonl')

        self.test_dataset = \
            self.save_dataset_to_jsonl(self.test_splits,
                                       self.test_questions,
                                       self.test_relevant_contexts,
                                       jsonl_filename='./data/finetuning_data/test_dataset.jsonl')
        return self

    def run_all_prep_data(self):
        # if docs are passed in pre-chunking, then split the docs
        if self.run_optional_text_splitter is True:
            self.optional_text_splitter()

        # each chunk, i.e. context, gets a unique id
        self.attach_unique_ids_to_docs()

        # split into train, val and test - either random or simple slicing
        if self.train_val_test_split_type.upper() == 'RANDOM':
            self.randomized_train_val_test_splits()
        else:
            self.simple_train_val_test_splits()

        # generate questions for each context
        # this step involves a large number of LLM calls
        self.get_all_questions()

        # save the train, val and test datasets in jsonl format
        self.save_train_val_test_dataset_to_jsonl()
        return self


class FineTuneModel:
    def __init__(self,
                 train_data,
                 val_data,
                 batch_size,
                 base_model_id='Snowflake/snowflake-arctic-embed-m',
                 matryoshka_dimensions=[768, 512, 256, 128, 64],
                 number_of_training_epochs=5,
                 finetuned_model_output_path='finetuned_arctic',
                 evaluation_steps=50):
        self.train_data = train_data
        self.val_data = val_data
        self.batch_size = batch_size

        self.base_model_id = base_model_id
        self.matryoshka_dimensions = matryoshka_dimensions
        self.number_of_training_epochs = number_of_training_epochs
        self.finetuned_model_output_path = finetuned_model_output_path
        self.evaluation_steps = evaluation_steps

        self.model = SentenceTransformer(self.base_model_id)
        return

    def prepare_data_for_finetuning(self, data):
        corpus = data['corpus']
        queries = data['questions']
        relevant_docs = data['relevant_contexts']
        return corpus, queries, relevant_docs

    def get_data_loader(self):
        corpus, queries, relevant_docs = self.prepare_data_for_finetuning(self.train_data)

        examples = []
        for query_id, query in queries.items():
            doc_id = relevant_docs[query_id][0]
            text = corpus[doc_id]
            example = InputExample(texts=[query, text])
            examples.append(example)
        self.loader = DataLoader(examples, batch_size=self.batch_size)
        return self

    def loss_function(self):
        inner_training_loss = MultipleNegativesRankingLoss(self.model)
        self.train_loss = MatryoshkaLoss(
            self.model,
            inner_training_loss,
            matryoshka_dims=self.matryoshka_dimensions
        )
        return self

    def get_evaluator_for_val(self):
        corpus, queries, relevant_docs = self.prepare_data_for_finetuning(self.val_data)
        self.evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)
        return self

    def fit_model(self):
        warmup_steps = int(len(self.loader) * self.number_of_training_epochs * 0.1)
        self.model.fit(
            train_objectives=[(self.loader, self.train_loss)],
            epochs=self.number_of_training_epochs,
            warmup_steps=warmup_steps,
            output_path=self.finetuned_model_output_path,
            show_progress_bar=True,
            evaluator=self.evaluator,
            evaluation_steps=self.evaluation_steps,
        )

    def run_steps_to_finetune_model(self):
        # load the train data into a DataLoader
        self.get_data_loader()

        # set up the loss function
        self.loss_function()

        # set up the evaluator with the val data
        self.get_evaluator_for_val()

        # finetune the model
        self.fit_model()
        return self

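
# A small sketch (not part of the original file) of the arithmetic behind
# fit_model's warmup_steps and the input shape InformationRetrievalEvaluator
# consumes. All values below are hypothetical.
def _demo_warmup_and_evaluator_inputs():
    # warmup covers ~10% of total optimizer steps:
    # e.g. 100 training pairs with batch_size=5 -> 20 batches per epoch;
    # 20 batches * 5 epochs * 0.1 -> 10 warmup steps
    warmup_steps = int(20 * 5 * 0.1)

    # the evaluator takes three aligned dicts keyed by string ids
    queries = {"q-1": "What is an embedding model?"}
    corpus = {"ctx-7": "An embedding model maps text to vectors ..."}
    relevant_docs = {"q-1": ["ctx-7"]}
    evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)
    return warmup_steps, evaluator
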

class FineTuneModelAndEvaluateRetriever(FineTuneModel):
    def __init__(self,
                 train_data,
                 val_data,
                 test_data,
                 batch_size,
                 base_model_id='Snowflake/snowflake-arctic-embed-m',
                 matryoshka_dimensions=[768, 512, 256, 128, 64],
                 number_of_training_epochs=5,
                 finetuned_model_output_path='finetuned_arctic',
                 evaluation_steps=50,
                 ):
        super().__init__(train_data=train_data,
                         val_data=val_data,
                         batch_size=batch_size,
                         base_model_id=base_model_id,
                         matryoshka_dimensions=matryoshka_dimensions,
                         number_of_training_epochs=number_of_training_epochs,
                         finetuned_model_output_path=finetuned_model_output_path,
                         evaluation_steps=evaluation_steps)
        self.test_data = test_data
        return

    def set_up_test_data_for_retrieval(self, embedding_model_for_retrieval, top_k_for_retrieval):
        corpus, questions, relevant_docs = self.prepare_data_for_finetuning(self.test_data)

        documents = [Document(page_content=content, metadata={"id": doc_id})
                     for doc_id, content in corpus.items()]

        vectorstore = FAISS.from_documents(documents, embedding_model_for_retrieval)
        retriever = vectorstore.as_retriever(search_kwargs={"k": top_k_for_retrieval})
        return corpus, questions, relevant_docs, retriever

    def evaluate_embeddings_model(self, embedding_model_for_retrieval, top_k_for_retrieval, verbose=False):
        corpus, questions, relevant_docs, retriever = \
            self.set_up_test_data_for_retrieval(embedding_model_for_retrieval, top_k_for_retrieval)
        eval_results = []
        for q_id, question in tqdm.tqdm(questions.items()):
            retrieved_nodes = retriever.invoke(question)
            retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
            expected_id = relevant_docs[q_id][0]
            is_hit = expected_id in retrieved_ids
            eval_results.append({"id": q_id, "question": question, "expected_id": expected_id, "is_hit": is_hit})
        return eval_results
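

# A minimal end-to-end usage sketch (not part of the original file). The
# document list, paths and sizes are hypothetical; it assumes OPENAI_API_KEY
# is set, ./data/finetuning_data/ exists, and that langchain-huggingface is
# installed for re-loading the fine-tuned weights as a retrieval model.
def _example_end_to_end(pre_chunked_lc_documents):
    prep = PrepareDataForFinetuning(all_splits=pre_chunked_lc_documents,
                                    train_val_test_size=[10, 5, 5],
                                    n_questions=2,
                                    batch_size=5)
    prep.run_all_prep_data()

    tuner = FineTuneModelAndEvaluateRetriever(train_data=prep.train_dataset,
                                              val_data=prep.val_dataset,
                                              test_data=prep.test_dataset,
                                              batch_size=5)
    tuner.run_steps_to_finetune_model()

    # evaluate hit rate of the fine-tuned embeddings on the test split
    from langchain_huggingface import HuggingFaceEmbeddings  # assumed installed
    finetuned_embeddings = HuggingFaceEmbeddings(model_name='finetuned_arctic')
    results = tuner.evaluate_embeddings_model(finetuned_embeddings, top_k_for_retrieval=5)
    hit_rate = sum(r["is_hit"] for r in results) / len(results)
    return hit_rate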