# Page-header residue from the hosting site's file viewer (not part of the module):
# giovannefeitosa's picture
# Initial commit
# f7db77c
# raw
# history blame
# 1.73 kB
from tqdm import tqdm
import numpy as np
from commons.Configs import configs
from commons.File import file
from commons.OpenAIClient import openaiClient
class Embeddings:
    """Generate question/answer embeddings and load them back as NumPy arrays.

    ``generateEmbeddings`` reads the dataset at ``configs.generatedDatasetPath``,
    requests embeddings for every question/answer pair through ``openaiClient``
    and writes the results to ``configs.generatedEmbeddingsPath``.
    ``loadEmbeddings`` reads that file back and returns the data as arrays.
    """

    def __init__(self, debug: bool = False) -> None:
        """Store the debug flag.

        Args:
            debug: Kept on the instance as ``self.debug``. No method of this
                class currently reads it.
        """
        self.debug = debug

    def generateEmbeddings(self) -> None:
        """Embed every dataset entry and write all results to the output file.

        Each dataset entry is indexed with the keys ``'question'`` and
        ``'answer'``. Both sentences are sent to
        ``openaiClient.generateEmbeddings`` in a single call; the first two
        items of the result are stored as the question and answer embeddings,
        together with the entry's position in the dataset as ``'label'``.

        Nothing is written until the whole dataset has been processed.
        """
        inputFilePath = configs.generatedDatasetPath
        outputFilePath = configs.generatedEmbeddingsPath
        dataset = file.readJsonFile(inputFilePath)
        embeddings = []
        print("")
        # for each sentence
        for i, qa in enumerate(tqdm(dataset)):
            sentences = [qa['question'], qa['answer']]
            emb = openaiClient.generateEmbeddings(sentences)
            embjson = {'question': emb[0], 'answer': emb[1], 'label': i}
            # tqdm.write instead of print: a plain print inside a tqdm loop
            # interleaves with the live progress bar and garbles it. The text
            # (including the double space after "Sentence:") matches what
            # print("Sentence: ", i, sentences) produced.
            tqdm.write(f"Sentence:  {i} {sentences}")
            embeddings.append(embjson)
        # save all the generated embeddings
        # Default: io/generated/embeddings.json
        print("Writing embeddings to file: ", outputFilePath)
        file.writeFile(outputFilePath, embeddings)

    def loadEmbeddings(self):
        """Load the stored embeddings and return them as NumPy arrays.

        Reads the file at ``configs.generatedEmbeddingsPath`` and collects the
        ``'question'``, ``'answer'`` and ``'label'`` value of every record.

        Returns:
            A 3-tuple ``(questions, answers, labels)``: the question and
            answer embeddings as ``float32`` arrays and the labels as an
            ``int32`` array. The embedding arrays are presumably shaped
            ``(n_records, embedding_dim)`` -- TODO confirm against the stored
            data.
        """
        inputFilePath = configs.generatedEmbeddingsPath
        embeddings = file.readJsonFile(inputFilePath)
        questionEmbeddings = [x['question'] for x in embeddings]
        answerEmbeddings = [x['answer'] for x in embeddings]
        labels = [x['label'] for x in embeddings]
        # i would use float16, but I've had issues with GPU
        # I know I'm not using GPU now, but I might in the future
        return (
            np.array(questionEmbeddings, dtype=np.float32),
            np.array(answerEmbeddings, dtype=np.float32),
            np.array(labels, dtype=np.int32),
        )
# Module-level instance, created at import time with the default debug=False.
# NOTE(review): presumably meant to be imported by other modules as a shared
# instance, like the `configs` / `file` / `openaiClient` objects imported
# above -- confirm against callers.
embeddings = Embeddings()