Spaces:
Runtime error
Runtime error
Fix error related to faiss
Browse files
- functions.py +19 -19
functions.py
CHANGED
@@ -29,8 +29,8 @@ def get_nearest_examples(question: str, k: int):
|
|
29 |
scores, samples = embeddings_dataset.get_nearest_examples(
|
30 |
"embeddings", question_embedding, k)
|
31 |
print(['get_nearest_examples', 'scores and samples'])
|
32 |
-
|
33 |
-
|
34 |
print(['get_nearest_examples', 'end'])
|
35 |
return samples
|
36 |
|
@@ -44,10 +44,6 @@ def get_embeddings(text):
|
|
44 |
encoded_input = {k: v.to('cuda') for k, v in encoded_input.items()}
|
45 |
model_output = emb_model(**encoded_input)
|
46 |
model_output = model_output.last_hidden_state[:, 0]
|
47 |
-
# print(model_output)
|
48 |
-
# Error: AttributeError: 'numpy.ndarray' object has no attribute 'cpu'
|
49 |
-
# emb_item = model_output.detach().cpu().numpy()[0]
|
50 |
-
# print(emb_item)
|
51 |
print(['get_embeddings', 'end'])
|
52 |
return model_output
|
53 |
|
@@ -56,9 +52,11 @@ def build_faiss_index(text):
|
|
56 |
print(['build_faiss_index', 'start'])
|
57 |
text_list = split_text(text)
|
58 |
emb_list = []
|
59 |
-
for item in text_list:
|
60 |
-
emb_list.append({
|
61 |
-
|
|
|
|
|
62 |
dataset = Dataset.from_list(emb_list)
|
63 |
dataset.add_faiss_index(column="embeddings")
|
64 |
shared['embeddings_dataset'] = dataset
|
@@ -125,13 +123,18 @@ def get_answer_context():
|
|
125 |
|
126 |
|
127 |
def answer_question(question: str):
|
128 |
-
# return ', '.join([len(shared['base_text']), len(question)])
|
129 |
print(['answer_question', 'start'])
|
|
|
|
|
130 |
if not shared['embeddings_dataset']:
|
131 |
-
build_faiss_index(
|
132 |
-
top_k_samples = get_nearest_examples(question, k=
|
|
|
|
|
|
|
|
|
133 |
|
134 |
-
context = '\n'.join(top_k_samples)
|
135 |
|
136 |
input_text = f"""<s>Instruction: Te voy a proporcionar un texto del cual deseo que me respondas una pregunta.
|
137 |
El texto es el siguiente: `{context}`\nInput: {question}\nOutput: """
|
@@ -162,9 +165,8 @@ def load_model(peft_model_id):
|
|
162 |
return model, tokenizer
|
163 |
|
164 |
|
165 |
-
def load_embeddings_model():
|
166 |
print(['load_embeddings_model', 'start'])
|
167 |
-
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
|
168 |
print(['load_embeddings_model', 'loading tokenizer'])
|
169 |
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
|
170 |
print(['load_embeddings_model', 'loading model'])
|
@@ -174,7 +176,5 @@ def load_embeddings_model():
|
|
174 |
return model, tokenizer
|
175 |
|
176 |
|
177 |
-
model, tokenizer = load_model(
|
178 |
-
|
179 |
-
|
180 |
-
emb_model, emb_tokenizer = load_embeddings_model()
|
|
|
29 |
scores, samples = embeddings_dataset.get_nearest_examples(
|
30 |
"embeddings", question_embedding, k)
|
31 |
print(['get_nearest_examples', 'scores and samples'])
|
32 |
+
print(scores)
|
33 |
+
print(samples['id'])
|
34 |
print(['get_nearest_examples', 'end'])
|
35 |
return samples
|
36 |
|
|
|
44 |
encoded_input = {k: v.to('cuda') for k, v in encoded_input.items()}
|
45 |
model_output = emb_model(**encoded_input)
|
46 |
model_output = model_output.last_hidden_state[:, 0]
|
|
|
|
|
|
|
|
|
47 |
print(['get_embeddings', 'end'])
|
48 |
return model_output
|
49 |
|
|
|
52 |
print(['build_faiss_index', 'start'])
|
53 |
text_list = split_text(text)
|
54 |
emb_list = []
|
55 |
+
for i, item in enumerate(text_list):
|
56 |
+
emb_list.append({
|
57 |
+
"embeddings": get_embeddings(item).cpu().detach().numpy()[0],
|
58 |
+
'id': i
|
59 |
+
})
|
60 |
dataset = Dataset.from_list(emb_list)
|
61 |
dataset.add_faiss_index(column="embeddings")
|
62 |
shared['embeddings_dataset'] = dataset
|
|
|
123 |
|
124 |
|
125 |
def answer_question(question: str):
|
|
|
126 |
print(['answer_question', 'start'])
|
127 |
+
full_text = shared['full_text']
|
128 |
+
|
129 |
if not shared['embeddings_dataset']:
|
130 |
+
build_faiss_index(full_text)
|
131 |
+
top_k_samples = get_nearest_examples(question, k=3)
|
132 |
+
|
133 |
+
index_text = {}
|
134 |
+
for i, t in enumerate(split_text(full_text)):
|
135 |
+
index_text[i] = t
|
136 |
|
137 |
+
context = '\n'.join([index_text[id] for id in top_k_samples['id']])
|
138 |
|
139 |
input_text = f"""<s>Instruction: Te voy a proporcionar un texto del cual deseo que me respondas una pregunta.
|
140 |
El texto es el siguiente: `{context}`\nInput: {question}\nOutput: """
|
|
|
165 |
return model, tokenizer
|
166 |
|
167 |
|
168 |
+
def load_embeddings_model(model_ckpt:str):
|
169 |
print(['load_embeddings_model', 'start'])
|
|
|
170 |
print(['load_embeddings_model', 'loading tokenizer'])
|
171 |
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
|
172 |
print(['load_embeddings_model', 'loading model'])
|
|
|
176 |
return model, tokenizer
|
177 |
|
178 |
|
179 |
+
model, tokenizer = load_model("hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
|
180 |
+
emb_model, emb_tokenizer = load_embeddings_model("sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
|
|
|