milyiyo committed
Commit 5355c89 · Parent: c9916d8

Fix error related to faiss
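In short: the app embeds text chunks on the GPU, but `Dataset.add_faiss_index` needs plain NumPy arrays, so each embedding has to be moved to the CPU, detached from autograd, and converted before indexing. A minimal sketch of that conversion (a random CPU tensor stands in for the app's CUDA model output so the snippet runs anywhere):

```python
import torch

# Stand-in for model_output.last_hidden_state[:, 0], which in the app
# is a (1, hidden_size) tensor living on the GPU.
emb = torch.rand(1, 768)

# Move to CPU, drop the autograd graph, convert to NumPy, keep the single row.
vec = emb.cpu().detach().numpy()[0]
print(vec.shape)  # (768,)
```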

Files changed (1):
  functions.py (+19, −19)
functions.py CHANGED
@@ -29,8 +29,8 @@ def get_nearest_examples(question: str, k: int):
     scores, samples = embeddings_dataset.get_nearest_examples(
         "embeddings", question_embedding, k)
     print(['get_nearest_examples', 'scores and samples'])
-    for i in range(len(scores)):
-        print([scores[i], samples[i]])
+    print(scores)
+    print(samples['id'])
     print(['get_nearest_examples', 'end'])
     return samples
 
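Context for this hunk: `Dataset.get_nearest_examples` returns a `(scores, samples)` pair where `samples` is a column-oriented dict (column name → list of the k retrieved values), not a list of rows, so the old `samples[i]` integer indexing could not work. The new prints address columns by name; `samples['id']` relies on the `id` column added in `build_faiss_index` below.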
 
@@ -44,10 +44,6 @@ def get_embeddings(text):
     encoded_input = {k: v.to('cuda') for k, v in encoded_input.items()}
     model_output = emb_model(**encoded_input)
     model_output = model_output.last_hidden_state[:, 0]
-    # print(model_output)
-    # Error: AttributeError: 'numpy.ndarray' object has no attribute 'cpu'
-    # emb_item = model_output.detach().cpu().numpy()[0]
-    # print(emb_item)
     print(['get_embeddings', 'end'])
     return model_output
 
 
@@ -56,9 +52,11 @@ def build_faiss_index(text):
     print(['build_faiss_index', 'start'])
     text_list = split_text(text)
     emb_list = []
-    for item in text_list:
-        emb_list.append({"embeddings": get_embeddings(item)})
-    # dataset = DatasetDict({'train': emb_list})
+    for i, item in enumerate(text_list):
+        emb_list.append({
+            "embeddings": get_embeddings(item).cpu().detach().numpy()[0],
+            'id': i
+        })
     dataset = Dataset.from_list(emb_list)
     dataset.add_faiss_index(column="embeddings")
     shared['embeddings_dataset'] = dataset
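This hunk is the heart of the fix: each chunk's embedding is converted to a CPU NumPy array before the `Dataset` is built, and an `id` column records each chunk's position so retrieved rows can be mapped back to their text. A self-contained sketch of the same pattern (requires the `datasets` and `faiss-cpu` packages; the `embed` function is a hypothetical stand-in for the app's `get_embeddings`):

```python
import numpy as np
from datasets import Dataset

def embed(text: str) -> np.ndarray:
    # Hypothetical stand-in: a pseudo-random vector derived from the text.
    rng = np.random.default_rng(abs(hash(text)) % 2**32)
    return rng.random(768, dtype=np.float32)

chunks = ["first chunk", "second chunk", "third chunk"]

# One row per chunk; 'id' remembers the chunk's position.
dataset = Dataset.from_list(
    [{"embeddings": embed(c), "id": i} for i, c in enumerate(chunks)]
)
dataset.add_faiss_index(column="embeddings")

scores, samples = dataset.get_nearest_examples("embeddings", embed("query"), k=2)
print(scores)         # FAISS scores of the 2 nearest rows
print(samples["id"])  # their ids, usable as indices into `chunks`
```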
@@ -125,13 +123,18 @@ def get_answer_context():
 
 
 def answer_question(question: str):
-    # return ', '.join([len(shared['base_text']), len(question)])
     print(['answer_question', 'start'])
+    full_text = shared['full_text']
+
     if not shared['embeddings_dataset']:
-        build_faiss_index(shared['full_text'])
-    top_k_samples = get_nearest_examples(question, k=5)
+        build_faiss_index(full_text)
+    top_k_samples = get_nearest_examples(question, k=3)
+
+    index_text = {}
+    for i, t in enumerate(split_text(full_text)):
+        index_text[i] = t
 
-    context = '\n'.join(top_k_samples)
+    context = '\n'.join([index_text[id] for id in top_k_samples['id']])
 
     input_text = f"""<s>Instruction: Te voy a proporcionar un texto del cual deseo que me respondas una pregunta.
     El texto es el siguiente: `{context}`\nInput: {question}\nOutput: """
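Note on the retrieval change: the old `'\n'.join(top_k_samples)` would have joined the dict's keys (the column names) rather than any retrieved text, which is why the context is now rebuilt by mapping each retrieved `id` back to its chunk; this assumes `split_text` chunks the text identically here and at index-build time (`k` also drops from 5 to 3). The lookup dict can be written more directly as a list, sketched here with hypothetical stand-ins for the app's values:

```python
def split_text(text: str) -> list[str]:  # stand-in for the app's chunker
    return text.split('\n\n')

full_text = "chunk one\n\nchunk two\n\nchunk three"
top_k_samples = {'id': [2, 0]}  # shape returned by get_nearest_examples

# List indexing replaces the index_text dict built in the commit.
chunks = split_text(full_text)
context = '\n'.join(chunks[i] for i in top_k_samples['id'])
print(context)  # chunk three\nchunk one
```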
@@ -162,9 +165,8 @@ def load_model(peft_model_id):
     return model, tokenizer
 
 
-def load_embeddings_model():
+def load_embeddings_model(model_ckpt:str):
     print(['load_embeddings_model', 'start'])
-    model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
     print(['load_embeddings_model', 'loading tokenizer'])
     tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
     print(['load_embeddings_model', 'loading model'])
@@ -174,7 +176,5 @@ def load_embeddings_model():
     return model, tokenizer
 
 
-model, tokenizer = load_model(
-    "hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
-
-emb_model, emb_tokenizer = load_embeddings_model()
+model, tokenizer = load_model("hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
+emb_model, emb_tokenizer = load_embeddings_model("sentence-transformers/multi-qa-mpnet-base-dot-v1")