milyiyo committed
Commit 0bcdbb7 · 1 Parent(s): 941bfc4

Store the extracted text to be used during faiss' index creation

Files changed (1):
functions.py +7 -10
functions.py CHANGED
@@ -18,13 +18,9 @@ device = 'cuda'
 shared = {
     'answer_context': None,
     'embeddings_dataset': None,
-    'base_text': None,
+    'full_text': None,
 }
 
-def store_text_changes(text):
-    shared['base_text'] = text
-
-
 def get_nearest_examples(question: str, k: int):
     print(['get_nearest_examples', 'start'])
     question_embedding = get_embeddings([question]).cpu().detach().numpy()
@@ -69,6 +65,7 @@ def extract_text(url: str):
     response = requests.get(url)
     soup = BeautifulSoup(response.text, "html.parser")
     text = '\n\n'.join(map(lambda p: p.text, soup.find_all('p')))
+    shared['full_text'] = text
     print(['extract_text', 'end'])
     return text
 
@@ -121,10 +118,10 @@ def get_answer_context():
 
 
 def answer_question(question: str):
-    return ', '.join([len(shared['base_text']), len(question)])
+    # return ', '.join([len(shared['base_text']), len(question)])
     print(['answer_question', 'start'])
     if not shared['embeddings_dataset']:
-        build_faiss_index(full_text)
+        build_faiss_index(shared['full_text'])
     top_k_samples = get_nearest_examples(question, k=5)
 
     context = '\n'.join(top_k_samples)
@@ -170,7 +167,7 @@ def load_embeddings_model():
     return model, tokenizer
 
 
-# model, tokenizer = load_model(
-#     "hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
+model, tokenizer = load_model(
+    "hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
 
-# emb_model, emb_tokenizer = load_embeddings_model()
+emb_model, emb_tokenizer = load_embeddings_model()
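
For context on how the pieces fit together after this change: extract_text now saves the scraped page into shared['full_text'], and the first call to answer_question passes that text to build_faiss_index before querying. The sketch below is a hypothetical implementation of build_faiss_index, not the repository's code; it assumes the get_embeddings helper and the shared dict from functions.py, and uses the Hugging Face datasets FAISS integration that the get_nearest_examples call implies.

# Hypothetical sketch only -- not the repository's build_faiss_index.
# Assumes the get_embeddings helper and the shared dict from functions.py.
from datasets import Dataset

def build_faiss_index(full_text: str):
    print(['build_faiss_index', 'start'])
    # extract_text joins <p> contents with '\n\n', so split back into paragraphs.
    paragraphs = [p for p in full_text.split('\n\n') if p.strip()]
    ds = Dataset.from_dict({'text': paragraphs})
    # Embed each paragraph with the same model used for the questions.
    ds = ds.map(lambda row: {
        'embeddings': get_embeddings([row['text']]).cpu().detach().numpy()[0]
    })
    # Index the embeddings column so get_nearest_examples can query it.
    ds.add_faiss_index(column='embeddings')
    shared['embeddings_dataset'] = ds
    print(['build_faiss_index', 'end'])

With the index cached in shared['embeddings_dataset'], later calls to answer_question skip the rebuild and go straight to get_nearest_examples.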