milyiyo committed
Commit 6ce5a5b · 1 Parent(s): b3d009b

Add functions to implement missing features

Files changed (1):
  1. functions.py  +107  -8
functions.py CHANGED

@@ -1,10 +1,11 @@
 import os
-
 import requests
+import random
 import torch
 from bs4 import BeautifulSoup
 from peft import PeftConfig, PeftModel
-from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, AutoModel
+from datasets import DatasetDict

 # os.environ["CUDA_VISIBLE_DEVICES"] = "0"

@@ -12,6 +13,49 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 generation_config = GenerationConfig(temperature=.8,
                                      top_p=0.75,
                                      top_k=40)
+device = 'cuda'
+
+shared = {
+    'answer_context': None,
+    'embeddings_dataset': None
+}
+
+
+def get_nearest_examples(question: str, k: int):
+    print(['get_nearest_examples', 'start'])
+    question_embedding = get_embeddings([question]).cpu().detach().numpy()
+    embeddings_dataset = shared['embeddings_dataset']
+    scores, samples = embeddings_dataset.get_nearest_examples(
+        "embeddings", question_embedding, k)
+    print(['get_nearest_examples', 'scores and samples'])
+    for i in range(len(scores)):
+        print([scores[i], samples[i]])
+    print(['get_nearest_examples', 'end'])
+    return samples
+
+
+def get_embeddings(text):
+    print(['get_embeddings', 'start'])
+    encoded_input = tokenizer(
+        text, padding=True, truncation=True, return_tensors="pt")
+    encoded_input = {k: v.to('cuda') for k, v in encoded_input.items()}
+    model_output = model(**encoded_input)
+    model_output = model_output.last_hidden_state[:, 0]
+    emb_item = model_output.detach().cpu().numpy()[0]
+    print(['get_embeddings', 'end'])
+    return emb_item
+
+
+def build_faiss_index(text):
+    print(['build_faiss_index', 'start'])
+    text_list = split_text(text)
+    emb_list = []
+    for item in text_list:
+        emb_list.append({"embeddings": get_embeddings(item)})
+    dataset = DatasetDict({'train': emb_list})
+    dataset.add_faiss_index(column="embeddings")
+    shared['embeddings_dataset'] = dataset
+    print(['build_faiss_index', 'end'])


 def extract_text(url: str):
@@ -25,11 +69,17 @@ def extract_text(url: str):
     return text


+def split_text(text: str):
+    lines = text.split('\n')
+    lines = [line.strip() for line in lines if line.strip()]
+    return lines
+
+
 def summarize_text(text: str):
     print(['summarize_text', 'start'])
     input_text = f'<s>Instruction: Elabora un resume del siguiente texto.\nInput: {text}\nOutput: '
     batch = tokenizer(input_text, return_tensors='pt')
-    batch = batch.to('cuda')
+    batch = batch.to(device)
     print(['summarize_text', 'generating'])
     with torch.cuda.amp.autocast():
         output_tokens = model.generate(**batch,
@@ -41,14 +91,49 @@ def summarize_text(text: str):
     print(['summarize_text', 'end'])
     return output

-def generate_question(text:str):
-    return 'Pregunta de ejemplo.'
+
+def generate_question(text: str):
+    print(['generate_question', 'start'])
+    # Get a random section of the whole text to generate a question
+    fragments = split_text(text)
+    rnd_text = random.choice(fragments)
+    shared['answer_context'] = rnd_text
+
+    input_text = f'<s>Instruction: Dado el siguiente texto quiero que generes una pregunta cuya respuesta se encuentre en él.\nInput: {rnd_text}\nOutput: '
+    batch = tokenizer(input_text, return_tensors='pt')
+    print(['generate_question', 'generating'])
+    with torch.cuda.amp.autocast():
+        output_tokens = model.generate(**batch,
+                                       max_new_tokens=256,
+                                       generation_config=generation_config)
+    output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
+    print(['generate_question', 'end'])
+    return output
+

 def get_answer_context():
-    return 'Aquí está la respuesta.'
+    return shared['answer_context']
+

-def answer_question(question:str):
-    return 'Esta es la respuesta a su pregunta.'
+def answer_question(full_text: str, question: str):
+    print(['answer_question', 'start'])
+    if not shared['embeddings_dataset']:
+        build_faiss_index(full_text)
+    top_k_samples = get_nearest_examples(question, k=5)
+
+    context = '\n'.join(top_k_samples)
+
+    input_text = f"""<s>Instruction: Te voy a proporcionar un texto del cual deseo que me respondas una pregunta.
+    El texto es el siguiente: `{context}`\nInput: {question}\nOutput: """
+    batch = tokenizer(input_text, return_tensors='pt')
+    print(['answer_question', 'generating'])
+    with torch.cuda.amp.autocast():
+        output_tokens = model.generate(**batch,
+                                       max_new_tokens=256,
+                                       generation_config=generation_config)
+    output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
+    print(['answer_question', 'end'])
+    return output


 def load_model(peft_model_id):
@@ -67,5 +152,19 @@ def load_model(peft_model_id):
     return model, tokenizer


+def load_embeddings_model():
+    print(['load_embeddings_model', 'start'])
+    model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
+    print(['load_embeddings_model', 'loading tokenizer'])
+    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+    print(['load_embeddings_model', 'loading model'])
+    model = AutoModel.from_pretrained(model_ckpt)
+    model = model.to(device)
+    print(['load_embeddings_model', 'end'])
+    return model, tokenizer
+
+
 model, tokenizer = load_model(
     "hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
+
+emb_model, emb_tokenizer = load_embeddings_model()
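
Note on the new retrieval path (not part of the commit): as committed, get_embeddings tokenizes and forwards text through the module-level tokenizer and model (the OPT LoRA generator, whose output has no last_hidden_state) rather than emb_tokenizer / emb_model, build_faiss_index wraps a plain Python list in a DatasetDict, and the index stores only embeddings, so answer_question has no text column to join. A minimal sketch of how these three helpers could be wired up, assuming the sentence-transformers checkpoint loaded by load_embeddings_model() and the datasets FAISS API (Dataset.from_dict, add_faiss_index, get_nearest_examples); device, shared, split_text, emb_model and emb_tokenizer are the globals defined in functions.py:

from datasets import Dataset


def get_embeddings(text_list):
    # Embed with the sentence-transformers model (CLS-token pooling),
    # not with the OPT causal LM used for generation.
    encoded_input = emb_tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt")
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = emb_model(**encoded_input)
    return model_output.last_hidden_state[:, 0].detach().cpu().numpy()


def build_faiss_index(text):
    # Keep the raw fragments next to their embeddings so retrieval can
    # hand text back to answer_question, then index the embeddings column.
    fragments = split_text(text)
    embeddings = get_embeddings(fragments)
    dataset = Dataset.from_dict(
        {"text": fragments, "embeddings": embeddings.tolist()})
    dataset.add_faiss_index(column="embeddings")
    shared['embeddings_dataset'] = dataset


def get_nearest_examples(question: str, k: int):
    # get_embeddings already returns a NumPy array; no extra .cpu()/.numpy().
    question_embedding = get_embeddings([question])[0]
    scores, samples = shared['embeddings_dataset'].get_nearest_examples(
        "embeddings", question_embedding, k=k)
    # samples is a dict of columns; return the retrieved fragments so that
    # '\n'.join(top_k_samples) in answer_question works as written.
    return samples["text"]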
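
For context, a hypothetical end-to-end driver for the functions this commit adds (the URL is a placeholder, not taken from the repository):

url = "https://es.wikipedia.org/wiki/Procesamiento_de_lenguajes_naturales"  # placeholder
text = extract_text(url)                   # scrape and flatten the page text
summary = summarize_text(text)             # Spanish instruction-tuned summary
question = generate_question(text)         # question about a random fragment
answer = answer_question(text, question)   # retrieve context with FAISS, then generate
print(summary)
print(question)
print(answer)
print(get_answer_context())                # fragment the question was drawn from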