# Spaces: Runtime error
import os | |
import requests | |
import random | |
import torch | |
from bs4 import BeautifulSoup | |
from peft import PeftConfig, PeftModel | |
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, AutoModel | |
from datasets import DatasetDict, Dataset | |
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Sampling settings passed to the model.generate() calls in this file.
generation_config = GenerationConfig(
    temperature=0.8,
    top_p=0.75,
    top_k=40,
)

# Device string used when moving tokenized inputs and the embeddings model.
device = "cuda"

# Mutable module-level state read and written by the functions in this file.
shared = dict(
    answer_context=None,       # fragment picked by generate_question
    embeddings_dataset=None,   # dataset stored by build_faiss_index
    full_text=None,            # text stored by extract_text
)
def get_nearest_examples(question: str, k: int):
    """Return the ``k`` indexed entries nearest to ``question``.

    The question is embedded with ``get_embeddings`` and searched against the
    "embeddings" index of the dataset held in ``shared['embeddings_dataset']``,
    which must already be populated. The scores and the matching ids are
    printed for debugging.

    Returns:
        The ``samples`` object produced by the dataset lookup; callers read
        its ``'id'`` column.
    """
    print(['get_nearest_examples', 'start'])
    query_tensor = get_embeddings([question])
    query_vector = query_tensor.cpu().detach().numpy()
    indexed_dataset = shared['embeddings_dataset']
    lookup = indexed_dataset.get_nearest_examples("embeddings", query_vector, k)
    scores, samples = lookup
    print(['get_nearest_examples', 'scores and samples'])
    print(scores)
    print(samples['id'])
    print(['get_nearest_examples', 'end'])
    return samples
def get_embeddings(text):
    """Embed ``text`` with the module-level embeddings model.

    Args:
        text: Input accepted by ``emb_tokenizer``. Callers in this file pass
            either a single string or a list containing one string.

    Returns:
        A tensor with, for every input sequence, the model's last hidden
        state at sequence position 0. The tensor stays on ``device`` and
        carries no autograd history.
    """
    print(['get_embeddings', 'start'])
    encoded_input = emb_tokenizer(text,
                                  padding=True,
                                  truncation=True,
                                  return_tensors="pt")
    # Follow the module-level ``device`` (the one the embeddings model is
    # moved to) instead of a hard-coded 'cuda' string.
    encoded_input = {name: tensor.to(device)
                     for name, tensor in encoded_input.items()}
    # Inference only: skip building an autograd graph that every caller
    # would immediately discard with ``.detach()``.
    with torch.no_grad():
        model_output = emb_model(**encoded_input)
    model_output = model_output.last_hidden_state[:, 0]
    print(['get_embeddings', 'end'])
    return model_output
def build_faiss_index(text):
    """Embed every fragment of ``text`` and store a FAISS-indexed dataset.

    ``text`` is cut into fragments with ``split_text``. Each fragment becomes
    one record holding its embedding vector and its position (``'id'``) in
    the fragment list. The resulting dataset, indexed on the "embeddings"
    column, is saved in ``shared['embeddings_dataset']``. Nothing is returned.
    """
    print(['build_faiss_index', 'start'])
    records = [
        {
            "embeddings": get_embeddings(fragment).cpu().detach().numpy()[0],
            'id': position,
        }
        for position, fragment in enumerate(split_text(text))
    ]
    indexed = Dataset.from_list(records)
    indexed.add_faiss_index(column="embeddings")
    shared['embeddings_dataset'] = indexed
    print(['build_faiss_index', 'end'])
def extract_text(url: str):
    """Download ``url`` and return the text of its ``<p>`` elements.

    Paragraph texts are joined with blank lines. The result is also stored
    in ``shared['full_text']``, and any previously built embeddings index is
    dropped so that it cannot be reused for a different document.

    Args:
        url: Address of the page to fetch. ``None`` or a blank string yields
            ``''`` without any network access and without touching ``shared``.

    Returns:
        The extracted text, or ``''`` when no URL was given.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            a timeout).
    """
    print(['extract_text', 'start'])
    if url is None or url.strip() == '':
        return ''
    # Bound the request so an unresponsive host cannot hang the caller.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")
    text = '\n\n'.join(paragraph.text for paragraph in soup.find_all('p'))
    shared['full_text'] = text
    # The cached index describes the previous text; force a rebuild on the
    # next question instead of answering from stale fragments.
    shared['embeddings_dataset'] = None
    print(['extract_text', 'end'])
    return text
def split_text(text: str):
    """Split ``text`` on newlines into stripped, non-empty fragments."""
    fragments = []
    for raw_line in text.split('\n'):
        cleaned = raw_line.strip()
        if cleaned:
            fragments.append(cleaned)
    return fragments
def summarize_text(text: str):
    """Generate a summary of ``text`` with the module-level language model.

    Args:
        text: The text to summarize; it is embedded verbatim in the prompt.

    Returns:
        The decoded text of the newly generated tokens (at most 512), with
        the prompt excluded.
    """
    print(['summarize_text', 'start'])
    input_text = f'<s>Instruction: Elabora un resume del siguiente texto.\nInput: {text}\nOutput: '
    batch = tokenizer(input_text, return_tensors='pt')
    batch = batch.to(device)
    print(['summarize_text', 'generating'])
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch,
                                       max_new_tokens=512,
                                       generation_config=generation_config)
    # Drop the prompt by token count rather than by string replacement: the
    # decoded prompt is not guaranteed to match ``input_text`` character for
    # character, in which case ``str.replace`` would leave it in the output.
    prompt_length = batch['input_ids'].shape[1]
    output = tokenizer.decode(output_tokens[0][prompt_length:],
                              skip_special_tokens=True)
    print(['summarize_text', 'end'])
    return output
def generate_question(text: str):
    """Generate a question answerable from a random fragment of ``text``.

    The chosen fragment is stored in ``shared['answer_context']`` so it can
    be retrieved later with ``get_answer_context``.

    Args:
        text: Source text; it is cut into fragments with ``split_text``.

    Returns:
        The decoded text of the newly generated tokens (at most 256), with
        the prompt excluded, or ``''`` when ``text`` has no non-blank line.
    """
    print(['generate_question', 'start'])
    # Get a random section of the whole text to generate a question
    fragments = split_text(text)
    if not fragments:
        # random.choice raises IndexError on an empty sequence.
        return ''
    rnd_text = random.choice(fragments)
    shared['answer_context'] = rnd_text
    input_text = f'<s>Instruction: Dado el siguiente texto quiero que generes una pregunta cuya respuesta se encuentre en él.\nInput: {rnd_text}\nOutput: '
    batch = tokenizer(input_text, return_tensors='pt')
    # Move the inputs to the same device summarize_text uses; leaving them
    # on the CPU mismatches a model that was loaded onto the GPU.
    batch = batch.to(device)
    print(['generate_question', 'generating'])
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch,
                                       max_new_tokens=256,
                                       generation_config=generation_config)
    # Drop the prompt by token count rather than by string replacement: the
    # decoded prompt is not guaranteed to match ``input_text`` character for
    # character, in which case ``str.replace`` would leave it in the output.
    prompt_length = batch['input_ids'].shape[1]
    output = tokenizer.decode(output_tokens[0][prompt_length:],
                              skip_special_tokens=True)
    print(['generate_question', 'end'])
    return output
def get_answer_context():
    """Return the fragment last stored in ``shared['answer_context']``."""
    answer_context = shared['answer_context']
    return answer_context
def answer_question(question: str):
    """Answer ``question`` from the text stored in ``shared['full_text']``.

    The three fragments nearest to the question are retrieved through the
    embeddings index (built on first use) and placed in the prompt as
    context.

    Args:
        question: The question to answer.

    Returns:
        The decoded model output, or ``''`` when no text with at least one
        non-blank line has been loaded.
    """
    print(['answer_question', 'start'])
    full_text = shared['full_text']
    fragments = split_text(full_text) if full_text else []
    if not fragments:
        # Nothing to index or retrieve from.
        return ''
    # Explicit None check: the truthiness of a dataset depends on its length.
    if shared['embeddings_dataset'] is None:
        build_faiss_index(full_text)
    top_k_samples = get_nearest_examples(question, k=3)
    # Ids are the positions assigned by build_faiss_index, which enumerates
    # the same split_text() fragments.
    context = '\n'.join(fragments[sample_id]
                        for sample_id in top_k_samples['id'])
    input_text = (
        "<s>Instruction: Te voy a proporcionar un texto del cual deseo que me respondas una pregunta.\n"
        f"El texto es el siguiente: `{context}`\nInput: {question}\nOutput: "
    )
    batch = tokenizer(input_text, return_tensors='pt')
    # Move the inputs to the same device summarize_text uses; leaving them
    # on the CPU mismatches a model that was loaded onto the GPU.
    batch = batch.to(device)
    print(['answer_question', 'generating'])
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch,
                                       max_new_tokens=256,
                                       generation_config=generation_config)
    # NOTE(review): unlike summarize_text and generate_question, the prompt
    # is not removed from the decoded output here. Kept as-is because callers
    # may rely on it -- confirm whether that is intended.
    output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    print(['answer_question', 'end'])
    return output
def load_model(peft_model_id):
    """Load the PEFT adapter ``peft_model_id`` on top of its base model.

    The base checkpoint name is read from the adapter's ``PeftConfig``. The
    base model is loaded with ``load_in_8bit=True`` and ``device_map='auto'``,
    the tokenizer comes from the same base checkpoint, and the adapter is
    then applied with ``PeftModel.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print(['load_model', 'start'])
    peft_config = PeftConfig.from_pretrained(peft_model_id)
    base_checkpoint = peft_config.base_model_name_or_path

    print(['load_model', 'loading model'])
    base_model = AutoModelForCausalLM.from_pretrained(
        base_checkpoint,
        return_dict=True,
        load_in_8bit=True,
        device_map='auto',
    )

    print(['load_model', 'loading tokenizer'])
    base_tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

    peft_model = PeftModel.from_pretrained(base_model, peft_model_id)
    peft_model.config.use_cache = True
    print(['load_model', 'end'])
    return peft_model, base_tokenizer
def load_embeddings_model(model_ckpt:str):
    """Load the embeddings checkpoint ``model_ckpt`` and its tokenizer.

    The model is moved to the module-level ``device``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print(['load_embeddings_model', 'start'])
    print(['load_embeddings_model', 'loading tokenizer'])
    ckpt_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    print(['load_embeddings_model', 'loading model'])
    ckpt_model = AutoModel.from_pretrained(model_ckpt).to(device)
    print(['load_embeddings_model', 'end'])
    return ckpt_model, ckpt_tokenizer
# Text-generation model (PEFT adapter id) and its tokenizer, loaded at import.
model, tokenizer = load_model(
    "hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2"
)

# Embeddings model and tokenizer used by get_embeddings.
emb_model, emb_tokenizer = load_embeddings_model(
    "sentence-transformers/multi-qa-mpnet-base-dot-v1"
)