import os
import random

import requests
import torch
from bs4 import BeautifulSoup
from datasets import Dataset
from langchain.docstore.document import Document
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
from peft import PeftConfig, PeftModel
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
                          GenerationConfig, pipeline)

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Sampling configuration shared by every generate() call below.
generation_config = GenerationConfig(temperature=0.8,
                                     top_p=0.75,
                                     top_k=40)

device = 'cuda'

# Module-level state shared between the helper functions below.
shared = {
    'answer_context': None,
    'embeddings_dataset': None,
    'full_text': None,
}

text_splitter = CharacterTextSplitter()
def get_nearest_examples(question: str, k: int):
    """
    Returns the k nearest examples to a given question.

    Args:
        question (str): The input question to find nearest examples for.
        k (int): The number of nearest examples to retrieve.

    Returns:
        The k nearest examples to the given question.
    """
    print(['get_nearest_examples', 'start'])
    question_embedding = get_embeddings([question]).cpu().detach().numpy()
    embeddings_dataset = shared['embeddings_dataset']
    scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k)
    print(['get_nearest_examples', 'scores and samples'])
    print(scores)
    print(samples['id'])
    print(['get_nearest_examples', 'end'])
    return samples
def get_embeddings(text):
    """
    Encodes the given string(s) with the sentence-embedding model and returns
    the CLS-pooled hidden state (one vector per input string).
    """
    print(['get_embeddings', 'start'])
    encoded_input = emb_tokenizer(text,
                                  padding=True,
                                  truncation=True,
                                  return_tensors="pt")
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = emb_model(**encoded_input)
    # CLS pooling: keep the hidden state of the first token of each sequence.
    model_output = model_output.last_hidden_state[:, 0]
    print(['get_embeddings', 'end'])
    return model_output
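# A minimal usage sketch (assuming the MiniLM checkpoint loaded at the bottom of
# this file, whose hidden size is 384):
#
#   vecs = get_embeddings(["hola mundo", "adiós"])
#   vecs.shape  # torch.Size([2, 384]) -- one CLS-pooled vector per input string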
def build_faiss_index(text):
    """
    Builds a FAISS index for the given text.

    Args:
        text (str): The input text to build a FAISS index for.

    Returns:
        None.
    """
    print(['build_faiss_index', 'start'])
    text_list = split_text(text)
    emb_list = []
    for i, item in enumerate(text_list):
        emb_list.append({
            "embeddings": get_embeddings(item).cpu().detach().numpy()[0],
            'id': i
        })
    dataset = Dataset.from_list(emb_list)
    dataset.add_faiss_index(column="embeddings")
    shared['embeddings_dataset'] = dataset
    print(['build_faiss_index', 'end'])
def extract_text(url: str):
    """
    Extracts the text content from a given URL and returns it as a string.

    Args:
        url (str): The URL to extract text content from.

    Returns:
        str: The text content extracted from the URL, or an empty string if the URL is invalid.
    """
    print(['extract_text', 'start'])
    if url is None or url.strip() == '':
        return ''
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    text = '\n\n'.join(map(lambda p: p.text, soup.find_all('p')))
    shared['full_text'] = text
    print(['extract_text', 'end'])
    return text
def split_text(text: str):
    """
    Splits a given text into a list of non-empty, stripped lines.

    Args:
        text (str): The input text to split into lines.

    Returns:
        List[str]: The non-empty lines in the input text.
    """
    lines = text.split('\n')
    lines = [line.strip() for line in lines if line.strip()]
    return lines
def remove_prompt(text: str) -> str:
    """
    Removes the prompt from a given text and returns the resulting text.

    Args:
        text (str): The input text to remove the prompt from.

    Returns:
        str: The input text with the prompt removed, or the original text if the prompt is not found.
    """
    output_prompt = 'Output: '
    try:
        idx = text.index(output_prompt)
        res = text[idx + len(output_prompt):].strip()
        res = res.replace('Input: ', '')
    except ValueError:
        res = text
    return res
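# For example (hypothetical model output, shown only to illustrate the prompt
# format this helper expects):
#
#   remove_prompt('<s>Instruction: ...\nInput: ...\nOutput: Resumen generado.')
#   # -> 'Resumen generado.'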
def summarize_text(text: str) -> str:
    """
    Generates a summary of the given text using a pre-trained language model.

    Args:
        text (str): The input text to generate a summary for.

    Returns:
        str: The generated summary for the input text.
    """
    print(['summarize_text', 'start'])
    print(['summarize_text', 'splitting text'])
    texts = text_splitter.split_text(text)
    docs = [Document(page_content=t) for t in texts]
    # Spanish prompt: "Write a summary of the following text."
    prompts = [f'<s>Instruction: Elabora un resumen del siguiente texto.\nInput: {d.page_content}\nOutput: '
               for d in docs]
    print(['summarize_text', 'generating'])
    cleaned_summaries = [remove_prompt(s['generated_text'])
                         for s in pipe(prompts)]
    summaries = '\n\n'.join(cleaned_summaries)
    print(['summarize_text', 'end'])
    return summaries
def summarize_text_v1(text: str):
    """
    Single-pass variant of summarize_text that calls model.generate directly
    instead of going through the pipeline.
    """
    print(['summarize_text_v1', 'start'])
    input_text = f'<s>Instruction: Elabora un resumen del siguiente texto.\nInput: {text}\nOutput: '
    batch = tokenizer(input_text, return_tensors='pt')
    batch = batch.to(device)
    print(['summarize_text_v1', 'generating'])
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch,
                                       max_new_tokens=512,
                                       generation_config=generation_config)
    output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    output = output.replace(input_text, '')
    print(['summarize_text_v1', 'end'])
    return output
def generate_question(text: str):
    """
    Generates a question based on a random section of the input text using a pre-trained language model.

    Args:
        text (str): The input text to generate a question for.

    Returns:
        str: The generated question for the input text.
    """
    print(['generate_question', 'start'])
    # Get a random section of the whole text to generate a question
    fragments = split_text(text)
    rnd_text = random.choice(fragments)
    shared['answer_context'] = rnd_text
    # Spanish prompt: "Given the following text, generate a question whose answer can be found in it."
    input_text = f'<s>Instruction: Dado el siguiente texto quiero que generes una pregunta cuya respuesta se encuentre en él.\nInput: {rnd_text}\nOutput: '
    batch = tokenizer(input_text, return_tensors='pt')
    batch = batch.to(device)
    print(['generate_question', 'generating'])
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch,
                                       max_new_tokens=256,
                                       generation_config=generation_config)
    output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    output = output.replace(input_text, '')
    print(['generate_question', 'end'])
    return output
def get_answer_context():
    return shared['answer_context']
def answer_question(question: str):
    """
    Generates an answer to the given question based on a pre-trained language model and a pre-built FAISS index.

    Args:
        question (str): The question to generate an answer for.

    Returns:
        str: The generated answer for the question.
    """
    print(['answer_question', 'start'])
    full_text = shared['full_text']
    # Build the FAISS index lazily the first time a question is asked.
    if not shared['embeddings_dataset']:
        build_faiss_index(full_text)
    top_k_samples = get_nearest_examples(question, k=3)
    index_text = {}
    for i, t in enumerate(split_text(full_text)):
        index_text[i] = t
    context = '\n'.join([index_text[idx] for idx in top_k_samples['id']])
    # Spanish prompt: "I am going to give you a text about which I want you to
    # answer a question. The text is the following: `{context}`"
    input_text = f"""<s>Instruction: Te voy a proporcionar un texto del cual deseo que me respondas una pregunta.
El texto es el siguiente: `{context}`\nInput: {question}\nOutput: """
    batch = tokenizer(input_text, return_tensors='pt')
    batch = batch.to(device)
    print(['answer_question', 'generating'])
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch,
                                       max_new_tokens=256,
                                       generation_config=generation_config)
    output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    output = output.replace(input_text, '')
    print(['answer_question', 'end'])
    return output
def load_model(peft_model_id):
    """
    Loads the base causal LM in 8-bit, applies the LoRA adapter given by
    `peft_model_id`, and returns the model together with its tokenizer.
    """
    print(['load_model', 'start'])
    config = PeftConfig.from_pretrained(peft_model_id)
    print(['load_model', 'loading model'])
    model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                                 return_dict=True,
                                                 load_in_8bit=True,
                                                 device_map='auto')
    print(['load_model', 'loading tokenizer'])
    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    model = PeftModel.from_pretrained(model, peft_model_id)
    model.config.use_cache = True
    print(['load_model', 'end'])
    return model, tokenizer
def load_embeddings_model(model_ckpt: str):
    """
    Loads the sentence-embedding model and tokenizer used to build the FAISS index.
    """
    print(['load_embeddings_model', 'start'])
    print(['load_embeddings_model', 'loading tokenizer'])
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    print(['load_embeddings_model', 'loading model'])
    model = AutoModel.from_pretrained(model_ckpt)
    model = model.to(device)
    print(['load_embeddings_model', 'end'])
    return model, tokenizer
# Models trained with LoRA
# - hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2
# - hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t14000-v1400-v1
model, tokenizer = load_model("hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t14000-v1400-v1")

pipe = pipeline("text2text-generation", model=model,
                tokenizer=tokenizer, max_new_tokens=100)
llm = HuggingFacePipeline(pipeline=pipe)

# Sentence Transformers models
# - paraphrase-multilingual-MiniLM-L12-v2
# - multi-qa-mpnet-base-dot-v1
emb_model, emb_tokenizer = load_embeddings_model("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
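# A minimal end-to-end sketch of how the helpers above fit together. The URL is
# only a placeholder; the UI layer that normally drives these calls (e.g. a
# Gradio app) is not part of this file and is not shown here.
if __name__ == '__main__':
    demo_url = 'https://es.wikipedia.org/wiki/Alan_Turing'  # placeholder article
    article = extract_text(demo_url)        # fetch the page and cache its text
    print(summarize_text(article))          # chunked summary via the pipeline
    question = generate_question(article)   # question about a random fragment
    print(question)
    print(answer_question(question))        # FAISS retrieval + generation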