from bs4 import BeautifulSoup
import requests

def getText(url: str):
    # Fetch the page; bail out early if the request fails.
    response = requests.get(url)
    if response.status_code == 200:
        html_content = response.content
    else:
        print("[INFO] couldn't access website data, try again")
        return
    # Parse the HTML and gather the text of every <p> element.
    soup = BeautifulSoup(html_content, 'html.parser')
    text_elements = soup.find_all('p')
    scraped_text = ' '.join(element.get_text() for element in text_elements)
    # Guard against pages too large for the model's context window.
    if len(scraped_text) > 20000:
        print("[ERROR] page too large to perform qna")
        return
    return scraped_text
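
A quick sanity check of the scraper; the URL below is a hypothetical example, not one taken from the original code:

text = getText("https://en.wikipedia.org/wiki/Web_scraping")  # hypothetical example URL
if text:
    print(text[:200])  # preview the first 200 characters of scraped text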
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load FLAN-T5 Large once at module level so repeated calls reuse the weights.
model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-large')
tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')
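
A minimal smoke test, assuming you just want to confirm the checkpoint loaded; the prompt is an arbitrary illustration, not part of the original:

prompt = "Translate to German: Hello, how are you?"  # arbitrary test prompt
ids = tokenizer(prompt, return_tensors='pt').input_ids
print(tokenizer.decode(model.generate(ids, max_new_tokens=20)[0], skip_special_tokens=True))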
def getAnswer(url: str, question: str):
    # Scrape the page text to serve as the QA context.
    context = getText(url)
    if context is None:
        return  # scraping failed or the page was too large; a message was already printed
    # FLAN-T5 follows instruction-style prompts; pack context and question into one string.
    inputs = tokenizer(f"context : {context}, question : {question}", return_tensors='pt').input_ids
    outputs = model.generate(
        inputs,
        min_length=10,
        max_new_tokens=600,
        length_penalty=1,
        num_beams=3,
        no_repeat_ngram_size=3,
        do_sample=True,  # without this, temperature/top_k/top_p are ignored by generate()
        temperature=0.7,
        top_k=110,
        top_p=0.8,
        repetition_penalty=2.1,
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer
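
An end-to-end usage sketch; both the URL and the question are illustrative assumptions rather than values from the original:

answer = getAnswer(
    "https://en.wikipedia.org/wiki/Web_scraping",  # hypothetical page to query
    "What is web scraping used for?",              # hypothetical question
)
print(answer)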