Th3BossC's picture
tried something
36c2546
from bs4 import BeautifulSoup
import requests
def getText(url: str, timeout: float = 10.0, max_chars: int = 20000):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.
        max_chars: Upper bound on the length of the extracted text; longer
            pages are rejected.

    Returns:
        The paragraph texts joined by single spaces, or ``None`` when the
        page could not be fetched (network error, invalid URL, non-200
        status) or when the extracted text is longer than ``max_chars``.
        A message is printed in each failure case.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection problems, timeouts and malformed URLs are reported the
        # same way as a bad status code instead of crashing the caller.
        print("[INFO] couldn't access website data, try again")
        return None

    if response.status_code != 200:
        print("[INFO] couldn't access website data, try again")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    scraped_text = ' '.join(
        paragraph.get_text() for paragraph in soup.find_all('p')
    )

    if len(scraped_text) > max_chars:
        print("[ERROR] page too large to perform qna")
        return None
    return scraped_text
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Hugging Face checkpoint shared by the model and its tokenizer; keeping it
# in one place guarantees the two always come from the same checkpoint.
_CHECKPOINT = 'google/flan-t5-large'

# Both objects are loaded once at import time and reused by every call.
model = T5ForConditionalGeneration.from_pretrained(_CHECKPOINT)
tokenizer = T5Tokenizer.from_pretrained(_CHECKPOINT)
def getAnswer(url: str, question: str):
    """Answer ``question`` using the text scraped from ``url``.

    The page text returned by ``getText`` is combined with the question
    into a single prompt, which the module-level ``model`` answers using
    beam search.

    Args:
        url: Page whose paragraph text serves as the context.
        question: Question to answer from that context.

    Returns:
        The decoded answer string. If no text could be obtained from the
        page, a short explanatory message is returned instead of running
        the model.
    """
    context = getText(url)
    if not context:
        # getText returns None on failure; formatting that into the prompt
        # would make the model answer from the literal text "None".
        return "Couldn't extract usable text from the page, so the question can't be answered."

    prompt = f"context : {context}, question : {question}"
    inputs = tokenizer(prompt, return_tensors='pt').input_ids

    # Beam search is deterministic. The sampling knobs (temperature, top_k,
    # top_p) are left out: without do_sample=True they are ignored and only
    # trigger warnings. Pass them together with do_sample=True to sample.
    outputs = model.generate(
        inputs,
        min_length=10,
        max_new_tokens=600,
        length_penalty=1,
        num_beams=3,
        no_repeat_ngram_size=3,
        repetition_penalty=2.1,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)