Spaces:
Runtime error
Runtime error
import gradio as gr | |
from transformers import pipeline | |
from datetime import datetime | |
import pandas as pd | |
import requests | |
from bs4 import BeautifulSoup | |
import re | |
# Pages to scrape, one entry per (benefit, topic) pair.
# "link" is a single string; when a topic spans several pages the URLs are
# packed into it separated by commas (it is split on "," before fetching).
benefits = [
    {
        "benefitName": "Universal Credit",
        "coreName": "Overview",
        "link": "https://www.gov.uk/universal-credit/",
    },
    {
        "benefitName": "Universal Credit",
        "coreName": "Eligibility",
        "link": "https://www.gov.uk/universal-credit/eligibility",
    },
    {
        "benefitName": "Universal Credit",
        "coreName": "how much can I get",
        # Two pages, joined with a comma inside one string.
        "link": (
            "https://www.gov.uk/universal-credit/what-youll-get,"
            "https://www.gov.uk/universal-credit/how-youre-paid"
        ),
    },
    {
        "benefitName": "Universal Credit",
        "coreName": "how to apply/claim",
        "link": "https://www.gov.uk/universal-credit/how-to-claim",
    },
]
def requestPage(link, timeout=30):
    """Download *link* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A ``BeautifulSoup`` document built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.RequestException: If the connection fails, the request times
            out, or the server answers with a 4xx/5xx status.
    """
    page = requests.get(link, timeout=timeout)
    # Fail loudly on an error status so an error page is never scraped as if
    # it were real content.
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")
def scrapeTable(table):
    """Flatten an HTML table into text, one line per body row.

    Each cell is rendered as ``"<column header>: <cell text>"`` and the cells
    of a row are joined with single spaces; every row ends with a newline.

    Args:
        table: The ``<table>`` element (a BeautifulSoup tag).

    Returns:
        The rendered rows as one string, or ``""`` when the table has no rows.
        A cell with no matching header (no ``<thead>``, or a row wider than
        the header) is emitted as bare text without a label.
    """
    thead = table.thead
    header_row = thead.tr if thead is not None else None
    if header_row is None:
        columns = []
    else:
        # Direct children only: a recursive search would also return tags
        # nested inside a header cell and shift every label that follows.
        columns = [cell.text.strip() for cell in header_row.find_all(recursive=False)]

    tbody = table.tbody
    if tbody is not None:
        rows = tbody.find_all(recursive=False)
    else:
        # No <tbody> wrapper: take the rows sitting directly under <table>.
        rows = table.find_all("tr", recursive=False)

    lines = []
    for row in rows:
        values = [cell.text.strip() for cell in row.find_all(recursive=False)]
        labelled = [
            "{}: {}".format(columns[index], value) if index < len(columns) else value
            for index, value in enumerate(values)
        ]
        lines.append(" ".join(labelled) + "\n")
    return "".join(lines)
def scrapePage(page):
    """Extract the readable text of one guide page.

    The page title (``h1.part-title``) comes first, followed by a blank line,
    then every direct child of the ``div.gem-c-govspeak`` body in document
    order, each terminated by a newline:

    * ``<ul>``: items are joined with the ``{;}`` marker and the list is
      prefixed with the same marker.
    * ``<table>``: rendered by ``scrapeTable``.
    * anything else: its stripped text.

    Args:
        page: Parsed document (a BeautifulSoup tree) to read from.

    Returns:
        The collected text. Missing pieces are skipped rather than raising:
        ``""`` when the ``#guide-contents`` container is absent, and only the
        title when the body container is absent.
    """
    guide = page.find('div', {"id": "guide-contents"})
    if guide is None:
        return ""

    parts = []

    heading = guide.find('h1', {"class": "part-title"})
    if heading is not None:
        title = heading.text.strip()
        parts.append(title + "\n\n")
        # Progress output so the start-up log shows which page was scraped.
        print(title)

    body = guide.find('div', {"class": "gem-c-govspeak"})
    if body is None:
        return "".join(parts)

    # Direct children only, so nested markup is not visited a second time.
    for frag in body.find_all(recursive=False):
        text = frag.text.strip()
        if frag.name == 'ul':
            # Collapse the newlines between list items into the {;} marker.
            parts.append("{;}" + re.sub(r'\n+', "{;}", text))
        elif frag.name == 'table':
            parts.append(scrapeTable(frag))
        else:
            parts.append(text)
        parts.append("\n")

    return "".join(parts)
# Scrape every configured topic once at start-up and cache the text on its
# entry under "context" (with its length under "contextLen").
for benefit in benefits:
    # One entry may pack several comma-separated URLs into "link".
    links = benefit['link'].split(',')
    print(benefit['benefitName'], benefit['coreName'], len(links))
    context = ""
    for link in links:
        page = requestPage(link)
        context += scrapePage(page)
    benefit['context'] = context
    benefit['contextLen'] = len(context)
    print("--------------------------------")

# Distinct benefit and topic names; sorted so the order is stable across runs
# (iterating a set of strings is not).
benefitsClasses = sorted({benefit['benefitName'] for benefit in benefits})
core4Classes = sorted({benefit['coreName'] for benefit in benefits})

# All scraped text, used as the passage the question-answering model reads.
combinedContext = "\n".join(benefit['context'] for benefit in benefits)

question_answerer = pipeline("question-answering")


def greet(question):
    """Answer *question* from the scraped benefit pages.

    This is the handler wired into the Gradio interface below; the interface
    referenced ``greet`` but the function was never defined, so the app
    failed at start-up with ``NameError``.

    Args:
        question: Free-text question typed by the user.

    Returns:
        The answer span extracted by the question-answering pipeline, or a
        short notice when the question is blank or nothing was scraped.
    """
    if not question or not question.strip():
        return "Please enter a question."
    if not combinedContext.strip():
        return "No benefit information is available to answer from."
    result = question_answerer(question=question, context=combinedContext)
    return result["answer"]


iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()