Spaces:
Runtime error
Runtime error
File size: 2,666 Bytes
3962a20 f225398 3962a20 f225398 3962a20 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import gradio as gr
from transformers import pipeline
from datetime import datetime
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
# Pages to scrape, one entry per (benefit, topic) pair. A "link" value may
# hold several URLs separated by commas; they are split apart by the consumer.
benefits = [
    {"benefitName": benefit_name, "coreName": core_name, "link": link}
    for benefit_name, core_name, link in (
        (
            "Universal Credit",
            "Overview",
            "https://www.gov.uk/universal-credit/",
        ),
        (
            "Universal Credit",
            "Eligibility",
            "https://www.gov.uk/universal-credit/eligibility",
        ),
        (
            "Universal Credit",
            "how much can I get",
            "https://www.gov.uk/universal-credit/what-youll-get,https://www.gov.uk/universal-credit/how-youre-paid",
        ),
        (
            "Universal Credit",
            "how to apply/claim",
            "https://www.gov.uk/universal-credit/how-to-claim",
        ),
    )
]
def requestPage(link, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup document.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body parsed with the "html.parser" backend.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(link, timeout=timeout)
    # Stop here on an HTTP error instead of handing an error page to the
    # scraper, where the failure would surface as a confusing parse problem.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")
def scrapeTable(table):
    """Flatten an HTML table into one "heading: value" line per body row.

    Args:
        table: Parsed ``<table>`` element. It must contain a ``<thead>``
            with a header row and a ``<tbody>``.

    Returns:
        A string with one newline-terminated line per direct child of
        ``<tbody>``. Each line holds the row's cells as "heading: value"
        pairs separated by single spaces. Empty when the body has no rows.
    """
    # Only the direct children of the header row are column headings. A
    # recursive search would also return tags nested inside a heading cell,
    # which adds bogus columns and shifts every label after them.
    columns = [
        heading.text.strip()
        for heading in table.thead.tr.find_all(recursive=False)
    ]

    lines = []
    for row in table.tbody.find_all(recursive=False):
        pairs = []
        for index, cell in enumerate(row.find_all(recursive=False)):
            value = cell.text.strip()
            if index < len(columns):
                pairs.append(f"{columns[index]}: {value}")
            else:
                # More cells than headings: keep the value rather than fail.
                pairs.append(value)
        lines.append(" ".join(pairs) + "\n")
    return "".join(lines)
def scrapePage(page):
    """Turn a parsed guide page into a plain-text corpus.

    The page is expected to contain ``<div id="guide-contents">`` holding an
    ``<h1 class="part-title">`` and a ``<div class="gem-c-govspeak">`` whose
    direct children are the content fragments.

    Args:
        page: Parsed document to search (anything offering ``find``).

    Returns:
        The title followed by a blank line, then one newline-terminated
        chunk per fragment. ``<ul>`` fragments have their items joined and
        prefixed with the "{;}" marker, ``<table>`` fragments are rendered
        by ``scrapeTable``, and anything else contributes its stripped text.

    Raises:
        ValueError: If one of the expected containers is missing.
    """
    guide = page.find('div', {"id": "guide-contents"})
    if guide is None:
        raise ValueError('page has no <div id="guide-contents"> element')

    heading = guide.find('h1', {"class": "part-title"})
    if heading is None:
        raise ValueError('page has no <h1 class="part-title"> element')
    title = heading.text.strip()
    print(title)

    body = guide.find('div', {"class": "gem-c-govspeak"})
    if body is None:
        raise ValueError('page has no <div class="gem-c-govspeak"> element')

    parts = [title, "\n\n"]
    for frag in body.find_all(recursive=False):
        if frag.name == 'ul':
            # Collapse the line breaks between list items into the marker.
            items = re.sub(r'\n+', "{;}", frag.text.strip())
            parts.append("{;}" + items)
        elif frag.name == 'table':
            parts.append(scrapeTable(frag))
        else:
            parts.append(frag.text.strip())
        # NOTE(review): the original indentation was lost; the newline is
        # assumed to follow every fragment, not only the fallback branch.
        parts.append("\n")
    return "".join(parts)
# Download and scrape every page listed for each entry, then store the
# combined text and its length back on the entry.
# NOTE(review): the original indentation was lost; the separator line is
# assumed to be printed once per entry.
for entry in benefits:
    urls = entry['link'].split(',')
    print(entry['benefitName'], entry['coreName'], len(urls))
    scraped = "".join(scrapePage(requestPage(url)) for url in urls)
    entry['context'] = scraped
    entry['contextLen'] = len(scraped)
    print("--------------------------------")
# Distinct benefit names and topic names present in `benefits`.
# They pass through a set, so their order is arbitrary.
benefitsClasses = list({entry['benefitName'] for entry in benefits})
core4Classes = list({entry['coreName'] for entry in benefits})
# contexts
#question_answerer = pipeline("question-answering")


def greet(name):
    """Return a greeting for *name*.

    Placeholder handler for the text-to-text interface below, which refers
    to ``greet``. Without a definition the script stops with a NameError
    before the interface can start.
    """
    return "Hello " + name + "!!"


iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()