import re
from datetime import datetime

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

benefits = [
  {"benefitName": "Universal Credit", "coreName": "Overview", "link": "https://www.gov.uk/universal-credit/"},
  {"benefitName": "Universal Credit", "coreName": "Eligibility", "link": "https://www.gov.uk/universal-credit/eligibility"},
  {"benefitName": "Universal Credit", "coreName": "how much can I get​", "link": "https://www.gov.uk/universal-credit/what-youll-get,https://www.gov.uk/universal-credit/how-youre-paid"},
  {"benefitName": "Universal Credit", "coreName": "how to apply/claim", "link": "https://www.gov.uk/universal-credit/how-to-claim"},
]
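# Note: the "how much can I get" entry packs two URLs into one comma-separated
# "link" string; they are split apart again before scraping.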


def requestPage(link):
  # Fetch the page and parse it; the timeout stops a dead link from hanging the
  # run, and raise_for_status surfaces HTTP errors instead of scraping an error page
  page = requests.get(link, timeout=30)
  page.raise_for_status()

  soup = BeautifulSoup(page.content, "html.parser")
  return soup

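# Illustrative check:
#   soup = requestPage("https://www.gov.uk/universal-credit/")
#   print(soup.title.text)
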
def scrapeTable(table):
  # Header labels come from the first row of <thead>
  columns = [col.text.strip() for col in table.thead.tr.find_all(recursive=False)]

  # Flatten each body row into "header: cell" pairs so the table reads as text
  rows = table.tbody.find_all(recursive=False)
  clean_rows = ""

  for row in rows:
    cells = row.find_all(recursive=False)
    elements = ["{}: {}".format(columns[index], cell.text.strip())
                for index, cell in enumerate(cells)]
    clean_rows += " ".join(elements) + "\n"

  return clean_rows
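
# Each row flattens to one line of "header: cell" pairs, e.g. (illustrative):
#   "Header A: value Header B: value"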



def scrapePage(page):
  # Collect the visible text of one guide page into a single string
  corpus = ""

  # gov.uk guide pages keep their body inside div#guide-contents
  content = page.find('div', {"id": "guide-contents"})

  title = content.find('h1', {"class": "part-title"}).text.strip()
  corpus += title + "\n\n"
  print(title)

  content = content.find('div', {"class": "gem-c-govspeak"})

  # Walk the top-level fragments so lists and tables get special handling
  fragments = content.find_all(recursive=False)
  for frag in fragments:
    text = frag.text.strip()
    if frag.name == 'ul':
      # {;} is an in-band marker so bullet boundaries survive the flattening;
      # any token unlikely to appear in page text would work equally well
      clean = re.sub(r'\n+', "{;}", text)
      corpus += "{;}" + clean
    elif frag.name == 'table':
      corpus += scrapeTable(frag)
    else:
      corpus += text

    corpus += "\n"

  return corpus


for benefit in benefits:
  links = benefit['link'].split(',')
  print(benefit['benefitName'], benefit['coreName'], len(links))

  context = ""
  for link in links:
    page = requestPage(link)
    context += scrapePage(page)

  benefit['context'] = context
  benefit['contextLen'] = len(context)
  print("--------------------------------")


# Distinct benefit names and "core" question types (usable later as class labels)
benefitsClasses = sorted({b['benefitName'] for b in benefits})
core4Classes = sorted({b['coreName'] for b in benefits})

print(benefitsClasses, core4Classes)
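
# Possible next step (a sketch, not wired in below): route each question to the
# most relevant context with zero-shot classification before running QA, e.g.
#   classifier = pipeline("zero-shot-classification")
#   topBenefit = classifier(question, candidate_labels=benefitsClasses)["labels"][0]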

question_answerer = pipeline("question-answering")

def answerQuestion(question):
  # Simplest routing for now: extractive QA over all scraped contexts joined together
  context = " ".join(b.get('context', '') for b in benefits)
  return question_answerer(question=question, context=context)["answer"]

iface = gr.Interface(fn=answerQuestion, inputs="text", outputs="text")
iface.launch()
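
# launch() prints a local URL to open the demo; in a hosted notebook, passing
# share=True to launch() gives a temporary public link.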