Spaces:
Sleeping
Sleeping
import gradio as gr | |
import urllib.request | |
import requests | |
import bs4 | |
import lxml | |
def find_all(url,q=None,num=None):
    """Fetch *url* and list the name of every tag in the parsed page.

    Title details (the tag, its name, its text and its parent's name) and
    the full tag-name list are printed to stdout as a debugging aid.

    Args:
        url: Address handed to ``urllib.request.urlopen``.
        q: Unused by this function.
        num: Unused by this function.

    Returns:
        A one-element list whose only item is the list of tag names, in
        the order ``soup.find_all()`` yields them.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        source = response.read()
    soup = bs4.BeautifulSoup(source, 'lxml')

    # A page without <title> gives soup.title == None; skip the title
    # diagnostics in that case instead of failing on ``None.name``.
    title = soup.title
    if title is not None:
        print(title)
        print(title.name)
        print(title.string)
        print(title.parent.name)

    # Build the list once and reuse it for both the log line and the result.
    tag_names = [tag.name for tag in soup.find_all()]
    print(tag_names)
    return [tag_names]
def find_it(url,q=None,num=None):
    """Fetch *url* and describe every tag named *q* found in the page.

    The parent tag name of each match and the ``href`` of every ``<a>`` in
    the page are printed to stdout as a debugging aid.

    Args:
        url: Address handed to ``urllib.request.urlopen``.
        q: Tag name to search for; it is also used as the key under which
            each match's ``.string`` is stored.
        num: Optional attribute name. When non-empty, the value of that
            attribute on each match is reported as ``"additional"``;
            otherwise ``"additional"`` is the empty string.

    Returns:
        A tuple ``(out, out_l)``. ``out`` holds one single-item list per
        match, wrapping a dict with the keys *q*, ``"additional"``,
        ``"parent"``, ``"previous"``, ``"first-child"`` and ``"content"``.
        ``out_l`` holds, per match, its ``.string`` or, when that is
        ``None``, the ``"additional"`` value.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        source = response.read()
    soup = bs4.BeautifulSoup(source, 'lxml')

    out = []
    out_l = []
    for p in soup.find_all(f'{q}'):
        # Only query an attribute when one was actually requested; a plain
        # ``num != ""`` test would look up an attribute called "None" when
        # the argument is left at its default.
        z = p.get(f'{num}') if num else ""
        text = p.string
        previous = p.previous
        out.append([{
            q: text,
            "additional": z,
            "parent": p.parent.name,
            "previous": list(previous) if previous is not None else [],
            "first-child": [child.name for child in p.children],
            # NOTE(review): this is the bs4 element itself, not a string;
            # confirm the consumer can serialize it.
            "content": p,
        }])
        out_l.append(text if text is not None else z)
        print(p.parent.name)

    # Separate loop name so the ``url`` parameter is not shadowed.
    for link in soup.find_all('a'):
        print(link.get('href'))
    return out, out_l
def find_it2(url):
    """Fetch *url* with ``requests`` and list the text of every link.

    Args:
        url: Address handed to ``requests.get``.

    Returns:
        On success, a string that starts with ``"URL Links:"`` followed by
        the text of each ``<a>`` element, one per line. If the request
        fails or the server answers with an error status, the caught
        exception is printed and returned instead.
    """
    try:
        # ``requests.get`` takes no a1/q2/q3 keywords; passing them raised
        # TypeError before any request was made.
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        print (e)
        return e
    # Only ``bs4`` is imported in this file, so qualify the class name.
    soup = bs4.BeautifulSoup(response.content, 'lxml')
    # The header goes once at the top; it is not the separator between links.
    return 'URL Links:\n' + '\n'.join(p.text for p in soup.find_all('a'))
# UI layout: a row of inputs, a row of buttons, and a row of outputs.
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column(scale=1):
            # Passed to the callbacks as ``url``.
            inp = gr.Textbox(label="URL")
        with gr.Column(scale=2):
            # Passed as ``q``: the tag name find_it searches for.
            q = gr.Textbox(label="Tag name", value="p")
        with gr.Column(scale=2):
            # Passed as ``num``: the attribute find_it reads from each match.
            num = gr.Textbox(label="Attribute name (optional)")
    with gr.Row():
        all_btn = gr.Button("Load")
        find_btn = gr.Button("Find")
    with gr.Row():
        rawp = gr.JSON(label="All tag names")
        outp = gr.JSON(label="Matches")
        outl = gr.Textbox(label="Match text")
    # "Load" lists every tag in the page; "Find" reports the matching tags.
    all_btn.click(find_all, [inp, q, num], [rawp])
    find_btn.click(find_it, [inp, q, num], [outp, outl])
app.launch()