Spaces:
Running
Running
File size: 1,699 Bytes
3919e25 a648a91 4aeb6eb 3919e25 a648a91 3919e25 f05905d 1af1718 f05905d 3b77cd2 3ff2217 3b77cd2 3919e25 13951ed aaca73e 13951ed 7c1d83e 1a8e0e1 a648a91 1a8e0e1 d1bb951 13951ed 3919e25 b9c90b4 13951ed 3919e25 13951ed 8f70505 d1bb951 8f70505 502b110 3919e25 563ca5d b78c657 3919e25 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
import gradio as gr
from bs4 import BeautifulSoup as bs
from pypdf import PdfReader
#import html5lib
#import copy
import requests
#from IPython.display import IFrame
def scrape(instring):
    """Return a Gradio HTML update that embeds a fixed sample PDF.

    Args:
        instring: Not used by this function; the embedded document is
            always the same hard-coded URL.

    Returns:
        The value produced by ``gr.HTML.update`` for the ``<object>`` markup.
    """
    sample_pdf_markup = '''<object data="https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" width="100%" height="500px"></object>'''
    return gr.HTML.update(sample_pdf_markup)
def scrape1(instring):
    """Return a Gradio HTML update that embeds the PDF located at ``instring``.

    The URL is HTML-escaped and written as a quoted ``data`` attribute so
    that text typed by the user cannot break out of the ``<object>`` tag.
    No network request is made here: the returned markup depends only on
    the URL string, and the browser fetches the document itself.

    Args:
        instring: URL of the PDF to display. ``None`` is treated as an
            empty string.

    Returns:
        The value produced by ``gr.HTML.update`` for the ``<object>`` markup.
    """
    # Function-scope import so this block does not depend on the file header.
    from html import escape

    raw_url = "" if instring is None else str(instring).strip()
    # quote=True also escapes " and ', which is required inside an attribute.
    safe_url = escape(raw_url, quote=True)
    markup = (
        f'<object data="{safe_url}" type="application/pdf" '
        'width="100%" height="500px"></object>'
    )
    return gr.HTML.update(markup)
def scrape0(instring):
    """Download ``instring`` to ``metadata.pdf`` and return an embed for it.

    The response body is streamed to disk in fixed-size chunks with a single
    HTTP request, and the connection is released when the download finishes.

    Args:
        instring: URL of the document to download.

    Returns:
        The value produced by ``gr.HTML.update`` for an ``<object>`` tag
        whose ``data`` attribute is ``metadata.pdf``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
    """
    chunk_size = 2000
    url = f'{instring}'
    # A timeout prevents a stalled server from blocking a queue worker forever.
    with requests.get(url, stream=True, timeout=30) as response:
        # Fail early instead of saving an error page under a .pdf name.
        response.raise_for_status()
        with open('metadata.pdf', 'wb') as fd:
            for chunk in response.iter_content(chunk_size):
                fd.write(chunk)
    # NOTE(review): the tag refers to the file by a bare relative path;
    # confirm the Gradio server actually serves it to the browser.
    return gr.HTML.update(f'''<object data=metadata.pdf type="application/pdf" width="100%" height="500px">''')
# UI layout: a textbox, a trigger button, and an HTML pane for the result.
with gr.Blocks() as app:
    inp = gr.Textbox()
    go_btn = gr.Button()
    outp = gr.HTML()
    # On click, pass the textbox value to scrape0 and render what it returns
    # in the HTML pane.
    go_btn.click(fn=scrape0, inputs=inp, outputs=outp)
# Enable request queuing, then start the server.
app.queue(concurrency_count=10).launch()