Spaces:

Omnibus
/

pdf-reader

Running

File size: 1,699 Bytes

3919e25
a648a91
4aeb6eb
 
 
3919e25
 
a648a91
3919e25
 
f05905d
1af1718
f05905d
 
3b77cd2
 
 
 
 
 
 
 
 
 
 
3ff2217
3b77cd2
3919e25
13951ed
aaca73e
13951ed
7c1d83e
1a8e0e1
a648a91
1a8e0e1
d1bb951
13951ed
 
 
3919e25
b9c90b4
13951ed
3919e25
 
 
 
13951ed
8f70505
d1bb951
8f70505
502b110
3919e25
 
563ca5d
b78c657
3919e25

import html

import gradio as gr
import requests
from bs4 import BeautifulSoup as bs
from pypdf import PdfReader

#import html5lib
#import copy
#from IPython.display import IFrame

def scrape(instring):
    """Return an HTML update that embeds a fixed sample PDF.

    ``instring`` is accepted but not used: the embedded document is always
    the same hard-coded dummy PDF URL.
    """
    sample_markup = (
        '<object data="https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"'
        ' width="100%" height="500px"></object>'
    )
    return gr.HTML.update(sample_markup)

def scrape1(instring):
    """Return an HTML update that embeds the PDF located at ``instring``.

    Args:
        instring: URL of the PDF to embed, as entered by the user.

    Returns:
        The value of ``gr.HTML.update`` carrying an ``<object>`` element
        whose ``data`` attribute is the given URL.
    """
    # The URL is user-supplied text that ends up inside markup, so escape it
    # and quote the attribute; otherwise a value containing spaces, quotes or
    # ``>`` would break out of the attribute and inject arbitrary HTML.
    safe_url = html.escape(str(instring), quote=True)

    # No HTTP request is made here: the browser loads the URL itself, so
    # fetching and parsing the page server-side would only add latency and
    # extra failure modes without affecting the returned markup.
    return gr.HTML.update(
        f'<object data="{safe_url}" type="application/pdf" '
        'width="100%" height="500px"></object>'
    )
def scrape0(instring):
    """Download the document at ``instring`` and return markup embedding it.

    The response body is streamed to ``metadata.pdf`` in the current working
    directory, and the returned HTML update contains an ``<object>`` element
    whose ``data`` attribute is that file name.

    Args:
        instring: URL of the document to download.

    Returns:
        The value of ``gr.HTML.update`` carrying the ``<object>`` markup.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    chunk_size = 2000
    url = f'{instring}'

    # A single streamed request is enough. The timeout keeps a stalled server
    # from blocking a worker indefinitely, and the context manager releases
    # the connection even if writing the file fails.
    with requests.get(url, stream=True, timeout=30) as response:
        # Fail loudly instead of saving an HTTP error page as if it were the PDF.
        response.raise_for_status()

        # NOTE(review): the file name is fixed, so overlapping calls overwrite
        # each other's download.
        with open('metadata.pdf', 'wb') as fd:
            for chunk in response.iter_content(chunk_size):
                fd.write(chunk)

    # NOTE(review): the relative path below is resolved by the browser against
    # the page URL - confirm that the app actually serves this file.
    return gr.HTML.update(
        '<object data="metadata.pdf" type="application/pdf" '
        'width="100%" height="500px"></object>'
    )

# Minimal UI: a textbox for the URL, a button, and an HTML pane for the result.
with gr.Blocks() as app:
    inp = gr.Textbox()
    go_btn = gr.Button()
    outp = gr.HTML()
    # Clicking the button passes the textbox value to scrape0 and renders the
    # function's return value in the HTML pane.
    go_btn.click(fn=scrape0, inputs=inp, outputs=outp)

# Enable the request queue (concurrency_count=10), then start the server.
app.queue(concurrency_count=10)
app.launch()