File size: 4,420 Bytes
3919e25
2e2a7b2
4ffb3fe
9774c1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5135496
de8ef09
9774c1c
 
d171ec8
de8ef09
 
8d346b2
 
 
 
9774c1c
de8ef09
 
8d346b2
de8ef09
 
 
 
9774c1c
de8ef09
9774c1c
 
de8ef09
47a5a07
9774c1c
47a5a07
9774c1c
47a5a07
9774c1c
47a5a07
 
b4be601
9774c1c
352bdf6
 
d171ec8
a923971
3919e25
2d1281f
a923971
d599b56
d02b2ab
a923971
 
 
9774c1c
2e2a7b2
 
 
 
 
 
 
 
a923971
e86a2c5
 
2e2a7b2
 
b7f89cc
2e2a7b2
1c5b68c
fb04ca9
 
 
 
352bdf6
 
 
 
 
1e634f8
8f70505
502b110
d599b56
d846da3
 
 
9c7f619
 
 
9774c1c
 
563ca5d
1e634f8
 
 
ba73e05
9774c1c
3919e25
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import gradio as gr
import requests
from pypdf import PdfReader
import pypdfium2 as pdfium

import easyocr

# Lookup table from human-readable language name to OCR language code.
# The keys populate the "PDF Language" dropdown in the UI below; the value for
# the selected key is the single language code handed to ``easyocr.Reader``
# in ``ocrpdf``.
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}

def pdf_pil(file_path,page_num):
    """Render one page of a PDF to a PNG file and return the PNG's filename.

    Args:
        file_path: Path of the PDF document to open.
        page_num: 1-based page number; it is converted with ``int`` and
            shifted to the 0-based index that ``get_page`` expects.

    Returns:
        The filename ``image_{page_num}.png`` (relative to the current
        working directory) that the rendered page was saved to.
    """
    # Open the document the caller asked for. Previously the argument was
    # ignored and "data.pdf" was always opened, regardless of ``file_path``.
    pdf = pdfium.PdfDocument(file_path)
    print ("\n PDF read !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")

    page = pdf.get_page(int(page_num)-1)
    print ("\n Page read !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")

    bitmap = page.render(
        scale = 1,    # 72dpi resolution
        rotation = 0, # no additional rotation
    )
    print ("\n Page rendered !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")

    pil_image = bitmap.to_pil()
    print ("\n Page to PIL !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")

    # Build the output name once so the saved path and the returned path
    # cannot drift apart.
    out_path = f"image_{page_num}.png"
    pil_image.save(out_path)
    print ("\n Page saved !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! \n")

    return out_path

def ocrpdf(file_path,pdf_lang,page_num):
    """Run OCR on one page of a PDF and return the recognized text.

    The page is first rendered to an image with ``pdf_pil``; an EasyOCR
    reader for the selected language then reads that image.

    Args:
        file_path: Path of the PDF, forwarded to ``pdf_pil``.
        pdf_lang: A key of ``ocr_id`` (a language name); a missing key
            raises ``KeyError``.
        page_num: Page number, forwarded to ``pdf_pil``.

    Returns:
        Element ``[1]`` of every OCR result, concatenated in result order
        with no separator between fragments.
    """
    page_image = pdf_pil(file_path,page_num)
    print("DONE 1 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    # EasyOCR takes a list of language codes; only one is used here.
    languages = [str(ocr_id[pdf_lang])]
    print("DONE 2 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    ocr_reader = easyocr.Reader(languages)
    print("DONE 3 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    detections = ocr_reader.readtext(page_image)
    print("DONE 4 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    # Index 1 of each detection is taken as its text (presumably the
    # (bbox, text, confidence) layout described by EasyOCR - verify).
    return "".join(f'{detection[1]}' for detection in detections)
    

def scrape(instring):
    """Build an HTML update that embeds a PDF URL in the Google Docs viewer.

    Args:
        instring: URL of the PDF as typed by the user.

    Returns:
        A ``gr.HTML.update`` carrying an ``<iframe>`` that points the Google
        Docs viewer at ``instring``.
    """
    from urllib.parse import quote

    # The URL is user input placed inside a query-string value that itself
    # sits inside an HTML attribute. Percent-encode every reserved character
    # so that '&', '?' or '#' in the PDF URL stay part of the ``url``
    # parameter, and so that quotes or angle brackets cannot break out of
    # the attribute and inject markup.
    encoded_url = quote(instring or "", safe="")
    html_src=(f'''
    <div style="text-align:center">
    <h4>PDF Viewer</h4>
    <iframe src="https://docs.google.com/viewer?url={encoded_url}&embedded=true" frameborder="0" height="1200px" width="100%"></iframe>
    </div>''')
    return gr.HTML.update(html_src)

def scrape00(instring, page_num,pdf_lang):
    """Download a PDF, extract the text of one page, and summarize it.

    The PDF is saved as ``data.pdf`` in the working directory. The page's
    embedded text is used when present; when the page has no text, or the
    summarizer fails on it, the page is OCR'd with ``ocrpdf`` and the OCR
    text is summarized instead.

    Args:
        instring: URL of the PDF to download.
        page_num: 1-based page number to process.
        pdf_lang: Language name (a key of ``ocr_id``) used for the OCR
            fallback.

    Returns:
        A ``(text, summary)`` tuple. ``summary`` is the string ``"Error"``
        when no summary could be produced; for download or page-range
        failures ``text`` carries a short description of the problem.
    """
    try:
        # A timeout keeps an unresponsive server from blocking a queue worker.
        response = requests.get(instring, timeout=30)
    except requests.RequestException as err:
        print(err)
        return f"Download failed: {err}", "Error"

    if response.status_code != 200:
        # Stop here: carrying on would parse a stale data.pdf left by an
        # earlier request, or fail because the file does not exist.
        print(response.status_code)
        return f"Download failed: HTTP {response.status_code}", "Error"

    # NOTE(review): this fixed filename is shared by all concurrent requests.
    with open("data.pdf", "wb") as f:
        f.write(response.content)

    reader = PdfReader("data.pdf")
    number_of_pages = len(reader.pages)
    page_index = int(page_num) - 1
    # Without this check page 0 becomes index -1 and silently selects the
    # last page, and a page past the end raises IndexError.
    if not 0 <= page_index < number_of_pages:
        return (
            f"Page {int(page_num)} is out of range "
            f"(document has {number_of_pages} pages)",
            "Error",
        )

    text = reader.pages[page_index].extract_text() or ""
    print (text)

    # Bound before the try so the OCR fallback can tell "model failed to
    # load" apart from "summarizing this text failed".
    summarizer = None
    try:
        summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
        if not text.strip():
            # No text layer on this page: go straight to the OCR fallback.
            raise ValueError("page has no extractable text")
        sum_out = summarizer(text)
    except Exception:
        try:
            text = ocrpdf("data.pdf",pdf_lang,page_num)
            sum_out = summarizer(text) if summarizer is not None else "Error"
        except Exception:
            sum_out = "Error"
    return text, sum_out

# UI definition: a URL + page-number row, a button row with the OCR language
# selector, the embedded viewer, and a row with two output text boxes.
with gr.Blocks() as app:
    # The heading tag is now closed; it was previously left open.
    gr.Markdown('''<h1>PDF Viewer</h1>''')
    with gr.Row():
        inp=gr.Textbox(label="PDF URL",scale=3)
        pg_num=gr.Number(label="Page Number",value=1,precision=0,scale=1)
    with gr.Row():
        go_btn = gr.Button("Load PDF")
        sum_btn = gr.Button("Summarize")
        target_lang = gr.Dropdown(label="PDF Language", choices=list(ocr_id.keys()),value="English")

    outp = gr.HTML()
    with gr.Row():
        text_out = gr.Textbox()
        sum_out = gr.Textbox()
    # "Load PDF" fills the HTML viewer from the URL box.
    go_btn.click(scrape,inp,outp)
    # "Summarize" sends URL, page number and language to scrape00 and writes
    # its two return values to the two text boxes.
    sum_btn.click(scrape00,[inp,pg_num,target_lang],[text_out,sum_out])
app.queue(concurrency_count=10).launch()