Omnibus committed on
Commit
9774c1c
·
1 Parent(s): 1c5b68c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -2
app.py CHANGED
@@ -1,6 +1,97 @@
1
  import gradio as gr
2
  import requests
3
  from pypdf import PdfReader
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  def scrape(instring):
6
  html_src=(f'''
@@ -10,7 +101,7 @@ def scrape(instring):
10
  </div>''')
11
  return gr.HTML.update(f'''{html_src}''')
12
 
13
- def scrape00(instring, page_num):
14
  response = requests.get(instring, stream=True)
15
 
16
  if response.status_code == 200:
@@ -31,6 +122,7 @@ def scrape00(instring, page_num):
31
  summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
32
  sum_out = summarizer(text)
33
  except Exception:
 
34
  sum_out = "Error"
35
  return text, sum_out
36
 
@@ -42,10 +134,12 @@ with gr.Blocks() as app:
42
  with gr.Row():
43
  go_btn = gr.Button("Load PDF")
44
  sum_btn = gr.Button("Summarize")
 
 
45
  outp = gr.HTML()
46
  with gr.Row():
47
  text_out = gr.Textbox()
48
  sum_out = gr.Textbox()
49
  go_btn.click(scrape,inp,outp)
50
- sum_btn.click(scrape00,[inp,pg_num],[text_out,sum_out])
51
  app.queue(concurrency_count=10).launch()
 
1
  import gradio as gr
2
  import requests
3
  from pypdf import PdfReader
4
+ import pypdfium2 as pdfium
5
+
6
+ import easyocr
7
+
8
# Maps the human-readable language names offered in the "PDF Language"
# dropdown to the language codes handed to easyocr.Reader.
# Insertion order is kept as-is because the dropdown choices are built
# from list(ocr_id.keys()).
ocr_id = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Arabic": "ar",
    "Azerbaijani": "az",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Chinese (simplified)": "ch_sim",
    "Chinese (traditional)": "ch_tra",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "French": "fr",
    "German": "de",
    "Irish": "ga",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Indonesian": "id",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Kannada": "kn",
    "Korean": "ko",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Nepali": "ne",
    "Norwegian": "no",
    "Occitan": "oc",
    "Polish": "pl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
    "Serbian (cyrillic)": "rs_cyrillic",
    "Serbian (latin)": "rs_latin",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Spanish": "es",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Zulu": "zu",
}
66
+
67
def pdf_pil(file_path,page_num):
    """Render a single page of a PDF document to a PIL image via pypdfium2.

    Args:
        file_path: Location of the PDF; it is formatted into a string
            before being handed to ``pdfium.PdfDocument``.
        page_num: Page index passed unchanged to ``PdfDocument.get_page``.
            NOTE(review): presumably zero-based and must be an int --
            confirm against the caller that supplies it.

    Returns:
        The object returned by ``page.render_topil`` for the requested page.
    """
    pdf_location = f"{file_path}"
    document = pdfium.PdfDocument(pdf_location)
    selected_page = document.get_page(page_num)

    # Render settings: native scale, no rotation, no cropping, opaque
    # white background, annotations drawn, full colour.
    render_settings = {
        "scale": 1,
        "rotation": 0,
        "crop": (0, 0, 0, 0),
        "colour": (255, 255, 255, 255),
        "annotations": True,
        "greyscale": False,
        "optimise_mode": pdfium.OptimiseMode.NONE,
    }
    return selected_page.render_topil(**render_settings)
85
+
86
def ocrpdf(file_path,pdf_lang,page_num):
    """Run EasyOCR on one rendered page of a PDF and return the text found.

    The page is rendered with ``pdf_pil`` and then passed to an
    ``easyocr.Reader`` created for the selected language.

    Args:
        file_path: Location of the PDF, forwarded to ``pdf_pil``.
        pdf_lang: Human-readable language name; must be a key of ``ocr_id``.
        page_num: Page index, forwarded to ``pdf_pil``.

    Returns:
        The recognised text fragments joined with newlines (an empty string
        when nothing was recognised). Each fragment is also printed, as the
        original implementation did.

    Raises:
        KeyError: If ``pdf_lang`` is not a key of ``ocr_id``.
    """
    # Local import keeps this block self-contained; numpy is already a
    # dependency of easyocr, so no new package is required.
    import numpy as np

    page_image = pdf_pil(file_path, page_num)
    reader = easyocr.Reader([ocr_id[pdf_lang]])

    # readtext() documents file path/URL, raw bytes and numpy array inputs,
    # so hand it an RGB array rather than the PIL image object itself.
    page_pixels = np.asarray(page_image.convert("RGB"))
    bounds = reader.readtext(page_pixels)

    # Each result is (bounding_box, text, confidence); keep only the text.
    fragments = [bound[1] for bound in bounds]
    for fragment in fragments:
        print(fragment)
    return "\n".join(fragments)
94
+
95
 
96
  def scrape(instring):
97
  html_src=(f'''
 
101
  </div>''')
102
  return gr.HTML.update(f'''{html_src}''')
103
 
104
+ def scrape00(instring, page_num,pdf_lang):
105
  response = requests.get(instring, stream=True)
106
 
107
  if response.status_code == 200:
 
122
  summarizer = gr.Interface.load("huggingface/facebook/bart-large-cnn")
123
  sum_out = summarizer(text)
124
  except Exception:
125
+ ocr_pdf(data.pdf,pdf_lang,page_num)
126
  sum_out = "Error"
127
  return text, sum_out
128
 
 
134
  with gr.Row():
135
  go_btn = gr.Button("Load PDF")
136
  sum_btn = gr.Button("Summarize")
137
+ target_lang = gr.Dropdown(label="PDF Language", choices=list(ocr_id.keys()),value="English")
138
+
139
  outp = gr.HTML()
140
  with gr.Row():
141
  text_out = gr.Textbox()
142
  sum_out = gr.Textbox()
143
  go_btn.click(scrape,inp,outp)
144
+ sum_btn.click(scrape00,[inp,pg_num,target_lang],[text_out,sum_out])
145
  app.queue(concurrency_count=10).launch()