Commit
·
e4157cf
1
Parent(s):
088231a
Code Modified
Browse files
app.py
CHANGED
@@ -1,18 +1,28 @@
|
|
1 |
import gradio as gr
|
2 |
-
import
|
3 |
-
|
4 |
|
5 |
def read_pdf(file):
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
iface = gr.Interface(
|
10 |
read_pdf,
|
11 |
gr.inputs.File(label="Upload a PDF file"),
|
12 |
-
gr.outputs.Textbox(label="Extracted
|
13 |
title="PDF Text Extractor",
|
14 |
-
description="A smooth app that gets text from PDF files using pdfminer
|
15 |
-
theme="ParityError/Anime"
|
16 |
-
|
|
|
17 |
)
|
18 |
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
import PyPDF2
|
3 |
+
import re
|
4 |
|
5 |
def read_pdf(file):
    """Extract the text of an uploaded PDF, one cleaned-up chunk per page.

    Args:
        file: The upload to read. Either an object exposing the file's path
            as ``.name`` (what this function has always consumed) or a plain
            path string. ``None`` is accepted and treated as "no upload".

    Returns:
        The text of every page that yielded any, with runs of consecutive
        line breaks inside a page collapsed to a single one and pages
        separated by a blank line. An empty string when no file was given
        or no text could be extracted.
    """
    # Submitting the form without a file must not crash the callback.
    if file is None:
        return ""

    # Accept both an upload wrapper (path in ``.name``) and a bare path.
    pdf_path = getattr(file, "name", file)

    page_texts = []
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            # Guard against a falsy result so re.sub never receives None.
            raw_text = page.extract_text() or ""
            # Replace multiple consecutive line breaks with a single one.
            cleaned = re.sub(r"\n+", "\n", raw_text).strip()
            # Skip pages without text so they add no stray blank separators.
            if cleaned:
                page_texts.append(cleaned)

    # Each entry is a whole page, so pages are separated by a blank line.
    return "\n\n".join(page_texts)
|
17 |
|
18 |
# Build the web UI: one PDF upload in, the extracted text out.
iface = gr.Interface(
    read_pdf,
    # Top-level components are used instead of the deprecated
    # gr.inputs / gr.outputs namespaces.
    gr.File(label="Upload a PDF file"),
    # The previous call passed lines/spellcheck/readonly to the legacy output
    # Textbox; interactive=False is the component-level way to make the box
    # read-only. NOTE(review): confirm against the pinned Gradio version.
    gr.Textbox(label="Extracted Text", lines=10, interactive=False),
    title="PDF Text Extractor",
    # The extraction is done with PyPDF2, so the description names it.
    description="A smooth app that gets text from PDF files using PyPDF2 🧠",
    theme="ParityError/Anime",
    # NOTE(review): layout="vertical" and width="600px" were dropped because
    # they do not appear to be supported Interface options in Gradio 3.x;
    # restore them only if the pinned version documents them.
)
iface.launch()
|