Teera committed on
Commit
3c77bf4
·
verified ·
1 Parent(s): bdab50e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -91
app.py CHANGED
@@ -1,91 +1,89 @@
1
- import subprocess
2
- import json
3
- import os
4
- import gradio as gr
5
- from PyPDF2 import PdfReader, PdfWriter
6
-
7
- !pip install pandas
8
-
9
- # Function to split PDF into batches of 3 pages
10
- def split_pdf(file_path, batch_size=3):
11
- pdf_reader = PdfReader(open(file_path, "rb"))
12
- total_pages = len(pdf_reader.pages)
13
- pdf_batches = []
14
-
15
- # Split the PDF into batches of 3 pages
16
- for i in range(0, total_pages, batch_size):
17
- pdf_writer = PdfWriter()
18
- for j in range(i, min(i + batch_size, total_pages)):
19
- pdf_writer.add_page(pdf_reader.pages[j])
20
-
21
- batch_path = f"./temp_batch_{i // batch_size}.pdf"
22
- with open(batch_path, "wb") as batch_file:
23
- pdf_writer.write(batch_file)
24
-
25
- pdf_batches.append(batch_path)
26
-
27
- return pdf_batches
28
-
29
- # Function to process the PDF batch using subprocess
30
- def process_pdf_batch(batch_path, output_dir):
31
- # Extract the base name of the batch file
32
- pdf_name = os.path.basename(batch_path).split('.')[0]
33
- result_path = os.path.join(output_dir, pdf_name, "results.json")
34
-
35
- # Build the OCR command
36
- ocr_command = ["surya_ocr", batch_path, "--results_dir", output_dir]
37
-
38
- # Run the command using subprocess
39
- try:
40
- result = subprocess.run(ocr_command, check=True, text=True, capture_output=True,encoding="utf-8")
41
- print("OCR Command Output:", result.stdout)
42
- except subprocess.CalledProcessError as e:
43
- return f"OCR processing failed: {e.stderr}"
44
-
45
- # After OCR processing, read the results from the JSON file
46
- if os.path.exists(result_path):
47
- with open(result_path, 'r', encoding="utf-8") as f:
48
- data = json.load(f)
49
-
50
- # Extract text from the JSON
51
- result_text = ''
52
- for page_data in data[pdf_name]:
53
- for line in page_data['text_lines']:
54
- result_text += line['text'] + '\n'
55
-
56
- return result_text
57
- else:
58
- return "OCR processing completed, but result file not found."
59
-
60
- # Main function to process the entire PDF in batches
61
- def process_pdf(file):
62
- # Define output directory
63
- output_dir = "./result"
64
-
65
- # Split the uploaded PDF into batches of 3 pages
66
- pdf_batches = split_pdf(file.name, batch_size=3)
67
-
68
- # Process each batch and accumulate results
69
- final_text = ""
70
- for batch_path in pdf_batches:
71
- batch_result = process_pdf_batch(batch_path, output_dir)
72
- final_text += batch_result + "\n"
73
-
74
- return final_text
75
-
76
- # Define Gradio interface
77
- def process_pdf_gradio(file):
78
- # Gradio handles the file upload differently, so process accordingly
79
- result = process_pdf(file)
80
- return result
81
-
82
- # Gradio app
83
- app = gr.Interface(
84
- fn=process_pdf_gradio,
85
- inputs=gr.File(label="Upload PDF"),
86
- outputs=gr.Textbox(label="Extracted Text"),
87
- title="PDF OCR Extractor"
88
- )
89
-
90
- # Launch the app with a specified port for Docker
91
- app.launch(server_name="0.0.0.0", server_port=7860, share=True)
 
1
+ import subprocess
2
+ import json
3
+ import os
4
+ import gradio as gr
5
+ from PyPDF2 import PdfReader, PdfWriter
6
+
7
+ # Function to split PDF into batches of 3 pages
8
def split_pdf(file_path, batch_size=3):
    """Split a PDF into consecutive batches of at most `batch_size` pages.

    Args:
        file_path: Path to the source PDF on disk.
        batch_size: Maximum number of pages per batch (default 3).

    Returns:
        List of paths to the temporary per-batch PDF files, written as
        ./temp_batch_<n>.pdf in the working directory.
    """
    pdf_batches = []

    # Keep the source file open for the whole read/write loop and close it
    # on exit — the original `PdfReader(open(...))` leaked the file handle.
    with open(file_path, "rb") as src:
        pdf_reader = PdfReader(src)
        total_pages = len(pdf_reader.pages)

        # Split the PDF into batches of `batch_size` pages.
        for i in range(0, total_pages, batch_size):
            pdf_writer = PdfWriter()
            for j in range(i, min(i + batch_size, total_pages)):
                pdf_writer.add_page(pdf_reader.pages[j])

            batch_path = f"./temp_batch_{i // batch_size}.pdf"
            with open(batch_path, "wb") as batch_file:
                pdf_writer.write(batch_file)

            pdf_batches.append(batch_path)

    return pdf_batches
26
+
27
+ # Function to process the PDF batch using subprocess
28
def process_pdf_batch(batch_path, output_dir):
    """Run the `surya_ocr` CLI on one batch PDF and return its text.

    Args:
        batch_path: Path to a batch PDF (as produced by split_pdf).
        output_dir: Directory surya_ocr writes its results into.

    Returns:
        The extracted text on success, or a human-readable error message
        string if OCR fails, the executable is missing, or the result
        file is not found.
    """
    # surya_ocr writes <output_dir>/<pdf stem>/results.json. Use splitext
    # so a dotted file name keeps its full stem (the previous
    # `split('.')[0]` truncated at the first dot).
    pdf_name = os.path.splitext(os.path.basename(batch_path))[0]
    result_path = os.path.join(output_dir, pdf_name, "results.json")

    # Build the OCR command as an argument list (no shell involved).
    ocr_command = ["surya_ocr", batch_path, "--results_dir", output_dir]

    try:
        result = subprocess.run(
            ocr_command,
            check=True,
            text=True,
            capture_output=True,
            encoding="utf-8",
        )
        print("OCR Command Output:", result.stdout)
    except subprocess.CalledProcessError as e:
        return f"OCR processing failed: {e.stderr}"
    except FileNotFoundError:
        # subprocess.run raises FileNotFoundError (not CalledProcessError)
        # when the executable itself is not on PATH — previously uncaught.
        return "OCR processing failed: surya_ocr executable not found."

    # After OCR processing, read the results from the JSON file.
    if os.path.exists(result_path):
        with open(result_path, 'r', encoding="utf-8") as f:
            data = json.load(f)

        # Flatten every recognized text line across all pages.
        result_text = ''
        for page_data in data[pdf_name]:
            for line in page_data['text_lines']:
                result_text += line['text'] + '\n'

        return result_text
    else:
        return "OCR processing completed, but result file not found."
57
+
58
+ # Main function to process the entire PDF in batches
59
def process_pdf(file):
    """OCR an uploaded PDF end-to-end and return the concatenated text.

    Splits the upload into small batches, OCRs each batch, and stitches
    the per-batch results together, one newline after each batch.
    """
    # All per-batch OCR output lands under this directory.
    output_dir = "./result"

    # Break the upload into 3-page chunks so each OCR run stays small.
    batch_paths = split_pdf(file.name, batch_size=3)

    # OCR every chunk and join the pieces, appending "\n" after each one
    # (identical to accumulating `result + "\n"` in a loop).
    pieces = [process_pdf_batch(path, output_dir) for path in batch_paths]
    return "".join(piece + "\n" for piece in pieces)
73
+
74
+ # Define Gradio interface
75
def process_pdf_gradio(file):
    """Gradio entry point: delegate the uploaded file to process_pdf."""
    # Gradio hands us a file-like upload object; process_pdf knows how
    # to read its .name attribute, so simply pass it through.
    return process_pdf(file)
79
+
80
+ # Gradio app
81
# Gradio app: single file-upload input mapped to a plain-text output box.
app = gr.Interface(
    fn=process_pdf_gradio,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="PDF OCR Extractor"
)

# Launch the app with a specified port for Docker.
# Binding 0.0.0.0 exposes the server on all interfaces (needed inside a
# container). NOTE(review): share=True also opens a public Gradio tunnel —
# confirm that is intended for this deployment.
app.launch(server_name="0.0.0.0", server_port=7860, share=True)