Spaces:
Runtime error
Runtime error
morisono
committed on
Upload folder using huggingface_hub
Browse files- README.md +4 -23
- src/app/__pycache__/common.cpython-310.pyc +0 -0
- src/app/__pycache__/parser.cpython-310.pyc +0 -0
- src/app/common.py +24 -0
- src/app/parser.py +87 -0
- src/app/run.py +4 -106
README.md
CHANGED
@@ -32,7 +32,7 @@ This script extracts tables from PDF files and saves them as CSV files. It suppo
|
|
32 |
To run the script via CLI, use the following command:
|
33 |
|
34 |
```bash
|
35 |
-
python src/app/
|
36 |
```
|
37 |
|
38 |
#### Arguments:
|
@@ -52,7 +52,7 @@ python src/app/run.py input1.pdf input2.pdf output1.csv output2.csv
|
|
52 |
To run the script with the web UI, use the following command:
|
53 |
|
54 |
```bash
|
55 |
-
python src/app/run.py
|
56 |
```
|
57 |
|
58 |
This will launch a Gradio-based web application where you can upload PDFs and view the extracted tables interactively.
|
@@ -62,28 +62,9 @@ This will launch a Gradio-based web application where you can upload PDFs and vi
|
|
62 |
### CLI Example
|
63 |
|
64 |
```bash
|
65 |
-
python src/app/
|
66 |
```
|
67 |
|
68 |
-
### Web UI Example
|
69 |
-
|
70 |
-
```bash
|
71 |
-
python src/app/run.py data/demo.pdf data/output.csv --webui
|
72 |
-
```
|
73 |
-
|
74 |
-
## Handling Interruptions
|
75 |
-
|
76 |
-
The script handles `SIGINT` and `SIGTERM` signals gracefully, ensuring that processing can be interrupted safely.
|
77 |
-
|
78 |
## License
|
79 |
|
80 |
-
This project is licensed under the MIT License.
|
81 |
-
|
82 |
-
## Acknowledgements
|
83 |
-
|
84 |
-
This script uses the following libraries:
|
85 |
-
- [Rich](https://github.com/willmcgugan/rich) for console output and progress bars
|
86 |
-
- [Camelot](https://github.com/camelot-dev/camelot) for PDF table extraction
|
87 |
-
- [Polars](https://github.com/pola-rs/polars) for efficient DataFrame operations
|
88 |
-
- [Gradio](https://github.com/gradio-app/gradio) for the web UI
|
89 |
-
- [gradio_pdf](https://github.com/gradio-app/gradio) for PDF handling in Gradio
|
|
|
32 |
To run the script via CLI, use the following command:
|
33 |
|
34 |
```bash
|
35 |
+
python src/app/parser.py input1.pdf input2.pdf output1.csv output2.csv
|
36 |
```
|
37 |
|
38 |
#### Arguments:
|
|
|
52 |
To run the script with the web UI, use the following command:
|
53 |
|
54 |
```bash
|
55 |
+
python src/app/run.py
|
56 |
```
|
57 |
|
58 |
This will launch a Gradio-based web application where you can upload PDFs and view the extracted tables interactively.
|
|
|
62 |
### CLI Example
|
63 |
|
64 |
```bash
|
65 |
+
python src/app/parser.py data/demo.pdf data/output.csv --delimiter ";" --edge_tol 60 --row_tol 40
|
66 |
```
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
## License
|
69 |
|
70 |
+
This project is licensed under the MIT License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/app/__pycache__/common.cpython-310.pyc
ADDED
Binary file (1.1 kB). View file
|
|
src/app/__pycache__/parser.cpython-310.pyc
ADDED
Binary file (4.1 kB). View file
|
|
src/app/common.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import tempfile
import time
import zipfile
|
4 |
+
|
5 |
+
class Interface:
    """Utility helpers for temporary-directory creation and zip packaging."""

    @staticmethod
    def get_tempdir():
        """Return ``(unix_timestamp, path)`` for a freshly created temp directory.

        The caller is responsible for cleaning the directory up.
        """
        timestamp = int(time.time())
        temp_dir = tempfile.mkdtemp()
        return timestamp, temp_dir

    @staticmethod
    def create_zip(file_list, zip_path, password=None):
        """Zip the files/directories in *file_list* into *zip_path*.

        Directories are walked recursively and their entries stored relative
        to the directory root; plain files are stored under their basename.

        NOTE: the stdlib ``zipfile`` module cannot write encrypted archives;
        ``setpassword`` only affects extraction, so *password* does NOT
        protect the archive being created here.
        """
        # BUG FIX: the original passed the invalid keyword 'zipfilep64',
        # which raises TypeError; the real parameter is 'allowZip64'.
        with zipfile.ZipFile(zip_path, "w", allowZip64=True) as zipf:
            if password:
                zipf.setpassword(bytes(password, 'utf-8'))
            for item in file_list:
                if os.path.isdir(item):
                    for root, _, files in os.walk(item):
                        for file in files:
                            file_path = os.path.join(root, file)
                            arcname = os.path.relpath(file_path, item)
                            zipf.write(file_path, arcname)
                else:
                    arcname = os.path.basename(item)
                    zipf.write(item, arcname)
|
src/app/parser.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import camelot
|
3 |
+
import polars as pl
|
4 |
+
import signal
|
5 |
+
import argparse
|
6 |
+
from rich.console import Console
|
7 |
+
from rich.progress import track
|
8 |
+
|
9 |
+
console = Console()
|
10 |
+
|
11 |
+
class PDFTableParser:
    """Extract tables from PDFs with camelot and save them as CSV via polars."""

    def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
        # Parallel lists: input_files[i] is written to output_files[i].
        self.input_files = input_files
        self.output_files = output_files
        self.delimiter = delimiter   # CSV field separator
        self.edge_tol = edge_tol     # camelot 'stream' edge tolerance
        self.row_tol = row_tol       # camelot 'stream' row tolerance
        self.pages = pages           # camelot page selector, e.g. 'all'

    def read_tables(self, file_name):
        """Return the camelot table list for *file_name*, or None on failure."""
        try:
            console.print(f"Reading tables from {file_name}...")
            tables = camelot.read_pdf(file_name, flavor='stream', edge_tol=self.edge_tol, row_tol=self.row_tol, pages=self.pages)
            console.print(f"Found {len(tables)} tables in {file_name}.")
            return tables
        except Exception as e:
            console.print(f"[red]Error reading {file_name}: {e}[/red]")
            return None

    def save_tables_as_csv(self, tables, output_file):
        """Concatenate all extracted tables and write them to *output_file*."""
        try:
            console.print(f"Saving tables to {output_file}...")
            df = pl.concat([pl.DataFrame(table.df) for table in tables])
            df.write_csv(output_file, separator=self.delimiter)
            console.print(f"Saved tables to {output_file}.")
        except Exception as e:
            console.print(f"[red]Error saving to {output_file}: {e}[/red]")

    def estimate_processing_time(self, file_name):
        """Print and return a rough processing-time estimate (seconds).

        Heuristic: (newlines + words + chars) / 1000 over the raw bytes
        decoded leniently -- a crude size proxy, not a real page count.
        Returns 0 if the file cannot be read.
        """
        try:
            with open(file_name, 'rb') as f:
                content = f.read().decode('utf-8', errors='ignore')
            # BUG FIX: the original stored the newline count in 'pages' but
            # then used an undefined name 'lines', so every call hit the
            # except branch and returned 0.
            lines = content.count('\n')
            words = len(content.split())
            chars = len(content)
            estimated_time = (lines / 1000) + (words / 1000) + (chars / 1000)
            console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
            return estimated_time
        except Exception as e:
            console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
            return 0

    def process_files(self):
        """Process each (input, output) pair: estimate, extract, save."""
        for input_file, output_file in track(zip(self.input_files, self.output_files), description="Processing files"):
            self.estimate_processing_time(input_file)
            tables = self.read_tables(input_file)
            if tables:
                self.save_tables_as_csv(tables, output_file)
|
59 |
+
|
60 |
+
def handle_signal(sig, frm):
    """Handle SIGINT/SIGTERM: announce the interruption, then exit non-zero."""
    # Report before the hard exit so the user sees why the run stopped.
    console.print("\n[red]Process interrupted.[/red]")
    sys.exit(1)
|
63 |
+
|
64 |
+
if __name__ == "__main__":
    # Exit cleanly on Ctrl-C / termination instead of dumping a traceback.
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    parser = argparse.ArgumentParser(description="PDF Table Parser")
    parser.add_argument("input_files", nargs='+', help="List of input PDF files")
    parser.add_argument("output_files", nargs='+', help="List of output CSV files")
    parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
    parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
    parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
    parser.add_argument("--pages", type=str, default='all', help="Pages you can pass the number of pages to process. (default: all)")
    parser.add_argument("--webui", action='store_true', help="Launch the web UI")

    args = parser.parse_args()

    if len(args.input_files) != len(args.output_files):
        console.print("[red]The number of input files and output files must match.[/red]")
        sys.exit(1)

    if args.webui:
        # BUG FIX: this module defines no WebUI class, so the original line
        # raised NameError. The UI lives in run.py; import it lazily here so
        # plain CLI runs never pay for (or fail on) the gradio stack.
        from run import WebUI
        webui = WebUI()
        webui.run()
    else:
        # BUG FIX: 'main' was also undefined in this module (NameError).
        # Drive PDFTableParser directly with the parsed arguments instead.
        PDFTableParser(args.input_files, args.output_files, args.delimiter,
                       args.edge_tol, args.row_tol, args.pages).process_files()
|
src/app/run.py
CHANGED
@@ -1,89 +1,12 @@
|
|
1 |
-
import argparse
|
2 |
import os
|
3 |
-
import signal
|
4 |
-
import sys
|
5 |
import json
|
6 |
-
import time
|
7 |
-
import tempfile
|
8 |
-
import zipfile
|
9 |
-
from rich.console import Console
|
10 |
-
from rich.progress import track
|
11 |
-
import camelot
|
12 |
import polars as pl
|
13 |
import gradio as gr
|
14 |
from gradio_pdf import PDF
|
15 |
|
16 |
-
|
|
|
17 |
|
18 |
-
class Interface:
|
19 |
-
def get_tempdir():
|
20 |
-
timestamp = int(time.time())
|
21 |
-
temp_dir = tempfile.mkdtemp()
|
22 |
-
return timestamp, temp_dir
|
23 |
-
|
24 |
-
def create_zip(file_list, zip_path, password=None):
|
25 |
-
with zipfile.ZipFile(zip_path, "w", zipfilep64=True) as zipf:
|
26 |
-
if password:
|
27 |
-
zipf.setpassword(bytes(password, 'utf-8'))
|
28 |
-
for item in file_list:
|
29 |
-
if os.path.isdir(item):
|
30 |
-
for root, _, files in os.walk(item):
|
31 |
-
for file in files:
|
32 |
-
file_path = os.path.join(root, file)
|
33 |
-
arcname = os.path.relpath(file_path, item)
|
34 |
-
zipf.write(file_path, arcname)
|
35 |
-
else:
|
36 |
-
arcname = os.path.basename(item)
|
37 |
-
zipf.write(item, arcname)
|
38 |
-
|
39 |
-
class PDFTableParser:
|
40 |
-
def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
|
41 |
-
self.input_files = input_files
|
42 |
-
self.output_files = output_files
|
43 |
-
self.delimiter = delimiter
|
44 |
-
self.edge_tol = edge_tol
|
45 |
-
self.row_tol = row_tol
|
46 |
-
self.pages = pages
|
47 |
-
|
48 |
-
def read_tables(self, file_name):
|
49 |
-
try:
|
50 |
-
console.print(f"Reading tables from {file_name}...")
|
51 |
-
tables = camelot.read_pdf(file_name, flavor='stream', edge_tol=self.edge_tol, row_tol=self.row_tol, pages=self.pages)
|
52 |
-
console.print(f"Found {len(tables)} tables in {file_name}.")
|
53 |
-
return tables
|
54 |
-
except Exception as e:
|
55 |
-
console.print(f"[red]Error reading {file_name}: {e}[/red]")
|
56 |
-
return None
|
57 |
-
|
58 |
-
def save_tables_as_csv(self, tables, output_file):
|
59 |
-
try:
|
60 |
-
console.print(f"Saving tables to {output_file}...")
|
61 |
-
df = pl.concat([pl.DataFrame(table.df) for table in tables])
|
62 |
-
df.write_csv(output_file, separator=self.delimiter)
|
63 |
-
console.print(f"Saved tables to {output_file}.")
|
64 |
-
except Exception as e:
|
65 |
-
console.print(f"[red]Error saving to {output_file}: {e}[/red]")
|
66 |
-
|
67 |
-
def estimate_processing_time(self, file_name):
|
68 |
-
try:
|
69 |
-
with open(file_name, 'rb') as f:
|
70 |
-
content = f.read().decode('utf-8', errors='ignore')
|
71 |
-
pages = content.count('\n')
|
72 |
-
words = len(content.split())
|
73 |
-
chars = len(content)
|
74 |
-
estimated_time = (lines / 1000) + (words / 1000) + (chars / 1000)
|
75 |
-
console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
|
76 |
-
return estimated_time
|
77 |
-
except Exception as e:
|
78 |
-
console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
|
79 |
-
return 0
|
80 |
-
|
81 |
-
def process_files(self):
|
82 |
-
for input_file, output_file in track(zip(self.input_files, self.output_files), description="Processing files"):
|
83 |
-
self.estimate_processing_time(input_file)
|
84 |
-
tables = self.read_tables(input_file)
|
85 |
-
if tables:
|
86 |
-
self.save_tables_as_csv(tables, output_file)
|
87 |
|
88 |
class WebUI:
|
89 |
def __init__(self):
|
@@ -128,35 +51,10 @@ class WebUI:
|
|
128 |
|
129 |
app.launch()
|
130 |
|
131 |
-
def handle_signal(signum, frame):
|
132 |
-
console.print("\n[red]Process interrupted.[/red]")
|
133 |
-
sys.exit(1)
|
134 |
-
|
135 |
def main(args):
|
136 |
parser = PDFTableParser(args.input_files, args.output_files, args.delimiter, args.edge_tol, args.row_tol, args.pages)
|
137 |
parser.process_files()
|
138 |
|
139 |
if __name__ == "__main__":
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
parser = argparse.ArgumentParser(description="PDF Table Parser")
|
144 |
-
parser.add_argument("input_files", nargs='+', help="List of input PDF files")
|
145 |
-
parser.add_argument("output_files", nargs='+', help="List of output CSV files")
|
146 |
-
parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
|
147 |
-
parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
|
148 |
-
parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
|
149 |
-
parser.add_argument("--pages", type=str, default='all', help="Pages you can pass the number of pages to process. (default: all)")
|
150 |
-
parser.add_argument("--webui", action='store_true', help="Launch the web UI")
|
151 |
-
|
152 |
-
args = parser.parse_args()
|
153 |
-
|
154 |
-
if len(args.input_files) != len(args.output_files):
|
155 |
-
console.print("[red]The number of input files and output files must match.[/red]")
|
156 |
-
sys.exit(1)
|
157 |
-
|
158 |
-
if args.webui:
|
159 |
-
webui = WebUI()
|
160 |
-
webui.run()
|
161 |
-
else:
|
162 |
-
main(args)
|
|
|
|
|
1 |
import os
|
|
|
|
|
2 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import polars as pl
|
4 |
import gradio as gr
|
5 |
from gradio_pdf import PDF
|
6 |
|
7 |
+
from common import Interface
|
8 |
+
from parser import PDFTableParser
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
class WebUI:
|
12 |
def __init__(self):
|
|
|
51 |
|
52 |
app.launch()
|
53 |
|
|
|
|
|
|
|
|
|
54 |
def main(args):
    """Run the CLI pipeline: build a PDFTableParser from parsed args and process every file pair."""
    # Named 'table_parser' to avoid shadowing the imported 'parser' module.
    table_parser = PDFTableParser(
        args.input_files,
        args.output_files,
        args.delimiter,
        args.edge_tol,
        args.row_tol,
        args.pages,
    )
    table_parser.process_files()
|
57 |
|
58 |
if __name__ == "__main__":
    # Launching run.py directly always starts the Gradio web UI.
    WebUI().run()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|