File size: 3,762 Bytes
f92f684
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import sys
import camelot
import polars as pl
import signal
import argparse
from rich.console import Console
from rich.progress import track

console = Console()

class PDFTableParser:
    """Extract tables from PDF files with camelot and write them out as CSV.

    Input and output files are paired by position: every table found in one
    PDF is concatenated and written to the matching output path.
    """

    def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
        """Store the conversion settings.

        Args:
            input_files: Paths of the PDF files to read.
            output_files: Paths of the CSV files to write, paired by position
                with ``input_files``.
            delimiter: Separator handed to polars when writing the CSV.
            edge_tol: Forwarded to ``camelot.read_pdf`` as ``edge_tol``.
            row_tol: Forwarded to ``camelot.read_pdf`` as ``row_tol``.
            pages: Forwarded to ``camelot.read_pdf`` as ``pages``.
        """
        self.input_files = input_files
        self.output_files = output_files
        self.delimiter = delimiter
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.pages = pages

    def read_tables(self, file_name):
        """Read the tables of one PDF using camelot's ``stream`` flavor.

        Args:
            file_name: Path of the PDF to parse.

        Returns:
            Whatever ``camelot.read_pdf`` returns, or ``None`` when reading
            failed (the error is printed instead of raised).
        """
        try:
            console.print(f"Reading tables from {file_name}...")
            tables = camelot.read_pdf(
                file_name,
                flavor='stream',
                edge_tol=self.edge_tol,
                row_tol=self.row_tol,
                pages=self.pages,
            )
            console.print(f"Found {len(tables)} tables in {file_name}.")
            return tables
        except Exception as e:
            # Per-file boundary: report the failure and let the caller move on
            # to the next file.
            console.print(f"[red]Error reading {file_name}: {e}[/red]")
            return None

    def save_tables_as_csv(self, tables, output_file):
        """Concatenate ``tables`` and write them to ``output_file`` as CSV.

        Args:
            tables: Iterable of objects exposing a ``df`` attribute that
                ``pl.DataFrame`` accepts.
            output_file: Destination path of the CSV file.

        Errors are printed rather than raised.
        """
        try:
            console.print(f"Saving tables to {output_file}...")
            df = pl.concat([pl.DataFrame(table.df) for table in tables])
            df.write_csv(output_file, separator=self.delimiter)
            console.print(f"Saved tables to {output_file}.")
        except Exception as e:
            console.print(f"[red]Error saving to {output_file}: {e}[/red]")

    @staticmethod
    def _estimate_seconds(content):
        """Return the heuristic time estimate, in seconds, for decoded file text."""
        lines = content.count('\n')
        words = len(content.split())
        chars = len(content)
        return (lines / 1000) + (words / 1000) + (chars / 1000)

    def estimate_processing_time(self, file_name):
        """Print and return a rough processing-time estimate for ``file_name``.

        The estimate is a heuristic over the raw file bytes decoded as UTF-8
        (undecodable bytes are dropped): newline count, whitespace-separated
        token count and character count, each divided by 1000 and summed.

        Args:
            file_name: Path of the file to inspect.

        Returns:
            The estimate in seconds, or ``0`` when the file could not be read.
        """
        try:
            with open(file_name, 'rb') as f:
                content = f.read().decode('utf-8', errors='ignore')
        except Exception as e:
            # The estimate is informational only; never let it abort the run.
            console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
            return 0
        estimated_time = self._estimate_seconds(content)
        console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
        return estimated_time

    def process_files(self):
        """Convert every input PDF to its paired output CSV, showing progress."""
        # Materialise the pairs: a bare zip() has no length, so the progress
        # bar could not know its total.
        jobs = list(zip(self.input_files, self.output_files))
        # Render the bar on the same console the messages are printed to.
        for input_file, output_file in track(jobs, description="Processing files", console=console):
            self.estimate_processing_time(input_file)
            tables = self.read_tables(input_file)
            # Skips both a failed read (None) and a result with no tables.
            if tables:
                self.save_tables_as_csv(tables, output_file)

def handle_signal(signum, frame):
    """Signal handler: print an interruption notice and exit with status 1.

    Args:
        signum: Number of the received signal (unused).
        frame: Stack frame active when the signal arrived (unused).
    """
    notice = "\n[red]Process interrupted.[/red]"
    console.print(notice)
    raise SystemExit(1)

if __name__ == "__main__":
    # Exit with a short notice instead of a traceback on Ctrl-C / termination.
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    parser = argparse.ArgumentParser(description="PDF Table Parser")
    # NOTE(review): both positionals use nargs='+'. argparse appears to give
    # all but the last positional value to input_files, so invocations with
    # more than one input/output pair would fail the length check below --
    # verify and consider explicit options for the two lists.
    parser.add_argument("input_files", nargs='+', help="List of input PDF files")
    parser.add_argument("output_files", nargs='+', help="List of output CSV files")
    parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
    parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
    parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
    parser.add_argument("--pages", type=str, default='all', help="Pages you can pass the number of pages to process. (default: all)")
    parser.add_argument("--webui", action='store_true', help="Launch the web UI")

    args = parser.parse_args()

    # Files are paired by position, so the two lists must line up.
    if len(args.input_files) != len(args.output_files):
        console.print("[red]The number of input files and output files must match.[/red]")
        sys.exit(1)

    if args.webui:
        # NOTE(review): no WebUI class is defined or imported in the visible
        # file -- confirm where it is expected to come from.
        webui = WebUI()
        webui.run()
    else:
        # The file defines no main(); run the conversion directly with the
        # parsed command-line settings.
        table_parser = PDFTableParser(
            args.input_files,
            args.output_files,
            args.delimiter,
            args.edge_tol,
            args.row_tol,
            args.pages,
        )
        table_parser.process_files()