Spaces:
Runtime error
Runtime error
import argparse | |
import os | |
import signal | |
import sys | |
import json | |
import time | |
import tempfile | |
import zipfile | |
from rich.console import Console | |
from rich.progress import track | |
import camelot | |
import polars as pl | |
import gradio as gr | |
from gradio_pdf import PDF | |
console = Console() | |
class Interface:
    """Filesystem helpers shared by the command-line and web front ends.

    The methods take no instance state and are called on the class itself
    (for example ``Interface.get_tempdir()``), so they are static methods.
    """

    @staticmethod
    def get_tempdir():
        """Create a fresh temporary directory.

        Returns:
            A ``(timestamp, temp_dir)`` tuple: the current Unix time truncated
            to an int, and the path of the newly created directory. The
            directory is not removed automatically.
        """
        timestamp = int(time.time())
        temp_dir = tempfile.mkdtemp()
        return timestamp, temp_dir

    @staticmethod
    def create_zip(file_list, zip_path, password=None):
        """Write files and directory trees into a new zip archive.

        Args:
            file_list: Paths to add. A directory is walked recursively and its
                files are stored relative to that directory; a plain file is
                stored under its base name.
            zip_path: Destination path of the archive (overwritten if present).
            password: Optional password string, passed to
                ``ZipFile.setpassword``.
        """
        # ``allowZip64`` is the ZipFile keyword that enables archives > 4 GiB.
        with zipfile.ZipFile(zip_path, "w", allowZip64=True) as zipf:
            if password:
                # NOTE: the standard-library zipfile module cannot encrypt
                # members on write; setpassword only sets the default password
                # used when reading encrypted members.
                zipf.setpassword(bytes(password, "utf-8"))
            for item in file_list:
                if os.path.isdir(item):
                    for root, _, files in os.walk(item):
                        for file in files:
                            file_path = os.path.join(root, file)
                            arcname = os.path.relpath(file_path, item)
                            zipf.write(file_path, arcname)
                else:
                    zipf.write(item, os.path.basename(item))
class PDFTableParser:
    """Extract tables from PDF files with camelot and write them out as CSV."""

    def __init__(self, input_files, output_files, delimiter, edge_tol, row_tol, pages):
        """Store the conversion settings.

        Args:
            input_files: PDF paths to read.
            output_files: CSV paths to write, paired positionally with
                ``input_files``.
            delimiter: Field separator used when writing CSV.
            edge_tol: Passed to ``camelot.read_pdf`` as ``edge_tol``.
            row_tol: Passed to ``camelot.read_pdf`` as ``row_tol``.
            pages: Passed to ``camelot.read_pdf`` as ``pages``.
        """
        self.input_files = input_files
        self.output_files = output_files
        self.delimiter = delimiter
        self.edge_tol = edge_tol
        self.row_tol = row_tol
        self.pages = pages

    def read_tables(self, file_name):
        """Read tables from ``file_name`` using camelot's ``stream`` flavor.

        Returns:
            The object returned by ``camelot.read_pdf``, or ``None`` if reading
            raised; the error is printed to the console rather than propagated.
        """
        try:
            console.print(f"Reading tables from {file_name}...")
            tables = camelot.read_pdf(
                file_name,
                flavor='stream',
                edge_tol=self.edge_tol,
                row_tol=self.row_tol,
                pages=self.pages,
            )
            console.print(f"Found {len(tables)} tables in {file_name}.")
            return tables
        except Exception as e:
            console.print(f"[red]Error reading {file_name}: {e}[/red]")
            return None

    def save_tables_as_csv(self, tables, output_file):
        """Concatenate ``tables`` and write them to ``output_file`` as CSV.

        Each table's ``.df`` is wrapped in a polars DataFrame and the frames
        are concatenated. Errors are printed to the console and swallowed, so
        the output file may be absent afterwards.
        """
        try:
            console.print(f"Saving tables to {output_file}...")
            df = pl.concat([pl.DataFrame(table.df) for table in tables])
            df.write_csv(output_file, separator=self.delimiter)
            console.print(f"Saved tables to {output_file}.")
        except Exception as e:
            console.print(f"[red]Error saving to {output_file}: {e}[/red]")

    def estimate_processing_time(self, file_name):
        """Print and return a rough processing-time estimate for ``file_name``.

        The file's bytes are decoded as UTF-8 (undecodable bytes dropped) and
        the estimate is ``(lines + words + chars) / 1000`` seconds. This is a
        size heuristic only; it does not parse the PDF.

        Returns:
            The estimate in seconds, or ``0`` if the file could not be read.
        """
        try:
            with open(file_name, 'rb') as f:
                content = f.read().decode('utf-8', errors='ignore')
            # Previously this count was bound to ``pages`` while the formula
            # read ``lines``, so the estimate always failed and returned 0.
            lines = content.count('\n')
            words = len(content.split())
            chars = len(content)
            estimated_time = (lines / 1000) + (words / 1000) + (chars / 1000)
            console.print(f"Estimated processing time for {file_name}: {estimated_time:.2f} seconds.")
            return estimated_time
        except Exception as e:
            console.print(f"[red]Error estimating processing time for {file_name}: {e}[/red]")
            return 0

    def process_files(self):
        """Convert every input file to its paired output file."""
        # Materialise the pairs so the progress bar knows the total count;
        # a bare zip object has no length.
        pairs = list(zip(self.input_files, self.output_files))
        for input_file, output_file in track(pairs, description="Processing files"):
            self.estimate_processing_time(input_file)
            tables = self.read_tables(input_file)
            if tables:
                self.save_tables_as_csv(tables, output_file)
class WebUI:
    """Gradio front end for extracting tables from a single uploaded PDF."""

    def __init__(self):
        pass

    @staticmethod
    def process_pdf(pdf_file, output_path, edge_tol, row_tol, pages):
        """Extract tables from ``pdf_file`` and save them as CSV in a temp dir.

        Args:
            pdf_file: Path of the uploaded PDF (``None`` when the widget is
                cleared).
            output_path: Requested output file name. Only its base name is
                used, so the file always lands inside the temporary directory.
            edge_tol: Forwarded to ``PDFTableParser``.
            row_tol: Forwarded to ``PDFTableParser``.
            pages: Forwarded to ``PDFTableParser``.

        Returns:
            ``(dataframe, [csv_path], status_dict)`` on success, or
            ``(None, None, status_dict)`` on failure.
        """
        failure = (None, None, {"status": "error", "message": "Failed to process PDF"})
        if not pdf_file:
            return failure

        _, tempd = Interface.get_tempdir()
        # output_path comes from a web form: strip directory components so an
        # absolute path or "../" cannot write outside the temp directory.
        safe_name = os.path.basename(output_path or "") or "output.csv"
        tempf = os.path.join(tempd, safe_name)

        parser = PDFTableParser([pdf_file], [tempf], ',', edge_tol, row_tol, pages)
        tables = parser.read_tables(pdf_file)
        if not tables:
            return failure

        parser.save_tables_as_csv(tables, tempf)
        # save_tables_as_csv swallows errors; do not hand the UI a missing file.
        if not os.path.exists(tempf):
            return failure

        df = pl.concat([pl.DataFrame(table.df) for table in tables])
        return df, [tempf], {"status": "success", "message": f"Processed PDF and saved as {tempf}"}

    def run(self):
        """Build the Gradio Blocks app and launch it."""
        with gr.Blocks(title="PDF Table Parser", css="body { font-family: Arial, sans-serif; } footer { visibility: hidden; }") as app:
            gr.Markdown("# PDF Table Parser")
            description = "Upload a PDF file to extract tables"
            gr.Markdown(f"### {description}")
            with gr.Row():
                with gr.Column():
                    pdf_in = PDF(label="Document")
                    with gr.Row():
                        edge_tol = gr.Number(50, label="Edge tol")
                        row_tol = gr.Number(50, label="Row tol")
                    pages = gr.Textbox('1', label="Pages", info="You can pass 'all', '3-end', etc.")
                    output_path = gr.Textbox("output.csv", label="Output Path")
                with gr.Column():
                    status_msg = gr.JSON(label="Status Message")
                    output_files = gr.Files(label="Output Files")
            with gr.Row():
                output_df = gr.Dataframe(label="Extracted Table")
            gr.Examples([["data/demo.pdf"]], inputs=pdf_in)
            # Re-run extraction whenever the uploaded document changes.
            pdf_in.change(
                WebUI.process_pdf,
                inputs=[pdf_in, output_path, edge_tol, row_tol, pages],
                outputs=[output_df, output_files, status_msg],
            )
        app.launch()
def handle_signal(signum, frame):
    """Signal handler: report the interruption and exit with status 1.

    ``signum`` and ``frame`` are supplied by the ``signal`` module and are
    not used.
    """
    interrupted_notice = "\n[red]Process interrupted.[/red]"
    console.print(interrupted_notice)
    raise SystemExit(1)
def main(args):
    """Run the batch conversion described by the parsed CLI arguments.

    Builds a ``PDFTableParser`` from the fields of ``args`` and processes
    every input/output pair.
    """
    PDFTableParser(
        args.input_files,
        args.output_files,
        args.delimiter,
        args.edge_tol,
        args.row_tol,
        args.pages,
    ).process_files()
if __name__ == "__main__":
    # Exit cleanly (with a message) on Ctrl-C or termination.
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    parser = argparse.ArgumentParser(description="PDF Table Parser")
    # A single positional list is used because two consecutive nargs='+'
    # positionals are split greedily by argparse (the first takes all but one
    # value), which made every multi-file invocation fail the length check.
    # The list is split in half below: inputs first, then outputs.
    parser.add_argument(
        "files",
        nargs='*',
        metavar="FILE",
        help="Input PDF files followed by the same number of output CSV files",
    )
    parser.add_argument("--delimiter", default=',', help="Output file delimiter (default: ,)")
    parser.add_argument("--edge_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between text and table edges (default: 50)")
    parser.add_argument("--row_tol", type=int, default=50, help="Tolerance parameter used to specify the distance between table rows (default: 50)")
    parser.add_argument("--pages", type=str, default='all', help="Pages you can pass the number of pages to process. (default: all)")
    parser.add_argument("--webui", action='store_true', help="Launch the web UI")
    # Intermixed parsing keeps "in.pdf --pages 1 out.csv" working with a
    # single variable-length positional.
    args = parser.parse_intermixed_args()

    if args.webui:
        # The web UI takes its input from the browser; no files are required.
        webui = WebUI()
        webui.run()
    else:
        if not args.files:
            parser.error("at least one input file and one output file are required")
        if len(args.files) % 2 != 0:
            console.print("[red]The number of input files and output files must match.[/red]")
            sys.exit(1)
        half = len(args.files) // 2
        args.input_files = args.files[:half]
        args.output_files = args.files[half:]
        main(args)