PDFtoCSVForFD

Runtime error

App Files Files Community

danial0203 committed on Mar 13, 2024

Commit

41f37dc

verified ·

1 Parent(s): 116beb9

Create app.py

Browse files

Files changed (1) hide show

app.py +129 -0

app.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import boto3
+import csv
+import os
+from botocore.exceptions import NoCredentialsError
+from pdf2image import convert_from_path
+from PIL import Image
+import gradio as gr
+from io import BytesIO
+# AWS setup: credentials, region and bucket name are read from environment variables.
+aws_access_key_id = os.environ.get('AWS_ACCESS_KEY')
+aws_secret_access_key = os.environ.get('AWS_SECRET_KEY')
+region_name = os.environ.get('AWS_REGION')
+s3_bucket = os.environ.get('AWS_BUCKET')
+# Both service clients are created with the same credentials and region.
+_aws_client_options = {
+    'aws_access_key_id': aws_access_key_id,
+    'aws_secret_access_key': aws_secret_access_key,
+    'region_name': region_name,
+}
+textract_client = boto3.client('textract', **_aws_client_options)
+s3_client = boto3.client('s3', **_aws_client_options)
+def upload_file_to_s3(file_content, bucket, object_name=None):
+    """Upload a file to S3 and return the object key that was used.
+
+    Args:
+        file_content: Either a binary file-like object or a filesystem path
+            (``str`` / ``os.PathLike``) of the file to upload.
+        bucket: Name of the destination bucket.
+        object_name: Key to store the object under. When omitted, the base
+            name of the path (or of the file object's ``name`` attribute) is
+            used.
+
+    Returns:
+        The object key on success, or ``None`` when no key could be derived,
+        the file was not found, or no AWS credentials were available.
+    """
+    is_path = isinstance(file_content, (str, os.PathLike))
+    if object_name is None:
+        # A file-like object cannot be passed to os.path.basename directly;
+        # fall back to its ``name`` attribute, which may be missing (BytesIO)
+        # or not a path at all (e.g. an integer file descriptor).
+        source_name = file_content if is_path else getattr(file_content, 'name', None)
+        if not isinstance(source_name, (str, os.PathLike)):
+            print("Cannot determine an object name for the upload")
+            return None
+        object_name = os.path.basename(os.fspath(source_name))
+    try:
+        if is_path:
+            s3_client.upload_file(os.fspath(file_content), bucket, object_name)
+        else:
+            s3_client.upload_fileobj(file_content, bucket, object_name)
+        return object_name
+    except FileNotFoundError:
+        print("The file was not found")
+        return None
+    except NoCredentialsError:
+        print("Credentials not available")
+        return None
+def process_image(file_content, s3_bucket, textract_client, object_name):
+    """Upload one image to S3 and run Textract table analysis on it.
+
+    Returns the ``analyze_document`` response, or ``None`` when the upload
+    did not yield an object key.
+    """
+    uploaded_key = upload_file_to_s3(file_content, s3_bucket, object_name)
+    if uploaded_key:
+        # Textract reads the document straight from the uploaded S3 object.
+        s3_location = {'Bucket': s3_bucket, 'Name': uploaded_key}
+        return textract_client.analyze_document(
+            Document={'S3Object': s3_location},
+            FeatureTypes=["TABLES"],
+        )
+    return None
+def generate_table_csv(tables, blocks_map, writer):
+    """Write every table in ``tables`` to ``writer`` as CSV rows.
+
+    Args:
+        tables: Iterable of table blocks.
+        blocks_map: Mapping of block id to block, used to resolve cells.
+        writer: Object with a ``writerow`` method (e.g. ``csv.writer``).
+
+    Rows are emitted in ascending row-index order, and every row of a table
+    is padded with empty strings to that table's widest row so the output is
+    rectangular. Tables that yield no cells are skipped.
+    """
+    for table in tables:
+        rows = get_rows_columns_map(table, blocks_map)
+        # Widest column index seen anywhere in this table (0 if no cells).
+        width = max((max(cols) for cols in rows.values() if cols), default=0)
+        if not width:
+            continue
+        # Sort explicitly instead of relying on the order cells were listed.
+        for row_index in sorted(rows):
+            cols = rows[row_index]
+            writer.writerow([cols.get(col_index, "") for col_index in range(1, width + 1)])
+def get_rows_columns_map(table_result, blocks_map):
+    """Build ``{row_index: {column_index: cell_text}}`` for one table block.
+
+    Args:
+        table_result: Table block; its ``CHILD`` relationships list the ids
+            of the blocks belonging to the table.
+        blocks_map: Mapping of block id to block.
+
+    Returns:
+        A dict keyed by ``RowIndex`` whose values are dicts keyed by
+        ``ColumnIndex``. Child blocks lacking either index are ignored. A
+        table without a ``Relationships`` entry yields an empty dict.
+    """
+    rows = {}
+    # A block with no relationships may omit the key entirely; treat that as
+    # "no cells", matching the guard used when extracting cell text.
+    for relationship in table_result.get('Relationships') or []:
+        if relationship['Type'] != 'CHILD':
+            continue
+        for child_id in relationship['Ids']:
+            cell = blocks_map[child_id]
+            if 'RowIndex' not in cell or 'ColumnIndex' not in cell:
+                continue
+            row = rows.setdefault(cell['RowIndex'], {})
+            row[cell['ColumnIndex']] = get_text(cell, blocks_map)
+    return rows
+def get_text(result, blocks_map):
+    """Return the text of a block, assembled from its ``CHILD`` blocks.
+
+    ``WORD`` children contribute their ``Text``; ``SELECTION_ELEMENT``
+    children contribute ``X`` when their status is ``SELECTED``. Pieces are
+    separated by single spaces and the result is stripped.
+    """
+    pieces = []
+    for relationship in result.get('Relationships', []):
+        if relationship['Type'] != 'CHILD':
+            continue
+        for child_id in relationship['Ids']:
+            block = blocks_map[child_id]
+            block_type = block['BlockType']
+            if block_type == 'WORD':
+                pieces.append(block['Text'])
+            elif block_type == 'SELECTION_ELEMENT' and block['SelectionStatus'] == 'SELECTED':
+                pieces.append('X')
+    return ' '.join(pieces).strip()
+def is_image_file(filename):
+    """Return True when ``filename`` has a raster-image extension.
+
+    The comparison is case-insensitive and requires the leading dot, so a
+    name such as ``notes_png`` is not mistaken for an image. TIFF is included
+    because it is an image format rather than a PDF.
+    """
+    image_file_extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')
+    # str.endswith accepts a tuple, testing every suffix in one call.
+    return filename.lower().endswith(image_file_extensions)
+def _read_upload(input_file):
+    """Return ``(base_name, raw_bytes)`` for an upload given as a path or a file object."""
+    if input_file is None:
+        raise ValueError("No file was uploaded")
+    if isinstance(input_file, (str, os.PathLike)):
+        path = os.fspath(input_file)
+        with open(path, 'rb') as handle:
+            return os.path.basename(path), handle.read()
+    return os.path.basename(input_file.name), input_file.read()
+def _load_page_images(object_name, data):
+    """Return the list of PIL images to analyse for the uploaded bytes."""
+    # Local import: only convert_from_path is imported at the top of the file.
+    from pdf2image import convert_from_bytes
+    if is_image_file(object_name) or object_name.lower().endswith(('.tif', '.tiff')):
+        # NOTE(review): for a multi-page TIFF this presumably yields only the
+        # first frame - confirm if multi-page TIFF support is required.
+        return [Image.open(BytesIO(data))]
+    # Anything else is treated as a PDF held in memory, so it must go through
+    # the bytes-based converter rather than the path-based one.
+    return convert_from_bytes(data)
+def process_file_and_generate_csv(input_file):
+    """Extract every table found in an uploaded document into a CSV file.
+
+    Args:
+        input_file: The upload, either as a filesystem path or as an object
+            exposing ``.name`` and ``.read()``.
+
+    Returns:
+        A 2-tuple ``(csv_path, csv_path)``: the path of the written CSV for
+        the file output, and the same path for the text output.
+        NOTE(review): returns a path instead of an in-memory buffer on the
+        assumption that the file output component expects a path - confirm
+        against the installed Gradio version.
+
+    Raises:
+        ValueError: If ``input_file`` is ``None``.
+    """
+    # Fixed name in the working directory: concurrent requests overwrite it.
+    output_csv_path = "output.csv"  # Output CSV file name
+    object_name, data = _read_upload(input_file)
+    images = _load_page_images(object_name, data)
+    # csv.writer emits text, so it needs a text-mode file; newline='' lets the
+    # csv module control line endings itself.
+    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
+        writer = csv.writer(csv_file)
+        for i, image in enumerate(images):
+            # Re-encode each page as JPEG and upload it to S3 for Textract.
+            image_byte_array = BytesIO()
+            # JPEG cannot store alpha or palette data, so normalise to RGB first.
+            image.convert('RGB').save(image_byte_array, format='JPEG')
+            image_byte_array.seek(0)
+            response = process_image(image_byte_array, s3_bucket, textract_client, f"{object_name}_{i}.jpg")
+            if response:
+                blocks = response['Blocks']
+                blocks_map = {block['Id']: block for block in blocks}
+                tables = [block for block in blocks if block['BlockType'] == "TABLE"]
+                generate_table_csv(tables, blocks_map, writer)
+    return output_csv_path, output_csv_path
+# Gradio interface: one file upload in; a downloadable file and a text value out.
+_upload_input = gr.File(label="Upload your file (PDF, PNG, JPG, TIFF)")
+_result_outputs = [gr.File(label="Download Generated CSV"), "text"]
+iface = gr.Interface(
+    fn=process_file_and_generate_csv,
+    inputs=_upload_input,
+    outputs=_result_outputs,
+    description="Upload a document to extract tables into a CSV file.",
+)
+# Start serving the interface.
+iface.launch()