# Source captured from a "Spaces" file-viewer page.
# Status shown on the page at capture time: Runtime error
# File size: 4,618 Bytes
# Commit hashes listed on the page: 41f37dc, 8cd0fcc, e003f08
import csv
import os
import tempfile
from io import BytesIO

import boto3
import gradio as gr
from botocore.exceptions import NoCredentialsError
from pdf2image import convert_from_path
from PIL import Image
# AWS setup: credentials, region and bucket name are read from the environment.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY')
aws_secret_access_key = os.environ.get('AWS_SECRET_KEY')
region_name = os.environ.get('AWS_REGION')
s3_bucket = os.environ.get('AWS_BUCKET')

# Both service clients are created from the same credential/region settings.
_aws_client_options = {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
    'region_name': region_name,
}
textract_client = boto3.client('textract', **_aws_client_options)
s3_client = boto3.client('s3', **_aws_client_options)
def upload_file_to_s3(file_content, bucket, object_name=None):
    """Upload a file to S3 and return the object key it was stored under.

    Args:
        file_content: Either a binary file-like object (anything
            ``upload_fileobj`` can read from) or a filesystem path to the
            file to upload.
        bucket: Name of the destination S3 bucket.
        object_name: Key to store the object under. When omitted, the base
            name of the path (or of the file object's ``name`` attribute)
            is used.

    Returns:
        The object key on success, or ``None`` when the file does not exist
        or AWS credentials are unavailable.

    Raises:
        ValueError: If ``object_name`` is omitted and no file name can be
            derived from ``file_content`` (for example an in-memory
            ``BytesIO``).
    """
    is_path = isinstance(file_content, (str, os.PathLike))

    if object_name is None:
        # A file object only yields a usable key if it carries a path-like
        # ``name``; objects opened from a descriptor expose an int instead.
        source_name = file_content if is_path else getattr(file_content, "name", None)
        if not isinstance(source_name, (str, os.PathLike)):
            raise ValueError(
                "object_name is required when file_content has no file name"
            )
        object_name = os.path.basename(source_name)

    try:
        if is_path:
            # upload_fileobj needs a readable binary stream, so open the path.
            with open(file_content, "rb") as stream:
                s3_client.upload_fileobj(stream, bucket, object_name)
        else:
            s3_client.upload_fileobj(file_content, bucket, object_name)
        return object_name
    except FileNotFoundError:
        print("The file was not found")
        return None
    except NoCredentialsError:
        print("Credentials not available")
        return None
def process_image(file_content, s3_bucket, textract_client, object_name):
    """Stage an image in S3 and run Textract table analysis on it.

    Args:
        file_content: Image data handed to ``upload_file_to_s3``.
        s3_bucket: Bucket the image is uploaded to and analysed from.
        textract_client: Client whose ``analyze_document`` is called.
        object_name: S3 key to upload the image under.

    Returns:
        The ``analyze_document`` response, or ``None`` when the upload did
        not produce an object key.
    """
    uploaded_key = upload_file_to_s3(file_content, s3_bucket, object_name)
    if uploaded_key:
        # Point Textract at the object that was just uploaded.
        document = {'S3Object': {'Bucket': s3_bucket, 'Name': uploaded_key}}
        return textract_client.analyze_document(
            Document=document,
            FeatureTypes=["TABLES"],
        )
    return None
def generate_table_csv(tables, blocks_map, writer):
    """Write every table's cells to *writer*, one CSV row per table row.

    Args:
        tables: Iterable of table blocks to export.
        blocks_map: Mapping of block id to block, used to resolve cells.
        writer: Object with a ``writerow`` method (e.g. ``csv.writer``).

    Rows are written in ascending row-index order. Columns run from 1 up to
    the highest column index present in that row; missing cells become
    empty strings.
    """
    for table in tables:
        rows = get_rows_columns_map(table, blocks_map)
        # Sort explicitly: the map is filled in whatever order the table's
        # child ids are listed, which need not be top-to-bottom.
        for _row_index, cols in sorted(rows.items()):
            last_col = max(cols)
            writer.writerow(
                [cols.get(col_index, "") for col_index in range(1, last_col + 1)]
            )
def get_rows_columns_map(table_result, blocks_map):
    """Build a ``{row_index: {column_index: cell_text}}`` map for one table.

    Args:
        table_result: Table block; its ``CHILD`` relationships list the ids
            of the blocks belonging to the table.
        blocks_map: Mapping of block id to block.

    Returns:
        A nested dict keyed by row index, then column index. Children
        without both ``RowIndex`` and ``ColumnIndex`` are ignored. A table
        block with no ``Relationships`` yields an empty dict.
    """
    rows = {}
    # The 'Relationships' key can be absent (get_text guards for the same
    # case), so treat a missing or empty value as "no cells".
    for relationship in table_result.get('Relationships') or []:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            if 'RowIndex' in cell and 'ColumnIndex' in cell:
                row = rows.setdefault(cell['RowIndex'], {})
                row[cell['ColumnIndex']] = get_text(cell, blocks_map)
    return rows
def get_text(result, blocks_map):
    """Return the text of a block assembled from its ``CHILD`` blocks.

    ``WORD`` children contribute their ``Text``; ``SELECTION_ELEMENT``
    children contribute ``X`` when their status is ``SELECTED``. Pieces are
    separated by single spaces and the result is stripped. A block without
    ``Relationships`` yields an empty string.
    """
    if 'Relationships' not in result:
        return ''

    fragments = []
    for relationship in result['Relationships']:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_map[child_id]
            kind = child['BlockType']
            if kind == 'WORD':
                fragments.append(child['Text'])
            elif kind == 'SELECTION_ELEMENT' and child['SelectionStatus'] == 'SELECTED':
                fragments.append('X')
    return ' '.join(fragments).strip()
def is_image_file(filename):
    """Return True when *filename* has a recognised image extension.

    The check is case-insensitive and looks at the real extension (the part
    after the last dot of the final path component), so a name that merely
    ends in the letters ``png`` or ``jpg`` is not treated as an image.
    Recognised extensions: ``.png``, ``.jpg``, ``.jpeg``, ``.tif``, ``.tiff``.
    """
    image_file_extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')
    extension = os.path.splitext(filename)[1].lower()
    return extension in image_file_extensions
def process_file_and_generate_csv(input_file):
    """Extract all tables from an uploaded document into a CSV file.

    Image uploads are analysed as a single page. Any other upload is
    treated as a PDF and converted to one image per page. Each page is
    re-encoded as JPEG, sent through ``process_image``, and the tables found
    are appended to one CSV file.

    Args:
        input_file: The uploaded file, given either as a filesystem path or
            as a file-like object exposing that path via ``.name``.

    Returns:
        A ``(csv_path, csv_path)`` tuple: the path of the generated CSV for
        the file output, and the same path as the text output.
    """
    # NOTE(review): the upload component may hand over a path string or a
    # temp-file wrapper depending on its configuration; accept both.
    if isinstance(input_file, (str, os.PathLike)):
        source_path = os.fspath(input_file)
    else:
        source_path = input_file.name

    if is_image_file(source_path):
        images = [Image.open(source_path)]
    else:
        # Anything that is not a recognised image is rasterised as a PDF,
        # using the converter this module already imports.
        images = convert_from_path(source_path)

    # S3 keys for the page images are derived from the upload's file name.
    key_prefix = os.path.splitext(os.path.basename(source_path))[0]

    # csv.writer needs a text stream; newline="" lets it control line endings.
    # The file is kept (delete=False) so it can be served for download.
    with tempfile.NamedTemporaryFile(
        mode="w", newline="", encoding="utf-8", suffix=".csv", delete=False
    ) as csv_file:
        writer = csv.writer(csv_file)
        for page_number, image in enumerate(images):
            # JPEG cannot store alpha or palette images; normalise first.
            if image.mode not in ("RGB", "L"):
                image = image.convert("RGB")
            jpeg_buffer = BytesIO()
            image.save(jpeg_buffer, format='JPEG')
            jpeg_buffer.seek(0)

            response = process_image(
                jpeg_buffer,
                s3_bucket,
                textract_client,
                f"{key_prefix}_{page_number}.jpg",
            )
            if not response:
                continue

            blocks = response['Blocks']
            blocks_map = {block['Id']: block for block in blocks}
            tables = [block for block in blocks if block['BlockType'] == "TABLE"]
            generate_table_csv(tables, blocks_map, writer)
        output_csv_path = csv_file.name

    return output_csv_path, output_csv_path
# Gradio interface: one file upload in, a downloadable CSV plus a text field out.
_upload_component = gr.File(label="Upload your file (PDF, PNG, JPG, TIFF)")
_result_components = [gr.File(label="Download Generated CSV"), "text"]
iface = gr.Interface(
    fn=process_file_and_generate_csv,
    inputs=_upload_component,
    outputs=_result_components,
    description="Upload a document to extract tables into a CSV file.",
)

# Start serving the interface.
iface.launch()
|