danial0203 committed on
Commit
41f37dc
·
verified ·
1 Parent(s): 116beb9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -0
app.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import boto3
2
+ import csv
3
+ import os
4
+ from botocore.exceptions import NoCredentialsError
5
+ from pdf2image import convert_from_path
6
+ from PIL import Image
7
+ import gradio as gr
8
+ from io import BytesIO
9
+
10
# AWS Setup
# Credentials, region and bucket are read from environment variables.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY')
aws_secret_access_key = os.getenv('AWS_SECRET_KEY')
region_name = os.getenv('AWS_REGION')
s3_bucket = os.getenv('AWS_BUCKET')

# Both clients are created with the same credentials and region.
_aws_client_options = dict(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=region_name,
)
textract_client = boto3.client('textract', **_aws_client_options)
s3_client = boto3.client('s3', **_aws_client_options)
18
+
19
def upload_file_to_s3(file_content, bucket, object_name=None):
    """Upload a file to S3 and return the object key it was stored under.

    Args:
        file_content: Either a binary file-like object or a filesystem path
            (``str`` / ``os.PathLike``) of the file to upload.
        bucket: Name of the destination S3 bucket.
        object_name: Key to store the object under. When omitted, the base
            name of the path (or of the file object's ``name`` attribute)
            is used.

    Returns:
        The object key on success, or ``None`` when no key can be derived,
        the file does not exist, or AWS credentials are unavailable.
    """
    is_path = isinstance(file_content, (str, os.PathLike))

    if object_name is None:
        # A file-like object is not a path, so basename() cannot be applied
        # to it directly; use its ``name`` attribute when it has a usable one
        # (in-memory buffers such as BytesIO do not).
        source_name = file_content if is_path else getattr(file_content, 'name', None)
        if not isinstance(source_name, (str, os.PathLike)):
            print("Object name could not be determined")
            return None
        object_name = os.path.basename(os.fspath(source_name))

    try:
        if is_path:
            # upload_fileobj needs a readable binary stream, so open the path.
            with open(file_content, 'rb') as stream:
                s3_client.upload_fileobj(stream, bucket, object_name)
        else:
            s3_client.upload_fileobj(file_content, bucket, object_name)
    except FileNotFoundError:
        print("The file was not found")
        return None
    except NoCredentialsError:
        print("Credentials not available")
        return None
    return object_name
31
+
32
def process_image(file_content, s3_bucket, textract_client, object_name):
    """Upload one image to S3 and run Textract table analysis on it.

    Args:
        file_content: Image data handed to ``upload_file_to_s3``.
        s3_bucket: Bucket the image is uploaded to and read back from.
        textract_client: Client object whose ``analyze_document`` is called.
        object_name: S3 key to store the image under.

    Returns:
        The ``analyze_document`` response, or ``None`` if the upload did not
        yield an object key.
    """
    uploaded_key = upload_file_to_s3(file_content, s3_bucket, object_name)
    if uploaded_key:
        # Call Textract on the object that was just uploaded
        document_location = {'S3Object': {'Bucket': s3_bucket, 'Name': uploaded_key}}
        return textract_client.analyze_document(
            Document=document_location,
            FeatureTypes=["TABLES"],
        )
    return None
43
+
44
def generate_table_csv(tables, blocks_map, writer):
    """Write every row of every table to *writer*.

    Args:
        tables: Iterable of table blocks to export.
        blocks_map: Mapping of block id to block, used to resolve cells.
        writer: Object with a ``writerow`` method (e.g. a ``csv.writer``).

    Rows are written in ascending row-index order. Within a row, columns
    run from 1 to the highest column index present in that row; missing
    cells are written as empty strings.
    """
    for table in tables:
        rows = get_rows_columns_map(table, blocks_map)
        # Sort explicitly: the mapping's insertion order follows the order
        # of cell ids in the response, which need not be row order.
        for row_index in sorted(rows):
            cols = rows[row_index]
            last_col = max(cols)
            writer.writerow(
                [cols.get(col_index, "") for col_index in range(1, last_col + 1)]
            )
52
+
53
def get_rows_columns_map(table_result, blocks_map):
    """Build a ``{row_index: {column_index: text}}`` mapping for one table.

    Args:
        table_result: Table block; its ``CHILD`` relationships list the ids
            of the blocks belonging to the table.
        blocks_map: Mapping of block id to block.

    Returns:
        A dict keyed by row index whose values are dicts keyed by column
        index holding the cell text. Empty if the table has no
        relationships or no children carrying row/column indices.
    """
    rows = {}
    # Tolerate a block without a 'Relationships' key, as get_text does,
    # instead of raising KeyError.
    for relationship in table_result.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            # Only children that carry both indices are table cells.
            if 'RowIndex' not in cell or 'ColumnIndex' not in cell:
                continue
            cell_text = get_text(cell, blocks_map)
            rows.setdefault(cell['RowIndex'], {})[cell['ColumnIndex']] = cell_text
    return rows
66
+
67
def get_text(result, blocks_map):
    """Return the text of a block, assembled from its ``CHILD`` blocks.

    ``WORD`` children contribute their ``Text``; ``SELECTION_ELEMENT``
    children contribute ``X`` when their status is ``SELECTED``. Pieces are
    separated by single spaces and the result is stripped. A block without
    relationships yields an empty string.
    """
    pieces = []
    for relationship in result.get('Relationships', ()):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_map[child_id]
            block_type = child['BlockType']
            if block_type == 'WORD':
                pieces.append(child['Text'])
            if block_type == 'SELECTION_ELEMENT' and child['SelectionStatus'] == 'SELECTED':
                pieces.append('X')
    return ' '.join(pieces).strip()
80
+
81
def is_image_file(filename):
    """Return True if *filename* has a PNG or JPEG extension.

    The comparison is case-insensitive. The dot is part of the match, so a
    name that merely ends in the letters (e.g. ``notapng``) is not treated
    as an image.
    """
    image_file_extensions = ('.png', '.jpg', '.jpeg')
    return filename.lower().endswith(image_file_extensions)
84
+
85
def process_file_and_generate_csv(input_file):
    """Extract the tables of an uploaded document into a CSV file.

    Args:
        input_file: The uploaded file, given either as a filesystem path
            string or as an object whose ``name`` attribute is that path.

    Returns:
        A ``(csv_path, csv_path)`` tuple: the path of the written CSV file
        for the file-download output, and the same path for the text output.
    """
    output_csv_path = "output.csv"  # Output CSV file name

    source_path = input_file if isinstance(input_file, str) else input_file.name
    object_name = os.path.basename(source_path)

    # Check if the uploaded file is an image or needs conversion
    if is_image_file(object_name):
        images = [Image.open(source_path)]
    else:
        # convert_from_path requires a path on disk, not an in-memory buffer.
        images = convert_from_path(source_path)

    # csv.writer emits str, so it needs a text-mode file; newline="" lets the
    # csv module control line endings itself.
    with open(output_csv_path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)

        for i, image in enumerate(images):
            # JPEG cannot store alpha or palette modes; normalise first.
            if image.mode not in ("RGB", "L"):
                image = image.convert("RGB")

            # Process each image and upload to S3 for Textract processing
            image_byte_array = BytesIO()
            image.save(image_byte_array, format='JPEG')
            image_byte_array.seek(0)

            response = process_image(
                image_byte_array, s3_bucket, textract_client, f"{object_name}_{i}.jpg"
            )
            if not response:
                continue  # Upload failed for this page; skip it.

            blocks = response['Blocks']
            blocks_map = {block['Id']: block for block in blocks}
            tables = [block for block in blocks if block['BlockType'] == "TABLE"]
            generate_table_csv(tables, blocks_map, writer)

    return output_csv_path, output_csv_path
119
+
120
# Gradio Interface
# One file upload in; a downloadable file plus a text value out.
_upload_input = gr.File(label="Upload your file (PDF, PNG, JPG, TIFF)")
_result_outputs = [gr.File(label="Download Generated CSV"), "text"]

iface = gr.Interface(
    fn=process_file_and_generate_csv,
    inputs=_upload_input,
    outputs=_result_outputs,
    description="Upload a document to extract tables into a CSV file.",
)

# Launch the interface
iface.launch()