Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,63 +1,53 @@
|
|
1 |
-
import boto3
|
2 |
-
import csv
|
3 |
import os
|
4 |
-
from
|
|
|
|
|
5 |
from pdf2image import convert_from_path
|
|
|
6 |
from PIL import Image
|
7 |
import gradio as gr
|
8 |
-
from io import BytesIO
|
9 |
-
from datasets.filesystems import S3FileSystem
|
10 |
-
import s3fs
|
11 |
-
|
12 |
|
13 |
-
# AWS
|
14 |
aws_access_key_id = os.getenv('AWS_ACCESS_KEY')
|
15 |
aws_secret_access_key = os.getenv('AWS_SECRET_KEY')
|
16 |
region_name = os.getenv('AWS_REGION')
|
17 |
s3_bucket = os.getenv('AWS_BUCKET')
|
18 |
|
19 |
-
|
20 |
-
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
def upload_file_to_s3(file_content, bucket, object_name=None):
|
25 |
-
"""Uploads file to S3 using s3fs."""
|
26 |
if object_name is None:
|
27 |
-
|
28 |
-
|
29 |
try:
|
30 |
-
with
|
31 |
-
|
32 |
return object_name
|
33 |
except FileNotFoundError:
|
34 |
print("The file was not found")
|
35 |
return None
|
36 |
-
except Exception as e: # Catch broader exceptions that may arise from permissions or AWS issues
|
37 |
-
print(f"An error occurred: {e}")
|
38 |
-
return None
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
s3_object_key = upload_file_to_s3(file_content, s3_bucket, object_name)
|
43 |
if not s3_object_key:
|
44 |
return None
|
45 |
|
46 |
-
# Call Textract
|
47 |
response = textract_client.analyze_document(
|
48 |
Document={'S3Object': {'Bucket': s3_bucket, 'Name': s3_object_key}},
|
49 |
FeatureTypes=["TABLES"]
|
50 |
)
|
51 |
return response
|
52 |
|
53 |
-
def generate_table_csv(tables, blocks_map,
|
54 |
-
|
55 |
-
|
56 |
-
for
|
57 |
-
|
58 |
-
for
|
59 |
-
row
|
60 |
-
|
|
|
|
|
61 |
|
62 |
def get_rows_columns_map(table_result, blocks_map):
|
63 |
rows = {}
|
@@ -87,38 +77,30 @@ def get_text(result, blocks_map):
|
|
87 |
text += 'X '
|
88 |
return text.strip()
|
89 |
|
90 |
-
def
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
def process_file_and_generate_csv(input_file):
|
95 |
-
# Initialize BytesIO object based on the type of input_file
|
96 |
-
if hasattr(input_file, "read"): # If input_file is file-like
|
97 |
-
file_content = BytesIO(input_file.read())
|
98 |
-
elif isinstance(input_file, str): # If input_file is a string path (unlikely in Gradio but included for completeness)
|
99 |
-
with open(input_file, "rb") as f:
|
100 |
-
file_content = BytesIO(f.read())
|
101 |
-
else: # If input_file is neither (e.g., NamedString), it might directly hold the content
|
102 |
-
file_content = BytesIO(input_file)
|
103 |
|
104 |
-
|
105 |
-
|
|
|
|
|
|
|
|
|
106 |
|
107 |
for i, image in enumerate(images):
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
response = process_image(image_byte_array, s3_bucket, textract_client, f"{object_name}_{i}.jpg")
|
114 |
if response:
|
115 |
blocks = response['Blocks']
|
116 |
blocks_map = {block['Id']: block for block in blocks}
|
117 |
tables = [block for block in blocks if block['BlockType'] == "TABLE"]
|
118 |
-
generate_table_csv(tables, blocks_map,
|
119 |
-
|
120 |
-
|
121 |
-
return
|
122 |
|
123 |
# Gradio Interface
|
124 |
iface = gr.Interface(
|
@@ -128,5 +110,5 @@ iface = gr.Interface(
|
|
128 |
description="Upload a document to extract tables into a CSV file."
|
129 |
)
|
130 |
|
131 |
-
|
132 |
-
iface.launch()
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
from datasets.filesystems import S3FileSystem
|
3 |
+
import s3fs
|
4 |
+
import boto3
|
5 |
from pdf2image import convert_from_path
|
6 |
+
import csv
|
7 |
from PIL import Image
|
8 |
import gradio as gr
|
|
|
|
|
|
|
|
|
9 |
|
10 |
+
# AWS and S3 Initialization with environment variables
aws_access_key_id = os.getenv('AWS_ACCESS_KEY')
aws_secret_access_key = os.getenv('AWS_SECRET_KEY')
region_name = os.getenv('AWS_REGION')
s3_bucket = os.getenv('AWS_BUCKET')

# NOTE(review): this rebinds the name `s3fs`, shadowing the `import s3fs`
# module above. upload_file_to_s3() relies on this global instance, so the
# name cannot be changed in isolation — consider renaming (e.g. `s3_fs`)
# together with its users.
s3fs = S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key, region=region_name)
# NOTE(review): the Textract client is NOT given the explicit keys read
# above; it falls back to boto3's default credential chain — confirm the
# runtime environment provides matching credentials.
textract_client = boto3.client('textract', region_name=region_name)
|
18 |
|
19 |
+
def upload_file_to_s3(file_path, bucket, object_name=None):
    """Upload a local file to S3 via the module-level ``s3fs`` filesystem.

    Args:
        file_path: Path of the local file to upload.
        bucket: Destination S3 bucket name.
        object_name: Key to store the object under; defaults to the
            file's basename.

    Returns:
        The S3 object key on success, or ``None`` if the local file
        does not exist.
    """
    if object_name is None:
        object_name = os.path.basename(file_path)
    try:
        # s3fs.put reads directly from the source path; the original also
        # opened the file first but never used the handle — put() itself
        # raises FileNotFoundError for a missing local path.
        s3fs.put(file_path, f"s3://{bucket}/{object_name}")
        return object_name
    except FileNotFoundError:
        print("The file was not found")
        return None
|
|
|
|
|
|
|
29 |
|
30 |
+
def process_image(file_path, s3_bucket, textract_client):
    """Upload an image to S3 and run Textract table analysis on it.

    Args:
        file_path: Local path of the image to analyze.
        s3_bucket: Bucket the image is uploaded to for Textract's use.
        textract_client: A boto3 Textract client.

    Returns:
        The raw ``analyze_document`` response, or ``None`` when the
        upload failed.
    """
    object_key = upload_file_to_s3(file_path, s3_bucket)
    if not object_key:
        return None

    # Textract reads the document straight from S3.
    document = {'S3Object': {'Bucket': s3_bucket, 'Name': object_key}}
    return textract_client.analyze_document(
        Document=document,
        FeatureTypes=["TABLES"],
    )
|
40 |
|
41 |
+
def generate_table_csv(tables, blocks_map, csv_output_path, mode='w'):
    """Write every Textract TABLE block to a CSV file.

    Args:
        tables: TABLE blocks from a Textract ``analyze_document`` response.
        blocks_map: Mapping of block Id -> block for the same response.
        csv_output_path: Path of the CSV file to write.
        mode: File mode; defaults to 'w' (truncates an existing file).
            Pass 'a' to append, e.g. when writing one page at a time.
    """
    with open(csv_output_path, mode, newline='') as csvfile:
        writer = csv.writer(csvfile)
        for table in tables:
            rows = get_rows_columns_map(table, blocks_map)
            for row_index, cols in rows.items():
                if not cols:
                    # Guard: max() on an empty dict raises ValueError.
                    writer.writerow([])
                    continue
                # Column indices from Textract are 1-based; fill gaps
                # with empty strings so cells stay aligned.
                row = [cols.get(col_index, "")
                       for col_index in range(1, max(cols) + 1)]
                writer.writerow(row)
|
51 |
|
52 |
def get_rows_columns_map(table_result, blocks_map):
|
53 |
rows = {}
|
|
|
77 |
text += 'X '
|
78 |
return text.strip()
|
79 |
|
80 |
+
def process_file_and_generate_csv(file):
    """Gradio handler: extract tables from an uploaded document into a CSV.

    Accepts an uploaded image (.png/.jpg/.jpeg) or, for any other
    extension, a PDF that is rasterized page by page. Every page is run
    through Textract and all detected tables are written to one CSV.

    Args:
        file: The Gradio upload object; its payload is on disk at ``.name``.

    Returns:
        Tuple of (csv output path, status message).
    """
    # The upload is already on disk at file.name. The original also called
    # file.save(file_path), which Gradio's tempfile-like wrapper does not
    # provide and which raised AttributeError at runtime.
    file_path = file.name

    csv_output_path = "/tmp/output.csv"

    if file_path.lower().endswith(('.png', '.jpg', '.jpeg')):
        images = [Image.open(file_path)]
    else:
        images = convert_from_path(file_path)

    # Aggregate blocks across all pages and write the CSV once at the end:
    # calling generate_table_csv per page re-opened the file in 'w' mode
    # and truncated the previous pages' tables.
    all_tables = []
    all_blocks_map = {}
    for i, image in enumerate(images):
        image_path = f"/tmp/image_{i}.jpg"
        image.save(image_path, 'JPEG')

        response = process_image(image_path, s3_bucket, textract_client)
        if response:
            blocks = response['Blocks']
            all_blocks_map.update({block['Id']: block for block in blocks})
            all_tables.extend(
                block for block in blocks if block['BlockType'] == "TABLE"
            )
        os.remove(image_path)  # clean up the per-page temp image

    # Always write the file (possibly empty) so the returned path exists
    # even when Textract found no tables or no page produced a response.
    generate_table_csv(all_tables, all_blocks_map, csv_output_path)

    os.remove(file_path)  # Clean up uploaded file
    return csv_output_path, "Processing completed successfully!"
|
104 |
|
105 |
# Gradio Interface
|
106 |
iface = gr.Interface(
|
|
|
110 |
description="Upload a document to extract tables into a CSV file."
|
111 |
)
|
112 |
|
113 |
+
# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()