danial0203 commited on
Commit
dd7dd6e
·
verified ·
1 Parent(s): 382d4fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -60
app.py CHANGED
@@ -1,63 +1,53 @@
1
- import boto3
2
- import csv
3
  import os
4
- from botocore.exceptions import NoCredentialsError
 
 
5
  from pdf2image import convert_from_path
 
6
  from PIL import Image
7
  import gradio as gr
8
- from io import BytesIO
9
- from datasets.filesystems import S3FileSystem
10
- import s3fs
11
-
12
 
13
- # AWS Setup
14
  aws_access_key_id = os.getenv('AWS_ACCESS_KEY')
15
  aws_secret_access_key = os.getenv('AWS_SECRET_KEY')
16
  region_name = os.getenv('AWS_REGION')
17
  s3_bucket = os.getenv('AWS_BUCKET')
18
 
19
- # Initialize s3fs with your AWS credentials
20
- s3_fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key, client_kwargs={'region_name': region_name})
21
 
22
- textract_client = boto3.client('textract', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, region_name=region_name)
23
-
24
def upload_file_to_s3(file_content, bucket, object_name=None):
    """Stream a file-like object's bytes to s3://bucket/object_name via s3fs.

    Args:
        file_content: File-like object whose full contents are uploaded.
        bucket: Target S3 bucket name.
        object_name: Required key for the uploaded object.

    Returns:
        The object key on success, or None on any upload failure.

    Raises:
        ValueError: If object_name is None.
    """
    if object_name is None:
        raise ValueError("object_name cannot be None")

    target = f's3://{bucket}/{object_name}'
    try:
        with s3_fs.open(target, 'wb') as remote:
            remote.write(file_content.read())
        return object_name
    except FileNotFoundError:
        print("The file was not found")
        return None
    except Exception as e:  # permissions / AWS-side failures surface here
        print(f"An error occurred: {e}")
        return None
39
 
40
-
41
def process_image(file_content, s3_bucket, textract_client, object_name):
    """Upload in-memory image bytes to S3, then run Textract TABLES analysis.

    Returns the raw Textract response dict, or None when the upload failed.
    """
    key = upload_file_to_s3(file_content, s3_bucket, object_name)
    if not key:
        return None

    return textract_client.analyze_document(
        Document={'S3Object': {'Bucket': s3_bucket, 'Name': key}},
        FeatureTypes=["TABLES"],
    )
52
 
53
def generate_table_csv(tables, blocks_map, writer):
    """Write every table's cells to the given csv.writer, one CSV row per table row.

    Missing cells (gaps in the column map) are emitted as empty strings so
    columns stay aligned.
    """
    for table in tables:
        row_map = get_rows_columns_map(table, blocks_map)
        for _, columns in row_map.items():
            width = max(columns.keys())
            writer.writerow([columns.get(c, "") for c in range(1, width + 1)])
 
 
61
 
62
  def get_rows_columns_map(table_result, blocks_map):
63
  rows = {}
@@ -87,38 +77,30 @@ def get_text(result, blocks_map):
87
  text += 'X '
88
  return text.strip()
89
 
90
def is_image_file(filename):
    """Return True if *filename* ends with png/jpg/jpeg (case-insensitive)."""
    # str.endswith accepts a tuple of suffixes, replacing the any() loop.
    return filename.lower().endswith(('png', 'jpg', 'jpeg'))
93
-
94
- def process_file_and_generate_csv(input_file):
95
- # Initialize BytesIO object based on the type of input_file
96
- if hasattr(input_file, "read"): # If input_file is file-like
97
- file_content = BytesIO(input_file.read())
98
- elif isinstance(input_file, str): # If input_file is a string path (unlikely in Gradio but included for completeness)
99
- with open(input_file, "rb") as f:
100
- file_content = BytesIO(f.read())
101
- else: # If input_file is neither (e.g., NamedString), it might directly hold the content
102
- file_content = BytesIO(input_file)
103
 
104
- csv_output = BytesIO()
105
- writer = csv.writer(csv_output)
 
 
 
 
106
 
107
  for i, image in enumerate(images):
108
- # Process each image and upload to S3 for Textract processing
109
- image_byte_array = BytesIO()
110
- image.save(image_byte_array, format='JPEG')
111
- image_byte_array.seek(0)
112
-
113
- response = process_image(image_byte_array, s3_bucket, textract_client, f"{object_name}_{i}.jpg")
114
  if response:
115
  blocks = response['Blocks']
116
  blocks_map = {block['Id']: block for block in blocks}
117
  tables = [block for block in blocks if block['BlockType'] == "TABLE"]
118
- generate_table_csv(tables, blocks_map, writer)
119
-
120
- csv_output.seek(0) # Go to the start of the CSV in-memory file
121
- return csv_output, output_csv_path
122
 
123
  # Gradio Interface
124
  iface = gr.Interface(
@@ -128,5 +110,5 @@ iface = gr.Interface(
128
  description="Upload a document to extract tables into a CSV file."
129
  )
130
 
131
- # Launch the interface
132
- iface.launch()
 
 
 
1
  import os
2
+ from datasets.filesystems import S3FileSystem
3
+ import s3fs
4
+ import boto3
5
  from pdf2image import convert_from_path
6
+ import csv
7
  from PIL import Image
8
  import gradio as gr
 
 
 
 
9
 
10
# AWS and S3 initialization from environment variables.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY')
aws_secret_access_key = os.getenv('AWS_SECRET_KEY')
region_name = os.getenv('AWS_REGION')
s3_bucket = os.getenv('AWS_BUCKET')

# NOTE(review): this rebinds the imported `s3fs` module name to a filesystem
# instance; kept as-is because the helpers below call `s3fs.put(...)`.
s3fs = S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key, region=region_name)

# Pass the credentials explicitly: the env vars above use non-standard names
# (AWS_ACCESS_KEY, not AWS_ACCESS_KEY_ID), so boto3's default credential
# chain would not pick them up on its own.
textract_client = boto3.client(
    'textract',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=region_name,
)
18
 
19
def upload_file_to_s3(file_path, bucket, object_name=None):
    """Upload a local file to S3 via the module-level s3fs filesystem.

    Args:
        file_path: Path to the local file to upload.
        bucket: Target S3 bucket name.
        object_name: Key for the uploaded object; defaults to the file's
            basename.

    Returns:
        The object key on success, or None if the local file is missing.
    """
    if object_name is None:
        object_name = os.path.basename(file_path)
    try:
        # s3fs.put copies a local path to the remote URL directly; the
        # original also opened the file here but never used the handle,
        # so that dead `with open(...)` wrapper is removed.
        s3fs.put(file_path, f"s3://{bucket}/{object_name}")
        return object_name
    except FileNotFoundError:
        print("The file was not found")
        return None
 
 
 
29
 
30
def process_image(file_path, s3_bucket, textract_client):
    """Upload a local image to S3 and run Textract table analysis on it.

    Returns the raw Textract response dict, or None when the upload failed.
    """
    object_key = upload_file_to_s3(file_path, s3_bucket)
    if not object_key:
        return None

    document = {'S3Object': {'Bucket': s3_bucket, 'Name': object_key}}
    return textract_client.analyze_document(
        Document=document,
        FeatureTypes=["TABLES"],
    )
40
 
41
def generate_table_csv(tables, blocks_map, csv_output_path):
    """Write every table's cells to csv_output_path as CSV rows.

    NOTE(review): the file is opened in 'w' mode, so each call replaces
    the file's previous contents.
    """
    with open(csv_output_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for table in tables:
            row_map = get_rows_columns_map(table, blocks_map)
            for _, columns in row_map.items():
                width = max(columns.keys())
                writer.writerow([columns.get(c, "") for c in range(1, width + 1)])
51
 
52
  def get_rows_columns_map(table_result, blocks_map):
53
  rows = {}
 
77
  text += 'X '
78
  return text.strip()
79
 
80
def process_file_and_generate_csv(file):
    """Extract Textract tables from an uploaded document into one CSV.

    Args:
        file: Gradio upload object exposing the temp file path as ``.name``.

    Returns:
        A ``(csv_path, status_message)`` tuple for the Gradio outputs.
    """
    file_path = file.name
    # Gradio already persists the upload at file.name; only some file
    # wrappers expose .save(), so call it defensively instead of crashing
    # with AttributeError on objects that lack it — TODO confirm against
    # the installed Gradio version.
    if hasattr(file, "save"):
        file.save(file_path)

    csv_output_path = "/tmp/output.csv"

    # Images are processed directly; anything else is treated as a PDF.
    if file_path.lower().endswith(('.png', '.jpg', '.jpeg')):
        images = [Image.open(file_path)]
    else:
        images = convert_from_path(file_path)

    # Accumulate tables across all pages. The original called
    # generate_table_csv once per page, and that helper opens the CSV in
    # 'w' mode — so every page overwrote the previous page's tables.
    # Textract block IDs are unique per response (presumably UUIDs), so
    # merging the per-page block maps is safe.
    all_tables = []
    merged_blocks_map = {}
    for i, image in enumerate(images):
        image_path = f"/tmp/image_{i}.jpg"
        image.save(image_path, 'JPEG')

        response = process_image(image_path, s3_bucket, textract_client)
        if response:
            blocks = response['Blocks']
            merged_blocks_map.update({block['Id']: block for block in blocks})
            all_tables.extend(
                block for block in blocks if block['BlockType'] == "TABLE"
            )
        os.remove(image_path)  # Clean up the per-page temp image

    # Single write also guarantees the output file exists even when no
    # tables were detected.
    generate_table_csv(all_tables, merged_blocks_map, csv_output_path)

    os.remove(file_path)  # Clean up uploaded file
    return csv_output_path, "Processing completed successfully!"
104
 
105
  # Gradio Interface
106
  iface = gr.Interface(
 
110
  description="Upload a document to extract tables into a CSV file."
111
  )
112
 
113
def _main():
    """Start the Gradio server for this app."""
    iface.launch()


# Launch only when executed as a script, not on import.
if __name__ == "__main__":
    _main()