danial0203 commited on
Commit
dd7dd6e
·
verified ·
1 Parent(s): 382d4fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -60
app.py CHANGED
@@ -1,63 +1,53 @@
1
- import boto3
2
- import csv
3
  import os
4
- from botocore.exceptions import NoCredentialsError
 
 
5
  from pdf2image import convert_from_path
 
6
  from PIL import Image
7
  import gradio as gr
8
- from io import BytesIO
9
- from datasets.filesystems import S3FileSystem
10
- import s3fs
11
-
12
 
13
- # AWS Setup
14
  aws_access_key_id = os.getenv('AWS_ACCESS_KEY')
15
  aws_secret_access_key = os.getenv('AWS_SECRET_KEY')
16
  region_name = os.getenv('AWS_REGION')
17
  s3_bucket = os.getenv('AWS_BUCKET')
18
 
19
- # Initialize s3fs with your AWS credentials
20
- s3_fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key, client_kwargs={'region_name': region_name})
21
 
22
- textract_client = boto3.client('textract', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, region_name=region_name)
23
-
24
def upload_file_to_s3(file_content, bucket, object_name=None):
    """Stream a file-like object's bytes to s3://bucket/object_name via s3fs.

    Args:
        file_content: File-like object whose full contents are uploaded.
        bucket: Target S3 bucket name.
        object_name: Required key for the uploaded object.

    Returns:
        The object key on success, or None on any upload failure.

    Raises:
        ValueError: If object_name is None.
    """
    if object_name is None:
        raise ValueError("object_name cannot be None")

    target = f's3://{bucket}/{object_name}'
    try:
        with s3_fs.open(target, 'wb') as remote:
            remote.write(file_content.read())
        return object_name
    except FileNotFoundError:
        print("The file was not found")
        return None
    except Exception as e:  # permissions / AWS-side failures surface here
        print(f"An error occurred: {e}")
        return None
39
 
40
-
41
def process_image(file_content, s3_bucket, textract_client, object_name):
    """Upload in-memory image bytes to S3, then run Textract TABLES analysis.

    Returns the raw Textract response dict, or None when the upload failed.
    """
    key = upload_file_to_s3(file_content, s3_bucket, object_name)
    if not key:
        return None

    return textract_client.analyze_document(
        Document={'S3Object': {'Bucket': s3_bucket, 'Name': key}},
        FeatureTypes=["TABLES"],
    )
52
 
53
def generate_table_csv(tables, blocks_map, writer):
    """Write every table's cells to the given csv.writer, one CSV row per table row.

    Missing cells (gaps in the column map) are emitted as empty strings so
    columns stay aligned.
    """
    for table in tables:
        row_map = get_rows_columns_map(table, blocks_map)
        for _, columns in row_map.items():
            width = max(columns.keys())
            writer.writerow([columns.get(c, "") for c in range(1, width + 1)])
 
 
61
 
62
  def get_rows_columns_map(table_result, blocks_map):
63
  rows = {}
@@ -87,38 +77,30 @@ def get_text(result, blocks_map):
87
  text += 'X '
88
  return text.strip()
89
 
90
def is_image_file(filename):
    """Return True if *filename* ends with png/jpg/jpeg (case-insensitive)."""
    # str.endswith accepts a tuple of suffixes, replacing the any() loop.
    return filename.lower().endswith(('png', 'jpg', 'jpeg'))
93
-
94
- def process_file_and_generate_csv(input_file):
95
- # Initialize BytesIO object based on the type of input_file
96
- if hasattr(input_file, "read"): # If input_file is file-like
97
- file_content = BytesIO(input_file.read())
98
- elif isinstance(input_file, str): # If input_file is a string path (unlikely in Gradio but included for completeness)
99
- with open(input_file, "rb") as f:
100
- file_content = BytesIO(f.read())
101
- else: # If input_file is neither (e.g., NamedString), it might directly hold the content
102
- file_content = BytesIO(input_file)
103
 
104
- csv_output = BytesIO()
105
- writer = csv.writer(csv_output)
 
 
 
 
106
 
107
  for i, image in enumerate(images):
108
- # Process each image and upload to S3 for Textract processing
109
- image_byte_array = BytesIO()
110
- image.save(image_byte_array, format='JPEG')
111
- image_byte_array.seek(0)
112
-
113
- response = process_image(image_byte_array, s3_bucket, textract_client, f"{object_name}_{i}.jpg")
114
  if response:
115
  blocks = response['Blocks']
116
  blocks_map = {block['Id']: block for block in blocks}
117
  tables = [block for block in blocks if block['BlockType'] == "TABLE"]
118
- generate_table_csv(tables, blocks_map, writer)
119
-
120
- csv_output.seek(0) # Go to the start of the CSV in-memory file
121
- return csv_output, output_csv_path
122
 
123
  # Gradio Interface
124
  iface = gr.Interface(
@@ -128,5 +110,5 @@ iface = gr.Interface(
128
  description="Upload a document to extract tables into a CSV file."
129
  )
130
 
131
- # Launch the interface
132
- iface.launch()
 
 
 
1
  import os
2
+ from datasets.filesystems import S3FileSystem
3
+ import s3fs
4
+ import boto3
5
  from pdf2image import convert_from_path
6
+ import csv
7
  from PIL import Image
8
  import gradio as gr
 
 
 
 
9
 
10
# AWS and S3 initialization from environment variables.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY')
aws_secret_access_key = os.getenv('AWS_SECRET_KEY')
region_name = os.getenv('AWS_REGION')
s3_bucket = os.getenv('AWS_BUCKET')

# NOTE(review): this rebinds the imported `s3fs` module name to a filesystem
# instance; kept as-is because the helpers below call `s3fs.put(...)`.
s3fs = S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key, region=region_name)

# Pass the credentials explicitly: the env vars above use non-standard names
# (AWS_ACCESS_KEY, not AWS_ACCESS_KEY_ID), so boto3's default credential
# chain would not pick them up on its own.
textract_client = boto3.client(
    'textract',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=region_name,
)
18
 
19
def upload_file_to_s3(file_path, bucket, object_name=None):
    """Upload a local file to S3 via the module-level s3fs filesystem.

    Args:
        file_path: Path to the local file to upload.
        bucket: Target S3 bucket name.
        object_name: Key for the uploaded object; defaults to the file's
            basename.

    Returns:
        The object key on success, or None if the local file is missing.
    """
    if object_name is None:
        object_name = os.path.basename(file_path)
    try:
        # s3fs.put copies a local path to the remote URL directly; the
        # original also opened the file here but never used the handle,
        # so that dead `with open(...)` wrapper is removed.
        s3fs.put(file_path, f"s3://{bucket}/{object_name}")
        return object_name
    except FileNotFoundError:
        print("The file was not found")
        return None
 
 
 
29
 
30
def process_image(file_path, s3_bucket, textract_client):
    """Upload a local image to S3 and run Textract table analysis on it.

    Returns the raw Textract response dict, or None when the upload failed.
    """
    object_key = upload_file_to_s3(file_path, s3_bucket)
    if not object_key:
        return None

    document = {'S3Object': {'Bucket': s3_bucket, 'Name': object_key}}
    return textract_client.analyze_document(
        Document=document,
        FeatureTypes=["TABLES"],
    )
40
 
41
def generate_table_csv(tables, blocks_map, csv_output_path):
    """Write every table's cells to csv_output_path as CSV rows.

    NOTE(review): the file is opened in 'w' mode, so each call replaces
    the file's previous contents.
    """
    with open(csv_output_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for table in tables:
            row_map = get_rows_columns_map(table, blocks_map)
            for _, columns in row_map.items():
                width = max(columns.keys())
                writer.writerow([columns.get(c, "") for c in range(1, width + 1)])
51
 
52
  def get_rows_columns_map(table_result, blocks_map):
53
  rows = {}
 
77
  text += 'X '
78
  return text.strip()
79
 
80
def process_file_and_generate_csv(file):
    """Extract Textract tables from an uploaded document into one CSV.

    Args:
        file: Gradio upload object exposing the temp file path as ``.name``.

    Returns:
        A ``(csv_path, status_message)`` tuple for the Gradio outputs.
    """
    file_path = file.name
    # Gradio already persists the upload at file.name; only some file
    # wrappers expose .save(), so call it defensively instead of crashing
    # with AttributeError on objects that lack it — TODO confirm against
    # the installed Gradio version.
    if hasattr(file, "save"):
        file.save(file_path)

    csv_output_path = "/tmp/output.csv"

    # Images are processed directly; anything else is treated as a PDF.
    if file_path.lower().endswith(('.png', '.jpg', '.jpeg')):
        images = [Image.open(file_path)]
    else:
        images = convert_from_path(file_path)

    # Accumulate tables across all pages. The original called
    # generate_table_csv once per page, and that helper opens the CSV in
    # 'w' mode — so every page overwrote the previous page's tables.
    # Textract block IDs are unique per response (presumably UUIDs), so
    # merging the per-page block maps is safe.
    all_tables = []
    merged_blocks_map = {}
    for i, image in enumerate(images):
        image_path = f"/tmp/image_{i}.jpg"
        image.save(image_path, 'JPEG')

        response = process_image(image_path, s3_bucket, textract_client)
        if response:
            blocks = response['Blocks']
            merged_blocks_map.update({block['Id']: block for block in blocks})
            all_tables.extend(
                block for block in blocks if block['BlockType'] == "TABLE"
            )
        os.remove(image_path)  # Clean up the per-page temp image

    # Single write also guarantees the output file exists even when no
    # tables were detected.
    generate_table_csv(all_tables, merged_blocks_map, csv_output_path)

    os.remove(file_path)  # Clean up uploaded file
    return csv_output_path, "Processing completed successfully!"
104
 
105
  # Gradio Interface
106
  iface = gr.Interface(
 
110
  description="Upload a document to extract tables into a CSV file."
111
  )
112
 
113
def _main():
    """Start the Gradio server for this app."""
    iface.launch()


# Launch only when executed as a script, not on import.
if __name__ == "__main__":
    _main()