import os import json import boto3 import io from io import BytesIO import sys from pprint import pprint from PyPDF2 import PdfWriter, PdfReader import re import shutil import streamlit as st file_content = {} parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1]) # if os.path.isdir(parent_dirname+"/split_pdf"): # shutil.rmtree(parent_dirname+"/split_pdf") # os.mkdir(parent_dirname+"/split_pdf") # if os.path.isdir(parent_dirname+"/split_pdf_csv"): # shutil.rmtree(parent_dirname+"/split_pdf_csv") # os.mkdir(parent_dirname+"/split_pdf_csv") def get_rows_columns_map(table_result, blocks_map): rows = {} #scores = [] for relationship in table_result['Relationships']: if relationship['Type'] == 'CHILD': for child_id in relationship['Ids']: cell = blocks_map[child_id] if cell['BlockType'] == 'CELL': row_index = cell['RowIndex'] col_index = cell['ColumnIndex'] if row_index not in rows: # create new row rows[row_index] = {} # get confidence score #scores.append(str(cell['Confidence'])) # get the text value rows[row_index][col_index] = get_text(cell, blocks_map) return rows#, scores def get_text(result, blocks_map): text = '' if 'Relationships' in result: for relationship in result['Relationships']: if relationship['Type'] == 'CHILD': for child_id in relationship['Ids']: word = blocks_map[child_id] if word['BlockType'] == 'WORD': if "," in word['Text'] and word['Text'].replace(",", "").isnumeric(): text += '"' + word['Text'] + '"' +' ' else: text += word['Text'] +' ' if word['BlockType'] == 'SELECTION_ELEMENT': if word['SelectionStatus'] =='SELECTED': text += 'X ' return text def split_pages(file_name): inputpdf = PdfReader(open(file_name, "rb")) file_name_short = re.sub('[^A-Za-z0-9]+', '', (file_name.split("/")[-1].split(".")[0]).lower()) for i in range(len(inputpdf.pages)): output = PdfWriter() output.add_page(inputpdf.pages[i]) split_file = parent_dirname+"/split_pdf/"+file_name_short+"%s.pdf" % i with open(split_file, "wb") as outputStream: output.write(outputStream) table_csv = get_table_csv_results(split_file) if(table_csv != " NO Table FOUND "): output_file = parent_dirname+"/split_pdf_csv/"+file_name_short+"%s.csv" % i file_content[output_file] = table_csv # replace content with open(output_file, "wt") as fout: fout.write(table_csv) # show the results print('CSV OUTPUT FILE: ', output_file) return file_content def get_table_csv_results(file_name): with open(file_name, 'rb') as file: img_test = file.read() bytes_test = bytearray(img_test) #print('Image loaded', file_name) # process using image bytes # get the results #session = boto3.Session(profile_name='profile-name') client = boto3.client('textract',aws_access_key_id=st.secrets['user_access_key'], aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1') # {'S3Object': { # 'Bucket': 'ml-search-app-access', # 'Name': 'covid19_ie_removed.pdf' # }} response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES']) # Get the text blocks blocks=response['Blocks'] #pprint(blocks) blocks_map = {} table_blocks = [] for block in blocks: blocks_map[block['Id']] = block if block['BlockType'] == "TABLE": table_blocks.append(block) if len(table_blocks) <= 0: return " NO Table FOUND " csv = '' for index, table in enumerate(table_blocks): csv += generate_table_csv(table, blocks_map, index +1) csv += '\n\n' return csv def generate_table_csv(table_result, blocks_map, table_index): rows = get_rows_columns_map(table_result, blocks_map) table_id = 'Table_' + str(table_index) # get cells. csv = ''#Table: {0}\n\n'.format(table_id) for row_index, cols in rows.items(): for col_index, text in cols.items(): col_indices = len(cols.items()) csv += text.strip()+"`" #'{}'.format(text) + "," csv += '\n' # csv += '\n\n Confidence Scores % (Table Cell) \n' # cols_count = 0 # for score in scores: # cols_count += 1 # csv += score + "," # if cols_count == col_indices: # csv += '\n' # cols_count = 0 csv += '\n\n\n' return csv def main_(file_name): table_csv = split_pages(file_name) #print(table_csv) return table_csv # if __name__ == "__main__": # file_name = "/home/ubuntu/covid19_ie_removed.pdf" # main(file_name)