Spaces:
Running
on
T4
Running
on
T4
File size: 5,352 Bytes
2e2dda5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
import os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
from PyPDF2 import PdfWriter, PdfReader
import re
import shutil
import streamlit as st
# Module-level mapping of generated CSV path -> extracted table text.
# split_pages() adds entries to it and returns this same dict; nothing in the
# code visible here ever clears it, so entries persist between calls.
file_content = {}
# Directory one level above the folder containing this file, obtained by
# dropping the last "/"-separated component of dirname(__file__).
# The "split_pdf" and "split_pdf_csv" work folders are addressed relative to it.
parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1])
# if os.path.isdir(parent_dirname+"/split_pdf"):
# shutil.rmtree(parent_dirname+"/split_pdf")
# os.mkdir(parent_dirname+"/split_pdf")
# if os.path.isdir(parent_dirname+"/split_pdf_csv"):
# shutil.rmtree(parent_dirname+"/split_pdf_csv")
# os.mkdir(parent_dirname+"/split_pdf_csv")
def get_rows_columns_map(table_result, blocks_map):
    """Group the text of a table's cells by row index and column index.

    Args:
        table_result: The TABLE block (a dict). Its 'Relationships' entries of
            type 'CHILD' list the ids of the blocks that belong to the table.
        blocks_map: Mapping of block id -> block dict for the whole response.

    Returns:
        A dict ``{row_index: {col_index: text}}`` built in the order the cells
        are listed. It is empty when the table has no child CELL blocks.

    Raises:
        KeyError: If a listed child id is missing from ``blocks_map`` or a
            block lacks one of the expected keys.
    """
    rows = {}
    # A table block without children may carry no 'Relationships' key at all;
    # get_text() already tolerates that for cells, so do the same here instead
    # of raising KeyError.
    for relationship in table_result.get('Relationships') or []:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            if cell['BlockType'] != 'CELL':
                continue
            row_index = cell['RowIndex']
            col_index = cell['ColumnIndex']
            # Create the row on first sight, then store the cell's text.
            rows.setdefault(row_index, {})[col_index] = get_text(cell, blocks_map)
    return rows
def get_text(result, blocks_map):
    """Concatenate the text of every child block referenced by ``result``.

    Each WORD child contributes its text followed by one space. A word that
    contains a comma and is purely numeric once the commas are removed
    (e.g. ``1,234``) is wrapped in double quotes. Each SELECTION_ELEMENT child
    whose status is 'SELECTED' contributes ``'X '``.

    Args:
        result: A block dict; its optional 'Relationships' list names children.
        blocks_map: Mapping of block id -> block dict.

    Returns:
        The concatenated text (with a trailing space when non-empty), or ``''``
        when ``result`` has no 'Relationships' key.
    """
    if 'Relationships' not in result:
        return ''

    fragments = []
    for relation in result['Relationships']:
        if relation['Type'] != 'CHILD':
            continue
        for block_id in relation['Ids']:
            child = blocks_map[block_id]
            if child['BlockType'] == 'WORD':
                token = child['Text']
                is_grouped_number = "," in token and token.replace(",", "").isnumeric()
                if is_grouped_number:
                    fragments.append('"' + token + '"' + ' ')
                else:
                    fragments.append(token + ' ')
            if child['BlockType'] == 'SELECTION_ELEMENT':
                if child['SelectionStatus'] == 'SELECTED':
                    fragments.append('X ')
    return ''.join(fragments)
def split_pages(file_name):
    """Split a PDF into one-page PDFs and extract the tables of each page.

    For every page ``i`` of ``file_name`` a single-page PDF is written to
    ``<parent_dirname>/split_pdf/<short><i>.pdf`` and passed to
    ``get_table_csv_results``. When tables are found, the returned text is
    stored in the module-level ``file_content`` dict under the CSV path and
    also written to ``<parent_dirname>/split_pdf_csv/<short><i>.csv``.
    ``<short>`` is the lower-cased base name of the file (up to its first
    ``.``) with every non-alphanumeric character removed.

    Args:
        file_name: "/"-separated path of the PDF to process.

    Returns:
        The module-level ``file_content`` dict (CSV path -> table text). It is
        shared state, so it also holds entries added by earlier calls.
    """
    file_name_short = re.sub('[^A-Za-z0-9]+', '', (file_name.split("/")[-1].split(".")[0]).lower())
    pdf_dir = parent_dirname + "/split_pdf"
    csv_dir = parent_dirname + "/split_pdf_csv"

    # Keep the source open only while its pages are being read, and make sure
    # the handle is closed afterwards (previously it was never closed).
    with open(file_name, "rb") as pdf_stream:
        inputpdf = PdfReader(pdf_stream)

        # The work folders are not created anywhere else in this function;
        # create them on demand so the writes below cannot fail on a fresh
        # checkout. Existing folders and their contents are left untouched.
        os.makedirs(pdf_dir, exist_ok=True)
        os.makedirs(csv_dir, exist_ok=True)

        for i, page in enumerate(inputpdf.pages):
            output = PdfWriter()
            output.add_page(page)
            split_file = pdf_dir + "/" + file_name_short + "%s.pdf" % i
            with open(split_file, "wb") as outputStream:
                output.write(outputStream)

            table_csv = get_table_csv_results(split_file)
            # Sentinel string returned by get_table_csv_results for pages
            # without any table: nothing to record for this page.
            if table_csv == "<b> NO Table FOUND </b>":
                continue

            output_file = csv_dir + "/" + file_name_short + "%s.csv" % i
            file_content[output_file] = table_csv

            # Replace any previous content of the CSV file.
            with open(output_file, "wt") as fout:
                fout.write(table_csv)

            # Show the results.
            print('CSV OUTPUT FILE: ', output_file)
    return file_content
def get_table_csv_results(file_name):
    """Run Textract table analysis on a file and return its tables as text.

    The file's bytes are sent to ``analyze_document`` with the 'TABLES'
    feature, using credentials read from ``st.secrets`` and the 'us-east-1'
    region. Every TABLE block in the response is rendered with
    ``generate_table_csv`` and followed by two newlines.

    Args:
        file_name: Path of the document to analyze.

    Returns:
        The concatenated table text, or the sentinel string
        ``"<b> NO Table FOUND </b>"`` when the response has no TABLE block.
    """
    with open(file_name, 'rb') as document:
        payload = bytearray(document.read())

    textract = boto3.client(
        'textract',
        aws_access_key_id=st.secrets['user_access_key'],
        aws_secret_access_key=st.secrets['user_secret_key'],
        region_name='us-east-1',
    )
    response = textract.analyze_document(
        Document={'Bytes': payload},
        FeatureTypes=['TABLES'],
    )

    # Index every block by id and remember the table blocks in response order.
    blocks_by_id = {}
    tables = []
    for block in response['Blocks']:
        blocks_by_id[block['Id']] = block
        if block['BlockType'] == "TABLE":
            tables.append(block)

    if not tables:
        return "<b> NO Table FOUND </b>"

    # Tables are numbered from 1 when handed to generate_table_csv.
    return ''.join(
        generate_table_csv(table, blocks_by_id, position) + '\n\n'
        for position, table in enumerate(tables, start=1)
    )
def generate_table_csv(table_result, blocks_map, table_index):
    """Render one table as backtick-delimited text.

    Cells are obtained from ``get_rows_columns_map``. Each cell's text is
    stripped and followed by a backtick (so every row ends with a trailing
    backtick), each row ends with a newline, and the table is closed with
    three further newlines.

    Args:
        table_result: The TABLE block to render.
        blocks_map: Mapping of block id -> block dict.
        table_index: 1-based position of the table. It does not affect the
            output; it is kept so existing callers keep working.

    Returns:
        The rendered table text; just ``'\\n\\n\\n'`` for a table with no rows.
    """
    rows = get_rows_columns_map(table_result, blocks_map)
    # Build each row once and join, instead of growing a string cell by cell.
    rendered_rows = [
        ''.join(text.strip() + "`" for text in cols.values()) + '\n'
        for cols in rows.values()
    ]
    return ''.join(rendered_rows) + '\n\n\n'
def main_(file_name):
    """Process the PDF at ``file_name`` and return the result of ``split_pages``.

    Args:
        file_name: Path of the PDF to split and analyze.

    Returns:
        Whatever ``split_pages(file_name)`` returns (a mapping of CSV path to
        table text).
    """
    return split_pages(file_name)
# if __name__ == "__main__":
# file_name = "/home/ubuntu/covid19_ie_removed.pdf"
# main(file_name)
|