Spaces:

AuditEdge
/

optimised-ocr

Running

File size: 4,087 Bytes

81e13bb

import os
import pandas as pd

import os
# Runs at import time: sets GOOGLE_APPLICATION_CREDENTIALS to a JSON file path
# relative to the current working directory, overwriting any value that is
# already present in the process environment.
# NOTE(review): presumably a service-account key used by the Vision client in
# detect_text() — confirm the file is deployed alongside the app and is not
# committed to a public repository.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./titanium-scope-436311-t3-966373f5aa2f.json"




def run_tesseract_on_image(image_path, output_dir="/content"):  # -> tsv output path
    """Run the Tesseract CLI on one image and return the path of its TSV output.

    Args:
        image_path: Path of the image file handed to ``tesseract``.
        output_dir: Directory the TSV file is written to. Defaults to
            ``/content``, the location this function has always used.

    Returns:
        ``<output_dir>/<image_name>.tsv``, where ``<image_name>`` is the
        image's base name up to (not including) its first ``.``.

    Raises:
        ValueError: If ``tesseract`` cannot be started or exits with a
            non-zero status.
    """
    import subprocess  # local import: only this helper launches a process

    print("image_path",image_path)
    # partition() keeps the whole name when it contains no '.', whereas
    # name[:name.find('.')] would silently drop the last character.
    image_name = os.path.basename(image_path).partition('.')[0]
    output_base = os.path.join(output_dir, image_name)
    command = ["tesseract", image_path, output_base, "-l", "eng", "tsv"]
    error_message = 'Tesseract OCR Error please verify image format PNG,JPG,JPEG'
    try:
        # Argument list with no shell: quotes, ';' or '$' inside the path are
        # passed to tesseract verbatim instead of being interpreted.
        completed = subprocess.run(command, check=False)
    except OSError as err:
        # e.g. the tesseract binary is not installed; keep the ValueError
        # contract callers already rely on.
        raise ValueError(error_message) from err
    if completed.returncode != 0:
        raise ValueError(error_message)
    return f"{output_base}.tsv"


def clean_tesseract_output(tsv_output_path):
    """Load a Tesseract TSV file and return its non-empty words with boxes.

    Args:
        tsv_output_path: Path of a tab-separated file with at least the
            columns ``left``, ``top``, ``width``, ``height`` and ``text``.

    Returns:
        A list of dicts, one per kept row, in file order. Each dict has
        ``'word_text'`` (the ``text`` cell as a string) and ``'word_box'``
        (``[left, top, left + width, top + height]``). Rows whose text is
        missing or whitespace-only are left out.
    """
    import csv  # local import: only needed for the quoting constant

    print("tsv_output_path",tsv_output_path)
    ocr_df = pd.read_csv(
        tsv_output_path,
        sep='\t',
        # OCR text may contain a lone '"'; treat it as a literal character
        # instead of the start of a quoted CSV field.
        quoting=csv.QUOTE_NONE,
        # Keep words such as "123" as strings so the .str accessor works and
        # the word text is returned unchanged.
        dtype={'text': str},
        # Only a truly empty cell counts as missing; words like "NA" or
        # "null" are real OCR output and must not be dropped.
        keep_default_na=False,
        na_values=[''],
    )
    ocr_df = ocr_df.dropna()
    ocr_df = ocr_df[ocr_df['text'].str.strip() != '']
    return [
        {
            'word_text': text,
            'word_box': [int(left), int(top), int(left + width), int(top + height)],
        }
        for text, left, top, width, height in zip(
            ocr_df['text'],
            ocr_df['left'],
            ocr_df['top'],
            ocr_df['width'],
            ocr_df['height'],
        )
    ]




def _bounding_box_from_vertices(vertices):
    """Return ``[x_min, y_min, x_max, y_max]`` enclosing ``[x, y]`` pairs."""
    xs = [vertex[0] for vertex in vertices]
    ys = [vertex[1] for vertex in vertices]
    return [min(xs), min(ys), max(xs), max(ys)]


def detect_text(path):
    """Detect the words in an image file with Google Cloud Vision.

    Args:
        path: Path of the image file; it is read as bytes and sent to the
            Vision ``text_detection`` endpoint.

    Returns:
        A list of dicts, one per annotation after the first, each with
        ``'word_text'`` (the annotation's description) and ``'word_box'``
        (``[x_min, y_min, x_max, y_max]`` of its bounding polygon).

    Raises:
        Exception: If the response carries an error message.
    """
    print("this is path:",path)

    from google.cloud import vision
    client = vision.ImageAnnotatorClient()
    with open(path, "rb") as image_file:
        content = image_file.read()
    image = vision.Image(content=content)
    response = client.text_detection(image=image)

    # Fail before touching the annotations: an errored response should not be
    # half-processed and printed first.
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    texts = response.text_annotations
    print("Texts:")
    list_of_dict = []
    # The first annotation is skipped; per the Vision documentation it holds
    # the full detected text rather than a single word.
    for text in texts[1:]:
        print(f'\n"{text.description}"')

        vertices_list = [[int(vertex.x),int(vertex.y)] for vertex in text.bounding_poly.vertices]
        print("vertices_list",vertices_list)

        # Min/max over all corners always yields a box that encloses the
        # polygon. Choosing corners by smallest/largest x + y collapses to a
        # zero-width or zero-height box for rotated text.
        list_of_dict.append({
            "word_text": text.description,
            "word_box": _bounding_box_from_vertices(vertices_list),
        })

    return list_of_dict

  


def prepare_batch_for_inference(image_paths):
    """Run OCR on every image and bundle the results into one batch dict.

    Each path is passed to ``detect_text``; the Tesseract helpers in this
    file are not used here.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        A dict with ``"image_path"`` (the argument itself), ``"bboxes"`` (one
        list of word boxes per image) and ``"words"`` (one list of word
        strings per image), all in the order of ``image_paths``.
    """
    per_image_words = [detect_text(single_path) for single_path in image_paths]
    print("clean_outputs",per_image_words)

    all_words = []
    all_boxes = []
    for detected in per_image_words:
        all_words.append([entry['word_text'] for entry in detected])
        all_boxes.append([entry['word_box'] for entry in detected])

    return {
        "image_path": image_paths,
        "bboxes": all_boxes,
        "words": all_words,
    }