|
import gradio as gr
from PIL import Image
from collections import Counter

import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

from IndicPhotoOCR.ocr import OCR
from IndicPhotoOCR.theme import Seafoam
from IndicPhotoOCR.utils.helper import detect_para
from IndicTransToolkit import IndicProcessor


def Most_Common(lst):
    """Return the most frequent element of a non-empty list."""
    data = Counter(lst)
    return data.most_common(1)[0][0]
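
# Illustrative example (hypothetical values):
#   Most_Common(["hindi", "english", "hindi"])  # -> "hindi"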


# Load the OCR pipeline once at import time rather than per request.
ocr = OCR(device="cpu", verbose=False)
|
|
|
def translate(given_str, lang="hindi"):
    """Translate between English and Hindi with IndicTrans2.

    If the dominant script language is English, translate en -> hi;
    otherwise Hindi is assumed and the text is translated hi -> en.
    Other Indic languages are not mapped here.
    """
    DEVICE = "cpu"
    # NOTE: the model and tokenizer are re-loaded on every call; cache them
    # if translation is invoked frequently.
    model_name = (
        "ai4bharat/indictrans2-en-indic-1B"
        if lang == "english"
        else "ai4bharat/indictrans2-indic-en-1B"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

    ip = IndicProcessor(inference=True)

    model = model.to(DEVICE)
    model.eval()

    src_lang, tgt_lang = (
        ("eng_Latn", "hin_Deva") if lang == "english" else ("hin_Deva", "eng_Latn")
    )

    batch = ip.preprocess_batch(
        [given_str],
        src_lang=src_lang,
        tgt_lang=tgt_lang,
    )
    inputs = tokenizer(
        batch,
        truncation=True,
        padding="longest",
        return_tensors="pt",
        return_attention_mask=True,
    ).to(DEVICE)

    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            use_cache=True,
            min_length=0,
            max_length=256,
            num_beams=5,
            num_return_sequences=1,
        )

    # Decode in target-tokenizer mode, then let IndicProcessor restore the
    # native-script surface form.
    with tokenizer.as_target_tokenizer():
        generated_tokens = tokenizer.batch_decode(
            generated_tokens.detach().cpu().tolist(),
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

    translation = ip.postprocess_batch(generated_tokens, lang=tgt_lang)[0]
    return translation
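

# Illustrative usage (a sketch, not executed at import time; each call loads a
# ~1B-parameter IndicTrans2 model, so it is slow on CPU, and the inputs here
# are made up):
#   translate("library", lang="english")   # English -> Hindi (Devanagari)
#   translate("पुस्तकालय", lang="hindi")     # Hindi -> English

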
def process_image(image):
    """
    Process the uploaded image for text detection, recognition, and translation.
    - Detects bounding boxes in the image
    - Draws the bounding boxes on the image and identifies the script in each detected region
    - Recognizes the text in each cropped region, then translates it (English <-> Hindi)

    Parameters:
        image (PIL.Image): The input image to be processed.

    Returns:
        tuple: A PIL.Image with bounding boxes and a string of the translated text.
    """
    image_path = "input_image.jpg"
    image.save(image_path)

    # Detect text regions and save an annotated copy of the input image.
    detections = ocr.detect(image_path)
    ocr.visualize_detection(image_path, detections, save_path="output_image.png")
    output_image = Image.open("output_image.png")

    recognized_texts = {}
    pil_image = Image.open(image_path)

    # Recognize text region by region, tracking the script language of each.
    langs = []
    for idx, bbox in enumerate(detections):
        script_lang, cropped_path = ocr.crop_and_identify_script(pil_image, bbox)

        # Axis-aligned bounding box enclosing the detected polygon.
        x1 = min(pt[0] for pt in bbox)
        y1 = min(pt[1] for pt in bbox)
        x2 = max(pt[0] for pt in bbox)
        y2 = max(pt[1] for pt in bbox)

        if script_lang:
            recognized_text = ocr.recognise(cropped_path, script_lang)
            recognized_texts[f"img_{idx}"] = {"txt": recognized_text, "bbox": [x1, y1, x2, y2]}
            langs.append(script_lang)

    # Group the recognized fragments into lines/paragraphs, then translate.
    lines = detect_para(recognized_texts)
    recognized_texts_combined = "\n".join(" ".join(line) for line in lines)
    if langs:  # guard against images with no detected text
        recognized_texts_combined = translate(recognized_texts_combined, Most_Common(langs))

    return output_image, recognized_texts_combined
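

# Illustrative usage (hedged: the path is one of the bundled example images
# listed under `examples` below; any RGB image works):
#   annotated, translated = process_image(Image.open("test_images/208.jpg"))

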
interface_html = """
<div style="text-align: left; padding: 10px;">
    <div style="background-color: white; padding: 10px; display: inline-block;">
        <img src="https://iitj.ac.in/images/logo/Design-of-New-Logo-of-IITJ-2.png" alt="IITJ Logo" style="width: 100px; height: 100px;">
    </div>
    <img src="https://play-lh.googleusercontent.com/_FXSr4xmhPfBykmNJvKvC0GIAVJmOLhFl6RA5fobCjV-8zVSypxX8yb8ka6zu6-4TEft=w240-h480-rw" alt="Bhashini Logo" style="width: 100px; height: 100px; float: right;">
</div>
"""

links_html = """
<div style="text-align: center; padding-top: 20px;">
    <a href="https://github.com/Bhashini-IITJ/IndicPhotoOCR" target="_blank" style="margin-right: 20px; font-size: 18px; text-decoration: none;">
        GitHub Repository
    </a>
    <a href="https://github.com/Bhashini-IITJ/BharatSceneTextDataset" target="_blank" style="font-size: 18px; text-decoration: none;">
        Dataset Repository
    </a>
</div>
"""

custom_css = """
.custom-textbox textarea {
    font-size: 20px !important;
}
"""

seafoam = Seafoam()

examples = [
    ["test_images/208.jpg"],
    ["test_images/1310.jpg"],
]
title = "<h1 style='text-align: center;'>Developed by IITJ</h1>"


demo = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", image_mode="RGB"),
    outputs=[
        gr.Image(type="pil", label="Detected Bounding Boxes"),
        gr.Textbox(label="Translated Text", elem_classes="custom-textbox"),
    ],
    title="Scene Text Translator",
    description=title + interface_html + links_html,
    theme=seafoam,
    css=custom_css,
    examples=examples,
)


if __name__ == "__main__":
    demo.launch()
|
|