File size: 6,178 Bytes
43c7b9d 7cd21ae 43c7b9d 7cd21ae 43c7b9d 7b19271 43c7b9d 7b19271 43c7b9d 7b19271 43c7b9d 7b19271 43c7b9d 7b19271 43c7b9d 4c9893f 43c7b9d 7b19271 43c7b9d 4c9893f 43c7b9d 6b49c3e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# Import the required libraries
import html
import re

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import (
    BlipForConditionalGeneration,
    BlipProcessor,
    TrOCRProcessor,
    VisionEncoderDecoderModel,
    pipeline,
)
# Download the example images referenced by the Gradio interfaces below and
# store them next to the script.
_DOWNLOAD_TIMEOUT_S = 30  # seconds; without a timeout a stalled host blocks start-up forever


def _download_examples(urls, prefix):
    """Fetch every URL in *urls* and save it as ``{prefix}_{index}.png``.

    Args:
        urls: Image URLs; the position in the list becomes the file index.
        prefix: File-name prefix, e.g. ``"image"`` produces ``image_0.png``.

    Raises:
        requests.HTTPError: If a server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    for index, url in enumerate(urls):
        # The context manager releases the connection even when saving fails.
        with requests.get(url, stream=True, timeout=_DOWNLOAD_TIMEOUT_S) as response:
            # Fail with a clear HTTP error instead of letting PIL choke on an error page.
            response.raise_for_status()
            # Let urllib3 undo any Content-Encoding before PIL reads the raw stream.
            response.raw.decode_content = True
            # Saving forces the image to be fully read while the stream is still open.
            Image.open(response.raw).save(f"{prefix}_{index}.png")


# First example set -> image_0.png ... image_2.png
img_urls_1 = [
    'https://i.pinimg.com/564x/f7/f5/bd/f7f5bd929e05a852ff423e6e02deea54.jpg',
    'https://i.pinimg.com/564x/b4/29/69/b4296962cb76a72354a718109835caa3.jpg',
    'https://i.pinimg.com/564x/f2/68/8e/f2688eccd6dd60fdad89ef78950b9ead.jpg',
]
_download_examples(img_urls_1, "image")

# Second example set -> tx_image_0.png ... tx_image_4.png
img_urls_2 = [
    'https://i.pinimg.com/564x/14/b0/07/14b0075ccd5ea35f7deffc9e5bd6de30.jpg',
    'https://newsimg.bbc.co.uk/media/images/45510000/jpg/_45510184_the_writings_466_180.jpg',
    'https://cdn.shopify.com/s/files/1/0047/1524/9737/files/Cetaphil_Face_Wash_Ingredients_Optimized.png?v=1680923920',
    'https://github.com/kawther12h/Image_Captioning-and-Text_Recognition/blob/main/handText22.jpg?raw=true',
    'https://github.com/kawther12h/Image_Captioning-and-Text_Recognition/blob/main/handText11.jpg?raw=true',
]
_download_examples(img_urls_2, "tx_image")
# BLIP captioning: the processor and the model come from the same checkpoint
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor_blip = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model_blip = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)

# English -> Arabic translation pipeline backed by the marefa model
_TRANSLATION_CHECKPOINT = "marefa-nlp/marefa-mt-en-ar"
translate = pipeline(task="translation", model=_TRANSLATION_CHECKPOINT)
def caption_and_translate(img, min_len=70, max_len=1000):
    """Caption an image in English with BLIP and translate the caption to Arabic.

    The Gradio interface wired to this function passes only the image, so the
    two length arguments need defaults; the defaults are the values that were
    previously hard-coded in the ``generate`` call.

    Args:
        img: Path of the image file (the Gradio input uses ``type='filepath'``).
        min_len: Forwarded to ``generate`` as ``min_length``.
        max_len: Forwarded to ``generate`` as ``max_length``.

    Returns:
        A ``(english_caption, arabic_html)`` tuple. The Arabic caption is
        HTML-escaped and wrapped in a right-to-left ``<div>``.

    Raises:
        gr.Error: If no image was provided.
    """
    if img is None:
        raise gr.Error("Please upload an image first.")

    # Generate English caption; the context manager closes the file handle
    # once the pixel data has been copied by convert().
    with Image.open(img) as source:
        raw_image = source.convert('RGB')
    inputs_blip = processor_blip(raw_image, return_tensors="pt")
    out_blip = model_blip.generate(
        **inputs_blip,
        # int() because UI sliders may deliver floats.
        min_length=int(min_len),
        max_length=int(max_len),
    )
    english_caption = processor_blip.decode(out_blip[0], skip_special_tokens=True)

    # Translate caption from English to Arabic
    arabic_caption = translate(english_caption)[0]['translation_text']
    # Escape before embedding: the text is rendered as raw HTML by gr.HTML.
    translated_caption = f'<div dir="rtl">{html.escape(arabic_caption)}</div>'

    # Return both captions
    return english_caption, translated_caption
# Captioning tab: one image in, an English caption and an Arabic caption out.
# Only the image is wired as an input; no caption-length sliders are exposed.
_caption_inputs = [gr.Image(type='filepath', label='Image')]
_caption_outputs = [
    gr.Textbox(label='English Caption'),
    gr.HTML(label='Arabic Caption'),
]
img_cap_en_ar = gr.Interface(
    fn=caption_and_translate,
    inputs=_caption_inputs,
    outputs=_caption_outputs,
    title='Image Captioning | وصف الصورة',
    description="Upload an image to generate an English & Arabic caption | قم برفع صورة وأرسلها ليظهر لك وصف للصورة",
    examples=[["image_2.png"]],
)
# Load the image-to-text (OCR) pipeline used by extract_text
text_rec = pipeline("image-to-text", model="jinhybr/OCR-Donut-CORD")
# The English -> Arabic `translate` pipeline created earlier in this file is
# reused as-is. Building the identical "marefa-nlp/marefa-mt-en-ar" pipeline a
# second time only repeated the model load and slowed start-up.
def extract_text(image):
    """Extract the text from an image and translate it to Arabic.

    Args:
        image: The uploaded image (the Gradio input uses ``type="pil"``).

    Returns:
        A ``(text, arabic_html)`` tuple: the extracted text with markup tags
        removed, and its Arabic translation HTML-escaped inside a
        right-to-left ``<div>``. When no text is found the translation step is
        skipped and the ``<div>`` is empty.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        raise gr.Error("Please upload an image first.")

    # Pass the image to the pipeline
    result = text_rec(image)

    # Extract the plain text. Tags are replaced by a space (not deleted) so
    # that neighbouring fields are not glued together, then whitespace runs
    # are collapsed.
    text = result[0]['generated_text']
    text = re.sub(r'<[^>]*>', ' ', text)
    text = ' '.join(text.split())

    # Nothing recognised: do not feed an empty string to the translator.
    if not text:
        return text, '<div dir="rtl"></div>'

    # Translate extracted text from English to Arabic
    arabic_text = translate(text)[0]['translation_text']
    # Escape before embedding: the text originates from an uploaded image and
    # is rendered as raw HTML by gr.HTML.
    htranslated_text = f'<div dir="rtl">{html.escape(arabic_text)}</div>'

    # Return the extracted text and its translation
    return text, htranslated_text
# Text-extraction tab: a PIL image in, the extracted text and its Arabic
# translation out. User-facing typos ("Translateted of", "Submet") are fixed.
text_recognition = gr.Interface(
    fn=extract_text,  # The function that processes the image
    inputs=gr.Image(type="pil"),  # Input is an image (PIL format)
    outputs=[
        gr.Textbox(label='Extracted text'),
        gr.HTML(label='Translation of Extracted text'),
    ],
    title="Text Extraction and Translation | إستخراج النص وترجمتة",
    description="Upload an image then Submit to extract text and translate it to Arabic| قم برفع الصورة وأرسلها ليظهر لك النص من الصورة",
    examples=[["image_0.png"], ["image_1.png"]],
)
# Load the TrOCR processor and model for handwritten text extraction
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
# The English -> Arabic `translate` pipeline created earlier in this file is
# reused as-is. Building the identical "marefa-nlp/marefa-mt-en-ar" pipeline
# again only repeated the model load and slowed start-up.
def recognize_handwritten_text(image2):
    """Recognise handwritten text in an image and translate it to Arabic.

    Args:
        image2: The uploaded image as delivered by the Gradio image input.

    Returns:
        A ``(generated_text, arabic_html)`` tuple: the recognised text and its
        Arabic translation HTML-escaped inside a right-to-left ``<div>``. When
        no text is recognised the translation step is skipped and the
        ``<div>`` is empty.

    Raises:
        gr.Error: If no image was provided.
    """
    # `is None` rather than truthiness: the input may be an array-like object.
    if image2 is None:
        raise gr.Error("Please upload an image first.")

    # Process the image and extract the text
    pixel_values = processor(images=image2, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Nothing recognised: do not feed an empty string to the translator.
    if not generated_text.strip():
        return generated_text, '<div dir="rtl"></div>'

    # Translate extracted text from English to Arabic
    arabic_text = translate(generated_text)[0]['translation_text']
    # Escape before embedding: the text originates from an uploaded image and
    # is rendered as raw HTML by gr.HTML.
    htranslated_text = f'<div dir="rtl">{html.escape(arabic_text)}</div>'

    # Return the extracted text and translated text
    return generated_text, htranslated_text
# Handwriting tab: an uploaded image in, the recognised English text and its
# Arabic translation out. The user-facing typo "Submet" and the doubled "| |"
# separator in the title are fixed.
handwritten_rec = gr.Interface(
    fn=recognize_handwritten_text,
    inputs=gr.Image(label="Upload Image"),
    outputs=[
        gr.Textbox(label='English Text'),
        gr.HTML(label='Arabic Text'),
    ],
    title="Handwritten Text Extraction | إستخراج النص المكتوب بخط اليد وترجمتة",
    description="Upload an image then Submit to extract text and translate it to Arabic| قم برفع الصورة وأرسلها ليظهر لك النص من الصورة",
    examples=[["tx_image_1.png"], ["tx_image_3.png"]],
)
# Present the three interfaces as tabs of a single app, then start the server.
_tab_interfaces = [img_cap_en_ar, text_recognition, handwritten_rec]
_tab_titles = ["Extract_Caption", " Extract_Digital_text", " Extract_HandWritten_text"]
demo = gr.TabbedInterface(_tab_interfaces, _tab_titles)
demo.launch(debug=True)
|