Tayel committed on
Commit
9458e64
·
1 Parent(s): 3aedc1b

Added app.py and requirements.txt for the Image Captioning project

Browse files
Files changed (2) hide show
  1. app.py +81 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries and modules
2
+ from transformers import BlipProcessor, BlipForConditionalGeneration, MBartForConditionalGeneration, MBart50Tokenizer
3
+ from gtts import gTTS
4
+ from PIL import Image
5
+ import gradio as gr
6
+
7
# Pipeline Component 1: Image Captioning Model
class ImageToText:
    """Produces an English caption for an image using the BLIP base captioning model."""

    def __init__(self):
        """Load the BLIP processor and captioning model from the Hugging Face Hub."""
        model_name = "Salesforce/blip-image-captioning-base"
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(model_name)
        print("BLIP Image Captioning Model Loaded")

    def generate_caption(self, img):
        """Return a caption string for *img* (any image type the BLIP processor accepts)."""
        encoded = self.processor(images=img, return_tensors="pt")
        output_ids = self.model.generate(**encoded)
        return self.processor.decode(output_ids[0], skip_special_tokens=True)
21
+
22
# Pipeline Component 2: Arabic Translation Model (mBART)
class ArabicTranslator:
    """Translates English text to Arabic with facebook/mbart-large-50-many-to-many-mmt."""

    def __init__(self):
        """Load the mBART-50 tokenizer and model, configured for English source text.

        FIX: the source language must be set on the tokenizer itself.
        ``MBart50Tokenizer.__call__`` does not accept a ``src_lang`` keyword —
        passing it there (as the original code did) is silently ignored, so the
        English language token was never guaranteed to be applied to the input.
        """
        model_name = "facebook/mbart-large-50-many-to-many-mmt"
        self.tokenizer = MBart50Tokenizer.from_pretrained(model_name, src_lang="en_XX")
        self.model = MBartForConditionalGeneration.from_pretrained(model_name)
        print("mBART Arabic Translation Model Loaded")

    def translate(self, text):
        """Translate English *text* to Arabic and return the Arabic string.

        Args:
            text: English sentence(s) to translate.

        Returns:
            The translated Arabic text (first — and only — batch entry).
        """
        inputs = self.tokenizer(text, return_tensors="pt")
        # Force the decoder to begin with the Arabic language token so the
        # many-to-many model generates Arabic output.
        translated = self.model.generate(
            inputs["input_ids"],
            forced_bos_token_id=self.tokenizer.lang_code_to_id["ar_AR"],
        )
        return self.tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
36
+
37
# Pipeline Component 3: Text-to-Speech Model (gTTS)
class TextToSpeech:
    """Converts text to an MP3 audio file using Google Text-to-Speech (gTTS)."""

    def __init__(self, lang='ar'):
        """Initialize the Text-to-Speech system.

        Args:
            lang: ISO language code passed to gTTS (defaults to Arabic).
        """
        self.lang = lang

    def generate_audio(self, text, output_path='output.mp3'):
        """Synthesize *text* to speech and save it as an MP3 file.

        The destination is now a parameter instead of a hard-coded literal;
        the default preserves the original behavior for existing callers.

        Args:
            text: Text to synthesize (expected to be in ``self.lang``).
            output_path: Where to write the MP3 (default 'output.mp3').

        Returns:
            The path of the saved audio file.
        """
        tts = gTTS(text=text, lang=self.lang, slow=False)
        tts.save(output_path)
        return output_path
49
+
50
# Main Pipeline Integration
class ImageToArabicSpeechPipeline:
    """Chains image captioning, English-to-Arabic translation, and speech synthesis."""

    def __init__(self):
        """Construct every component of the pipeline."""
        self.caption_model = ImageToText()
        self.translation_model = ArabicTranslator()
        self.tts_model = TextToSpeech()

    def process_image(self, img):
        """Run the full pipeline on *img*.

        Returns:
            A tuple of (English caption, Arabic translation, audio file path).
        """
        english_caption = self.caption_model.generate_caption(img)
        arabic_caption = self.translation_model.translate(english_caption)
        speech_path = self.tts_model.generate_audio(arabic_caption)
        return english_caption, arabic_caption, speech_path
64
+
65
# Gradio Interface Setup
_PIPELINE = None  # lazily-created singleton so the three models load once, not per request


def _get_pipeline():
    """Return the shared pipeline instance, creating it on first use."""
    global _PIPELINE
    if _PIPELINE is None:
        _PIPELINE = ImageToArabicSpeechPipeline()
    return _PIPELINE


def demo(image):
    """Gradio handler: caption *image*, translate the caption to Arabic, synthesize speech.

    FIX: the original rebuilt the whole pipeline (re-loading BLIP, mBART, and
    the TTS component) on every single request; the pipeline is now cached in a
    module-level singleton and reused.

    Args:
        image: Filepath of the uploaded image (the interface uses
            ``gr.Image(type="filepath")``).

    Returns:
        A (caption, translated_text, audio_file_path) tuple.
    """
    img = Image.open(image)
    caption, translated_text, audio_file = _get_pipeline().process_image(img)
    return caption, translated_text, audio_file
72
+
73
# Define Gradio Interface
_output_components = [
    gr.Textbox(label="Caption"),
    gr.Textbox(label="Translated Text"),
    gr.Audio(label="Generated Speech"),
]
iface = gr.Interface(
    fn=demo,
    inputs=gr.Image(type="filepath"),
    outputs=_output_components,
)

# Launch the Gradio Interface
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gtts
2
+ gradio
3
+ transformers
4
+ torch
5
+ sentencepiece
6
+ Pillow