Update app.py
app.py CHANGED (old version shown first, removed lines prefixed with "-"; the new version follows, added lines prefixed with "+")
@@ -1,239 +1,102 @@
-# Import spaces first to avoid CUDA initialization issues
-import spaces
-
-# Then import other libraries
 import torch
 import librosa
-from transformers import
 from gtts import gTTS
 import gradio as gr
 from PIL import Image
 import os
-import base64
-from io import BytesIO
-
-import io
-import subprocess
 from langdetect import detect
-
-print("Using GPU for operations when available")

 # Install flash-attn
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

-
-@spaces.GPU
-def load_pipeline(model_name, **kwargs):
-    try:
-        device = 0 if torch.cuda.is_available() else "cpu"
-        return pipeline(model=model_name, device=device, **kwargs)
-    except Exception as e:
-        print(f"Error loading {model_name} pipeline: {e}")
-        return None
-
-# Load Whisper model for speech recognition within a GPU-decorated function
-@spaces.GPU
-def load_whisper():
-    try:
-        device = 0 if torch.cuda.is_available() else "cpu"
-        processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
-        return processor, model
-    except Exception as e:
-        print(f"Error loading Whisper model: {e}")
-        return None, None
-
-# Load sarvam-2b for text generation within a GPU-decorated function
-@spaces.GPU
-def load_sarvam():
-    return load_pipeline('sarvamai/sarvam-2b-v0.5')
-
-# Load Phi-3.5-vision-instruct model
-@spaces.GPU
-def load_vision_model():
-    try:
-        model_id = "microsoft/Phi-3.5-vision-instruct"
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, trust_remote_code=True, torch_dtype=torch.float16, use_flash_attention_2=False
-        )
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
-        return model, processor
-    except Exception as e:
-        print(f"Error loading vision model: {e}")
-        return None, None


-#
-
-
-

-#
-
-def load_vision_model():
-    try:
-        print("Starting to load vision model...")
-        model_id = "microsoft/Phi-3.5-vision-instruct"
-        print(f"Loading model from {model_id}")
-
-        # Check for CUDA availability
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"Using device: {device}")
-
-        # Load model with potential memory optimization
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            trust_remote_code=True,
-            torch_dtype=torch.float16,
-            use_flash_attention_2=True,  # Enable if supported
-            device_map="auto",  # Automatically manage model placement
-            low_cpu_mem_usage=True
-        )
-        print("Model loaded successfully")
-
-        print("Loading processor...")
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
-        print("Processor loaded successfully")
-
-        return model, processor
-    except ImportError as e:
-        print(f"Error importing required modules: {str(e)}")
-        print("Please ensure all required dependencies are installed.")
-    except RuntimeError as e:
-        print(f"Runtime error (possibly CUDA out of memory): {str(e)}")
-        print("Consider using a smaller model or enabling GPU offloading.")
-    except Exception as e:
-        print(f"Unexpected error in loading vision model: {str(e)}")
-
-    return None, None


-# Process audio input within a GPU-decorated function
 @spaces.GPU
-def process_audio_input(audio, whisper_processor, whisper_model):
-    if whisper_processor is None or whisper_model is None:
-        return "Error: Speech recognition model is not available. Please type your message instead."
-
     try:
         audio, sr = librosa.load(audio, sr=16000)
-        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(
         predicted_ids = whisper_model.generate(input_features)
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
         return transcription
     except Exception as e:
         return f"Error processing audio: {str(e)}. Please type your message instead."

-# Updated process_image_input function
-@spaces.GPU
 @spaces.GPU
-def process_image_input(image, text_prompt, vision_model, processor):
-    if vision_model is None or processor is None:
-        return "Error: Vision model is not available."
-
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-        # Process the formatted prompt
-        inputs = processor(text=formatted_prompt, return_tensors="pt").to(vision_model.device)
-
-        # Generate text
-        with torch.no_grad():
-            outputs = vision_model.generate(
-                **inputs,
-                max_new_tokens=100,
-                do_sample=True,
-                top_k=50,
-                top_p=0.95,
-                num_return_sequences=1
-            )
-
-        generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-        return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"

-
-@spaces.GPU
-def generate_response(transcription, sarvam_pipe):
-    if sarvam_pipe is None:
-        return "Error: Text generation model is not available."
-
     try:
-        # Generate response using the sarvam-2b model
         response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
         return response
     except Exception as e:
         return f"Error generating response: {str(e)}"

-# Text-to-speech function
 def text_to_speech(text, lang='hi'):
     try:
-
-        if lang in ['hi', 'bn', 'gu', 'kn', 'ml', 'mr', 'or', 'pa', 'ta', 'te']:
-            # You might want to use a different TTS library here
-            # For example, you could use the Google Cloud Text-to-Speech API
-            # or a specialized Indic language TTS library
-
-            # This is a placeholder for a better Indic TTS solution
-            tts = gTTS(text=text, lang=lang, tld='co.in')  # Use Indian TLD
-        else:
-            tts = gTTS(text=text, lang=lang)
-
         tts.save("response.mp3")
         return "response.mp3"
     except Exception as e:
         print(f"Error in text-to-speech: {str(e)}")
         return None

-# Improved language detection function
-def detect_language(text):
-    lang_codes = {
-        'bn': 'Bengali', 'gu': 'Gujarati', 'hi': 'Hindi', 'kn': 'Kannada',
-        'ml': 'Malayalam', 'mr': 'Marathi', 'or': 'Oriya', 'pa': 'Punjabi',
-        'ta': 'Tamil', 'te': 'Telugu', 'en': 'English'
-    }
-
-    try:
-        detected_lang = detect(text)
-        return detected_lang if detected_lang in lang_codes else 'en'
-    except:
-        # Fallback to simple script-based detection
-        for code, lang in lang_codes.items():
-            if any(ord(char) >= 0x0900 and ord(char) <= 0x097F for char in text):  # Devanagari script
-                return 'hi'
-        return 'en'  # Default to English if no Indic script is detected
-
 @spaces.GPU
 def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     try:
-        whisper_processor, whisper_model = load_whisper()
-        sarvam_pipe = load_sarvam()
-        vision_model, processor = load_vision_model()
-
         if input_type == "audio" and audio_input is not None:
-            transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
-            # Use a default prompt if no text input is provided
             text_prompt = text_input if text_input else "Describe this image in detail."
-            transcription = process_image_input(image_input, text_prompt, vision_model, processor)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None

-        response = generate_response(transcription, sarvam_pipe)
-        lang = detect_language(response)
         audio_response = text_to_speech(response, lang)

         return transcription, response, audio_response
@@ -241,7 +104,6 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None

-
 # Custom CSS
 custom_css = """
 body {
@@ -266,33 +128,7 @@ body {
 #custom-header h1 .pink {
     color: #f472b6;
 }
-#custom-header h2 {
-def indic_vision_assistant(input_type, audio_input, text_input, image_input):
-    try:
-        whisper_processor, whisper_model = load_whisper()
-        sarvam_pipe = load_sarvam()
-        vision_model, processor = load_vision_model()
-
-        if input_type == "audio" and audio_input is not None:
-            transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
-        elif input_type == "text" and text_input:
-            transcription = text_input
-        elif input_type == "image" and image_input is not None:
-            # Use a default prompt if no text input is provided
-            text_prompt = text_input if text_input else "Describe this image in detail."
-            transcription = process_image_input(image_input, text_prompt, vision_model, processor)
-        else:
-            return "Please provide either audio, text, or image input.", "No input provided.", None
-
-        response = generate_response(transcription, sarvam_pipe)
-        lang = detect_language(response)
-        audio_response = text_to_speech(response, lang)
-
-        return transcription, response, audio_response
-    except Exception as e:
-        error_message = f"An error occurred: {str(e)}"
-        return error_message, error_message, None
-
     font-size: 1.5rem;
     color: #94a3b8;
 }
@@ -371,7 +207,8 @@ custom_suggestions = """
 </div>
 </div>
 """
-
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
@@ -405,5 +242,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     outputs=[output_transcription, output_response, output_audio]
 )
 gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
 # Launch the app
 iface.launch()

app.py (new version):

 import torch
 import librosa
+from transformers import AutoModelForCausalLM, AutoProcessor, pipeline, WhisperProcessor, WhisperForConditionalGeneration
 from gtts import gTTS
 import gradio as gr
+import spaces
 from PIL import Image
 import os
 from langdetect import detect
+import subprocess

 # Install flash-attn
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

+print("Loading models...")

+# Vision model
+vision_model_id = "microsoft/Phi-3.5-vision-instruct"
+vision_model = AutoModelForCausalLM.from_pretrained(
+    vision_model_id,
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+    use_flash_attention_2=False
+)
+vision_processor = AutoProcessor.from_pretrained(vision_model_id, trust_remote_code=True, num_crops=16)

+# Whisper model
+whisper_model_id = "openai/whisper-small"
+whisper_processor = WhisperProcessor.from_pretrained(whisper_model_id)
+whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_id)

+# Sarvam model
+sarvam_pipe = pipeline('sarvamai/sarvam-2b-v0.5')

+print("All models loaded successfully")

 @spaces.GPU
+def process_audio_input(audio):
     try:
+        whisper_model.to('cuda')
         audio, sr = librosa.load(audio, sr=16000)
+        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to('cuda')
         predicted_ids = whisper_model.generate(input_features)
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        whisper_model.to('cpu')
         return transcription
     except Exception as e:
         return f"Error processing audio: {str(e)}. Please type your message instead."

 @spaces.GPU
+def process_image_input(image, text_prompt):
     try:
+        vision_model.to('cuda')
+        messages = [
+            {"role": "user", "content": f"{text_prompt}\n<|image_1|>"},
+        ]
+        prompt = vision_processor.tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = vision_processor(prompt, image, return_tensors="pt").to("cuda")
+        generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True)
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        vision_model.to('cpu')
+        return response
     except Exception as e:
         return f"Error processing image: {str(e)}"

+def generate_response(transcription):
     try:
         response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
         return response
     except Exception as e:
         return f"Error generating response: {str(e)}"

 def text_to_speech(text, lang='hi'):
     try:
+        tts = gTTS(text=text, lang=lang, tld='co.in')
         tts.save("response.mp3")
         return "response.mp3"
     except Exception as e:
         print(f"Error in text-to-speech: {str(e)}")
         return None

 @spaces.GPU
 def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     try:
         if input_type == "audio" and audio_input is not None:
+            transcription = process_audio_input(audio_input)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
             text_prompt = text_input if text_input else "Describe this image in detail."
+            transcription = process_image_input(image_input, text_prompt)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None

+        response = generate_response(transcription)
+        lang = detect(response)
         audio_response = text_to_speech(response, lang)

         return transcription, response, audio_response
     except Exception as e:
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None

 # Custom CSS
 custom_css = """
 body {
[...]
 #custom-header h1 .pink {
     color: #f472b6;
 }
+#custom-header h2 {
     font-size: 1.5rem;
     color: #94a3b8;
 }
[...]
 </div>
 </div>
 """
+
+# Gradio interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
[...]
     outputs=[output_transcription, output_response, output_audio]
 )
 gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
+
 # Launch the app
 iface.launch()
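
Note on the detect() → gTTS handoff in the new version: langdetect can return codes that gTTS does not accept, and it can raise on empty or ambiguous text, which is what the removed detect_language() helper guarded against. A minimal sketch of such a guard is shown below; the safe_detect name and the SUPPORTED_TTS_LANGS set are illustrative assumptions, not code from this repo (the authoritative list can be read from gtts.lang.tts_langs()).

from langdetect import detect

# Assumed subset of language codes gTTS accepts; verify against gtts.lang.tts_langs().
SUPPORTED_TTS_LANGS = {'hi', 'bn', 'gu', 'kn', 'ml', 'mr', 'ta', 'te', 'en'}

def safe_detect(text, default='en'):
    # Fall back to `default` when detection fails or the detected code
    # is outside the assumed gTTS-supported set.
    try:
        code = detect(text)
    except Exception:
        return default
    return code if code in SUPPORTED_TTS_LANGS else default

# Usage with the functions above:
# lang = safe_detect(response)
# audio_response = text_to_speech(response, lang)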