import gradio as gr
from transformers import pipeline, AutoProcessor, LlavaForConditionalGeneration
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
from PIL import Image
import torch
import cv2
import soundfile as sf
import time

# --- Set up Models ---
# Stable Diffusion for image generation
scheduler = EulerDiscreteScheduler.from_pretrained(
    "stabilityai/stable-diffusion-2-1", subfolder="scheduler"
)
pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", scheduler=scheduler, torch_dtype=torch.float16
).to("cuda")

# LLaVA for vision-based language understanding.
# LLaVA checkpoints need the multimodal model class and its processor (not the
# plain causal-LM classes) so the image can be passed in alongside the prompt;
# fp16 keeps the 8B model within a single GPU's memory.
processor = AutoProcessor.from_pretrained("xtuner/llava-llama-3-8b-v1_1-transformers")
model = LlavaForConditionalGeneration.from_pretrained(
    "xtuner/llava-llama-3-8b-v1_1-transformers", torch_dtype=torch.float16
).to("cuda")

# Open-source language model for text generation (e.g., GPT-Neo)
gpt_neo_pipe = pipeline("text-generation", model="EleutherAI/gpt-neo-1.3B")

# Text-to-Speech. ESPnet checkpoints are not loadable through the transformers
# text-to-speech pipeline, so a pipeline-compatible voice (MMS-TTS English) is
# swapped in here; substitute any other pipeline-supported TTS model.
text_to_speech = pipeline("text-to-speech", model="facebook/mms-tts-eng")

# --- Functions ---
def process_image(image, chat_history):
    """Describes an image with LLaVA and speaks the response."""
    if image is None:
        return chat_history, None
    # Minimal LLaVA prompt; the <image> token marks where the image features
    # are spliced in. The model card's chat template gives better results.
    prompt = "<image>\nWhat do you see in this image?"
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda", torch.float16)
    # Generate response using LLaVA (generate, not a single forward pass)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=128)
    # Decode only the newly generated tokens, not the prompt.
    response = processor.decode(
        output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    # Generate speech from the response; the pipeline returns
    # {"audio": ndarray, "sampling_rate": int}.
    speech = text_to_speech(response)
    audio_path = "generated_audio.wav"
    sf.write(audio_path, speech["audio"].squeeze(), samplerate=speech["sampling_rate"])
    # Update chat history
    chat_history += "You: [image]\n"
    chat_history += "Model: " + response + "\n"
    return chat_history, audio_path

def generate_image(prompt, chat_history):
    """Generates an image with Stable Diffusion from a text prompt."""
    image = pipe(
        prompt=prompt,
        guidance_scale=7.5,
        num_inference_steps=50,
    ).images[0]
    # Update chat history
    chat_history += "You: " + prompt + "\n"
    chat_history += "Model: [image]\n"
    return chat_history, image

def process_text(text, chat_history):
    """Generates a reply with GPT-Neo and speaks it."""
    # Generate response using GPT-Neo
    response = gpt_neo_pipe(
        text,
        max_length=100,
        num_return_sequences=1,
    )[0]["generated_text"]
    # Generate speech from the response
    speech = text_to_speech(response)
    audio_path = "generated_audio.wav"
    sf.write(audio_path, speech["audio"].squeeze(), samplerate=speech["sampling_rate"])
    # Update chat history
    chat_history += "You: " + text + "\n"
    chat_history += "Model: " + response + "\n"
    return chat_history, audio_path

# --- Webcam Capture ---
def capture_image():
    """Captures a single frame from the default webcam."""
    cap = cv2.VideoCapture(0)
    ret, frame = cap.read()
    cap.release()
    if not ret:
        raise gr.Error("Could not read a frame from the webcam.")
    # OpenCV delivers BGR; convert to RGB before handing the frame to Gradio.
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("## Llama-LLaVA Vision Speech Assistant")
    chat_history = gr.Textbox(label="Chat History", lines=10, interactive=False)
    # type="pil" so handlers receive PIL images directly (no base64 round-trip)
    image_input = gr.Image(label="Uploaded Image", type="pil")
    text_input = gr.Textbox(label="Enter Text")
    audio_output = gr.Audio(label="Audio Response")

    # Screenshot button
    screenshot_button = gr.Button("Capture Screenshot")
    screenshot_button.click(fn=capture_image, outputs=image_input)
    # Image processing (LLaVA)
    image_input.change(
        fn=process_image,
        inputs=[image_input, chat_history],
        outputs=[chat_history, audio_output],
    )

    # Text processing (GPT-Neo)
    text_input.submit(
        fn=process_text,
        inputs=[text_input, chat_history],
        outputs=[chat_history, audio_output],
    )

    # Image generation (Stable Diffusion)
    with gr.Tab("Image Generation"):
        image_prompt = gr.Textbox(label="Enter image prompt:")
        image_generation_output = gr.Image(label="Generated Image")
        generate_image_button = gr.Button("Generate Image")
        generate_image_button.click(
            fn=generate_image,
            inputs=[image_prompt, chat_history],
            outputs=[chat_history, image_generation_output],
        )

    # Webcam stream
    with gr.Tab("Webcam"):
        webcam_output = gr.Image(label="Webcam Feed", interactive=False)

        # Stream one frame per second from the server-side webcam.
        def update_webcam():
            cap = cv2.VideoCapture(0)
            try:
                while True:
                    ret, frame = cap.read()
                    if not ret:
                        break
                    yield Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                    time.sleep(1)  # Update every second
            finally:
                cap.release()

        # Assigning a generator to `webcam_output.source` does nothing; wire it
        # to an event instead, since yields from a generator fn are streamed
        # to the output component.
        demo.load(fn=update_webcam, outputs=webcam_output)

demo.launch(share=True)