Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 4,342 Bytes
69617e8 1101b16 0ed5bd6 d652179 1101b16 69617e8 b326bed 1101b16 44b7d9c 0ed5bd6 0b47c5d 69617e8 44b7d9c 69617e8 0ed5bd6 99f3aa9 44b7d9c 0b47c5d d652179 44b7d9c 750ff0f 44b7d9c d652179 0ed5bd6 d652179 0ed5bd6 d652179 69617e8 44b7d9c d92281e 44b7d9c 1101b16 69617e8 d652179 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import os
import google.generativeai as genai
import gradio as gr
import requests
from moviepy.editor import ImageClip, AudioFileClip
# Gemini client setup: the API key is read from the environment at import time.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

# Play.ht credentials, also read from the environment (None when unset).
API_KEY = os.environ.get("PLAY_API_KEY")
USER_ID = os.environ.get("PLAY_USER_ID")

# Gradio theme: the Base theme with an emerald primary hue.
theme = gr.themes.Base(primary_hue="emerald")
# Function to upload image to Gemini and get roasted text
def upload_to_gemini(path, mime_type="image/jpeg"):
    """Upload the file at ``path`` via ``genai.upload_file``.

    Args:
        path: Location of the file to upload.
        mime_type: MIME type forwarded to the upload call; defaults to
            ``"image/jpeg"``.

    Returns:
        Whatever ``genai.upload_file`` returns for the upload.
    """
    return genai.upload_file(path, mime_type=mime_type)
def generate_roast(image_path):
    """Ask Gemini for a short roast of the image at ``image_path``.

    The image is uploaded with ``upload_to_gemini`` and placed in the chat
    history, then the model is prompted to roast it.

    Args:
        image_path: Filesystem path of the image to roast.

    Returns:
        The model's response text on success. On any failure (including a
        missing ``image_path``) a string starting with
        ``"Error generating roast:"`` is returned instead of raising.
    """
    # Local import keeps this block self-contained; mimetypes is stdlib.
    import mimetypes

    if not image_path:
        return "Error generating roast: No image provided."

    try:
        # Declare the real type of the file rather than always claiming JPEG;
        # fall back to the previous default when the extension is unknown.
        guessed_type, _ = mimetypes.guess_type(image_path)
        uploaded_file = upload_to_gemini(
            image_path, mime_type=guessed_type or "image/jpeg"
        )

        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }
        model = genai.GenerativeModel(
            model_name="gemini-1.5-flash-002",
            generation_config=generation_config,
            system_instruction="You are a professional satirist and fashion expert. You will be given a profile picture. Your duty is to roast in less than 50 words whatever is given to you in the funniest way possible!",
        )
        # The uploaded image is supplied as the first user turn of the chat.
        chat_session = model.start_chat(
            history=[{"role": "user", "parts": [uploaded_file]}]
        )
        response = chat_session.send_message("Roast this image!")
        return response.text
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error generating roast: {e}"
# Function to convert text to speech with Play.ht
def text_to_speech(text):
    """Convert ``text`` to speech with the Play.ht streaming TTS endpoint.

    Args:
        text: The text to synthesize.

    Returns:
        ``"output_audio.mp3"`` (the file the audio was written to) on
        success. On failure, or when ``text`` is empty/blank, a string
        starting with ``"Error generating audio:"`` is returned instead of
        raising.
    """
    try:
        # Do not spend an API call on empty input.
        if not isinstance(text, str) or not text.strip():
            return "Error generating audio: No text provided."

        url = "https://api.play.ht/api/v2/tts/stream"
        payload = {
            "voice": "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
            "output_format": "mp3",
            "text": text,
        }
        headers = {
            "accept": "audio/mpeg",
            "content-type": "application/json",
            "Authorization": API_KEY,
            "X-User-ID": USER_ID,
        }
        # Without a timeout a stalled connection would block this handler
        # indefinitely; a timeout error is reported via the except below.
        response = requests.post(url, json=payload, headers=headers, timeout=60)
        if response.status_code != 200:
            return f"Error generating audio: {response.status_code} - {response.text}"

        # NOTE: fixed output name, so each call overwrites the previous file.
        audio_path = "output_audio.mp3"
        with open(audio_path, "wb") as audio_file:
            audio_file.write(response.content)
        return audio_path
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error generating audio: {e}"
# Function to create video from image and audio
def generate_video(image_path, audio_path):
    """Render a waveform video of the audio over the given background image.

    Args:
        image_path: Path of the image used as the waveform background.
        audio_path: The audio to visualize. Passed to ``gr.make_waveform``
            unchanged, so it may be a file path or whatever value the audio
            component supplies (presumably a ``(sample_rate, samples)``
            tuple when the component uses its default type; confirm against
            the installed Gradio version).

    Returns:
        The path returned by ``gr.make_waveform`` on success. When no usable
        audio is given, or on any failure, a string starting with
        ``"Error generating video:"`` is returned instead of raising.
    """
    try:
        # Only strings can carry an upstream error message; running the
        # substring test on other values (e.g. a tuple holding an array)
        # is meaningless and can itself raise.
        has_error_text = isinstance(audio_path, str) and "Error" in audio_path
        if audio_path is None or has_error_text:
            return "Error generating video: No valid audio file."

        # Hand the audio over as-is: wrapping a path in (16000, path) is not
        # a valid make_waveform input.
        return gr.make_waveform(audio_path, bg_image=image_path)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error generating video: {e}"
# Gradio Interface
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# Image to Text-to-Speech Roasting App")
    gr.Markdown("Upload an image, and the AI will roast it, convert the roast to audio, and create a video output.")

    # NOTE(review): the original nesting was lost; this assumes only the
    # image input lives inside the row. Confirm against the deployed layout.
    with gr.Row():
        image_input = gr.Image(label="Upload Image", type="filepath")

    # Result widgets, one per pipeline stage.
    output_text = gr.Textbox(label="Roast Text")
    audio_output = gr.Audio(label="Roast Audio")
    video_output = gr.Video(label="Roast Video")

    # One trigger button per stage, created in pipeline order.
    roast_button = gr.Button("Generate Roast Text")
    audio_button = gr.Button("Generate Roast Audio")
    video_button = gr.Button("Generate Roast Video")

    # image -> roast text
    roast_button.click(fn=generate_roast, inputs=image_input, outputs=output_text)
    # roast text -> audio
    audio_button.click(fn=text_to_speech, inputs=output_text, outputs=audio_output)
    # image + audio -> video
    video_button.click(
        fn=generate_video,
        inputs=[image_input, audio_output],
        outputs=video_output,
    )

# Start the app with debug output enabled.
demo.launch(debug=True)
|