Spaces:
PlayHT
/
Running on CPU Upgrade

File size: 4,342 Bytes
69617e8
 
1101b16
0ed5bd6
d652179
1101b16
69617e8
 
 
 
b326bed
 
1101b16
44b7d9c
0ed5bd6
 
 
 
0b47c5d
69617e8
 
 
 
 
44b7d9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69617e8
0ed5bd6
99f3aa9
44b7d9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b47c5d
d652179
 
44b7d9c
 
 
 
750ff0f
 
 
 
 
44b7d9c
 
 
d652179
0ed5bd6
d652179
0ed5bd6
d652179
69617e8
 
44b7d9c
d92281e
44b7d9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1101b16
69617e8
d652179
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import google.generativeai as genai
import gradio as gr
import requests
from moviepy.editor import ImageClip, AudioFileClip

# Google Gemini client setup: the API key comes from the GEMINI_API_KEY
# environment variable (None when the variable is unset).
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

# Play.ht credentials, also read from the environment (None when unset).
API_KEY = os.environ.get("PLAY_API_KEY")
USER_ID = os.environ.get("PLAY_USER_ID")

# Gradio theme used by the UI: the Base theme with an emerald primary hue.
theme = gr.themes.Base(primary_hue="emerald")

# Upload a local file to Gemini so it can be attached to a prompt
def upload_to_gemini(path, mime_type="image/jpeg"):
    """Upload the file at *path* with ``genai.upload_file`` and return the result.

    Args:
        path: Location of the file to upload.
        mime_type: MIME type forwarded to the upload call.

    Returns:
        Whatever ``genai.upload_file`` returns for the uploaded file.
    """
    return genai.upload_file(path, mime_type=mime_type)

def generate_roast(image_path):
    """Ask Gemini to roast the image stored at *image_path*.

    The image is uploaded with a MIME type guessed from its file name
    (falling back to ``image/jpeg``), placed in the chat history, and the
    model is then asked to roast it.

    Args:
        image_path: Filesystem path of the image to roast.

    Returns:
        The model's response text on success. On any failure (including a
        missing image) a string starting with ``"Error generating roast:"``
        is returned instead of raising.
    """
    import mimetypes

    # Reject a missing image up front with a clear message instead of
    # letting the upload call fail with an opaque exception.
    if not image_path:
        return "Error generating roast: No image provided."

    try:
        # Label the upload with the file's real image type; only fall back
        # to JPEG when the extension is unknown or not an image type.
        guessed_type = mimetypes.guess_type(image_path)[0]
        if guessed_type and guessed_type.startswith("image/"):
            mime_type = guessed_type
        else:
            mime_type = "image/jpeg"

        uploaded_file = upload_to_gemini(image_path, mime_type=mime_type)
        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }
        model = genai.GenerativeModel(
            model_name="gemini-1.5-flash-002",
            generation_config=generation_config,
            system_instruction="You are a professional satirist and fashion expert. You will be given a profile picture. Your duty is to roast in less than 50 words whatever is given to you in the funniest way possible!",
        )

        # The uploaded image is seeded as the first user turn so the
        # follow-up message can refer to it.
        chat_session = model.start_chat(
            history=[{"role": "user", "parts": [uploaded_file]}]
        )
        response = chat_session.send_message("Roast this image!")
        return response.text
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"Error generating roast: {e}"

# Function to convert text to speech with Play.ht
def text_to_speech(text):
    """Convert *text* to an MP3 file using the Play.ht streaming TTS endpoint.

    Args:
        text: The text to synthesise.

    Returns:
        The path of the written MP3 file on success. On failure (empty text,
        non-200 response, or any exception) a string starting with
        ``"Error generating audio:"`` is returned instead of raising.
    """
    import tempfile

    # Nothing to synthesise: avoid a pointless API call.
    if not text or not str(text).strip():
        return "Error generating audio: No text provided."

    try:
        url = "https://api.play.ht/api/v2/tts/stream"
        payload = {
            "voice": "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
            "output_format": "mp3",
            "text": text,
        }
        headers = {
            "accept": "audio/mpeg",
            "content-type": "application/json",
            "Authorization": API_KEY,
            "X-User-ID": USER_ID,
        }

        # (connect, read) timeouts so a stalled request cannot hang the
        # worker forever.
        response = requests.post(
            url, json=payload, headers=headers, timeout=(10, 120)
        )
        if response.status_code != 200:
            return f"Error generating audio: {response.status_code} - {response.text}"

        # Write each result to its own temp file so concurrent requests do
        # not overwrite one another's audio. delete=False keeps the file on
        # disk after the handle is closed so the caller can read it.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
            audio_file.write(response.content)
        return audio_file.name
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"Error generating audio: {e}"

# Function to create video from image and audio
def generate_video(image_path, audio_path):
    """Render a waveform video of the audio over the given background image.

    Args:
        image_path: Path of the image passed to ``gr.make_waveform`` as
            ``bg_image``.
        audio_path: The audio to render. It is forwarded unchanged to
            ``gr.make_waveform``, which per the Gradio docs accepts a file
            path or a ``(sample_rate, samples)`` tuple.
            NOTE(review): a ``gr.Audio`` input with default settings
            presumably delivers the tuple form rather than a path; confirm.

    Returns:
        The value returned by ``gr.make_waveform`` on success, otherwise a
        string starting with ``"Error generating video:"``.
    """
    # A missing value, an empty string, or an error message produced by the
    # text-to-speech step is not usable audio. The check is limited to
    # strings so a (sample_rate, samples) tuple is never searched for "Error".
    if audio_path is None or (
        isinstance(audio_path, str)
        and (not audio_path or audio_path.startswith("Error"))
    ):
        return "Error generating video: No valid audio file."

    try:
        # Pass the audio through as-is; wrapping a path in a
        # (sample_rate, path) tuple is not a form make_waveform accepts.
        return gr.make_waveform(audio_path, bg_image=image_path)
    except Exception as e:
        # UI boundary: report the failure as text rather than raising.
        return f"Error generating video: {e}"

# Gradio Interface
with gr.Blocks(theme=theme) as demo:
    # Page header
    gr.Markdown("# Image to Text-to-Speech Roasting App")
    gr.Markdown("Upload an image, and the AI will roast it, convert the roast to audio, and create a video output.")

    # Input: image component configured with type="filepath"
    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload Image")

    # Outputs, one per pipeline stage
    output_text = gr.Textbox(label="Roast Text")
    audio_output = gr.Audio(label="Roast Audio")
    video_output = gr.Video(label="Roast Video")

    # One button per stage, created in the order they appear on the page
    roast_button = gr.Button("Generate Roast Text")
    audio_button = gr.Button("Generate Roast Audio")
    video_button = gr.Button("Generate Roast Video")

    # Event wiring: image -> roast text -> roast audio -> roast video
    roast_button.click(
        fn=generate_roast,
        inputs=image_input,
        outputs=output_text,
    )
    audio_button.click(
        fn=text_to_speech,
        inputs=output_text,
        outputs=audio_output,
    )
    video_button.click(
        fn=generate_video,
        inputs=[image_input, audio_output],
        outputs=video_output,
    )

# Launch the app
demo.launch(debug=True)