Image2Audio

Sleeping

App Files Files Community

arxivgpt kim committed on Feb 2, 2024

Commit

976dd11

verified ·

1 Parent(s): d3cb1f3

Update app.py

Browse files

Files changed (1) hide show

app.py +1 -163

app.py CHANGED Viewed

@@ -1,9 +1,4 @@
-import gradio as gr
-from gradio_client import Client
-import json
-import re
-from moviepy.editor import VideoFileClip
-from moviepy.audio.AudioClip import AudioClip
 import requests
 def search_pexels_images(query):
@@ -19,136 +14,6 @@ def search_pexels_images(query):
     images_urls = [photo['src']['medium'] for photo in data['photos']]
     return images_urls
-def extract_audio(video_in):
-    input_video = video_in
-    output_audio = 'audio.wav'
-    # Open the video file and extract the audio
-    video_clip = VideoFileClip(input_video)
-    audio_clip = video_clip.audio
-    # Save the audio as a .wav file
-    audio_clip.write_audiofile(output_audio, fps=44100)  # Use 44100 Hz as the sample rate for .wav files
-    print("Audio extraction complete.")
-    return 'audio.wav'
-def get_caption_from_kosmos(image_in):
-    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
-    kosmos2_result = kosmos2_client.predict(
-        image_in,	# str (filepath or URL to image) in 'Test Image' Image component
-        "Detailed",	# str in 'Description Type' Radio component
-        fn_index=4
-    )
-    print(f"KOSMOS2 RETURNS: {kosmos2_result}")
-    with open(kosmos2_result[1], 'r') as f:
-        data = json.load(f)
-    reconstructed_sentence = []
-    for sublist in data:
-        reconstructed_sentence.append(sublist[0])
-    full_sentence = ' '.join(reconstructed_sentence)
-    #print(full_sentence)
-    # Find the pattern matching the expected format ("Describe this image in detail:" followed by optional space and then the rest)...
-    pattern = r'^Describe this image in detail:\s*(.*)$'
-    # Apply the regex pattern to extract the description text.
-    match = re.search(pattern, full_sentence)
-    if match:
-        description = match.group(1)
-        print(description)
-    else:
-        print("Unable to locate valid description.")
-    # Find the last occurrence of "."
-    last_period_index = description.rfind('.')
-    # Truncate the string up to the last period
-    truncated_caption = description[:last_period_index + 1]
-    # print(truncated_caption)
-    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
-    return truncated_caption
-def get_caption(image_in):
-    client = Client("https://vikhyatk-moondream1.hf.space/")
-    result = client.predict(
-		image_in,	# filepath  in 'image' Image component
-		"Describe precisely the image in one sentence.",	# str  in 'Question' Textbox component
-		api_name="/answer_question"
-    )
-    print(result)
-    return result
-def get_magnet(prompt):
-    amended_prompt = f"{prompt}"
-    print(amended_prompt)
-    client = Client("https://fffiloni-magnet.hf.space/")
-    result = client.predict(
-        "facebook/audio-magnet-medium",	# Literal['facebook/magnet-small-10secs', 'facebook/magnet-medium-10secs', 'facebook/magnet-small-30secs', 'facebook/magnet-medium-30secs', 'facebook/audio-magnet-small', 'facebook/audio-magnet-medium']  in 'Model' Radio component
-        "",	# str  in 'Model Path (custom models)' Textbox component
-        amended_prompt,	# str  in 'Input Text' Textbox component
-        3,	# float  in 'Temperature' Number component
-        0.9,	# float  in 'Top-p' Number component
-        10,	# float  in 'Max CFG coefficient' Number component
-        1,	# float  in 'Min CFG coefficient' Number component
-        20,	# float  in 'Decoding Steps (stage 1)' Number component
-        10,	# float  in 'Decoding Steps (stage 2)' Number component
-        10,	# float  in 'Decoding Steps (stage 3)' Number component
-        10,	# float  in 'Decoding Steps (stage 4)' Number component
-        "prod-stride1 (new!)",	# Literal['max-nonoverlap', 'prod-stride1 (new!)']  in 'Span Scoring' Radio component
-        api_name="/predict_full"
-    )
-    print(result)
-    return result[1]
-def get_audioldm(prompt):
-    client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
-    result = client.predict(
-        prompt,	# str in 'Input text' Textbox component
-        "Low quality. Music.",	# str in 'Negative prompt' Textbox component
-        10,	# int | float (numeric value between 5 and 15) in 'Duration (seconds)' Slider component
-        3.5,	# int | float (numeric value between 0 and 7) in 'Guidance scale' Slider component
-        45,	# int | float in 'Seed' Number component
-        3,	# int | float (numeric value between 1 and 5) in 'Number waveforms to generate' Slider component
-        fn_index=1
-    )
-    print(result)
-    audio_result = extract_audio(result)
-    return audio_result
-def get_audiogen(prompt):
-    client = Client("https://fffiloni-audiogen.hf.space/")
-    result = client.predict(
-        prompt,
-        10,
-        api_name="/infer"
-    )
-    return result
-def infer(image_in, chosen_model):
-    caption = get_caption(image_in)
-    if chosen_model == "MAGNet" :
-        magnet_result = get_magnet(caption)
-        return magnet_result
-    elif chosen_model == "AudioLDM-2" :
-        audioldm_result = get_audioldm(caption)
-        return audioldm_result
-    elif chosen_model == "AudioGen" :
-        audiogen_result = get_audiogen(caption)
-        return audiogen_result
-css="""
-#col-container{
-    margin: 0 auto;
-    max-width: 800px;
-}
-"""
 def show_search_results(query):
     images_urls = search_pexels_images(query)
@@ -167,30 +32,3 @@ with gr.Blocks() as app:
         outputs=images_output
     )
 app.launch(debug=True)
-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.HTML("""
-        <h2 style="text-align: center;">
-            Image to SFX
-        </h2>
-        <p style="text-align: center;">
-            Compare MAGNet, AudioLDM2 and AudioGen sound effects generation from image caption.
-        </p>
-        """)
-        with gr.Column():
-            image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="oiseau.png")
-            with gr.Row():
-                chosen_model = gr.Radio(label="Choose a model", choices=["MAGNet", "AudioLDM-2", "AudioGen"], value="AudioLDM-2")
-                submit_btn = gr.Button("Submit")
-        with gr.Column():
-            audio_o = gr.Audio(label="Audio output")
-    submit_btn.click(
-        fn=infer,
-        inputs=[image_in, chosen_model],
-        outputs=[audio_o]
-    )
-demo.queue(max_size=10).launch(debug=True)

 import requests
 def search_pexels_images(query):
     images_urls = [photo['src']['medium'] for photo in data['photos']]
     return images_urls
 def show_search_results(query):
     images_urls = search_pexels_images(query)
         outputs=images_output
     )
 app.launch(debug=True)