Image2Audio

Sleeping

App Files Files Community

arxivgpt kim committed on Feb 2, 2024

Commit

3e510b3

verified ·

1 Parent(s): a8d74c6

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -19

app.py CHANGED Viewed

@@ -1,35 +1,183 @@
 import gradio as gr
 import requests
 def search_pexels_images(query):
-    API_KEY = '5woz23MGx1QrSY0WHFb0BRi29JvbXPu97Hg0xnklYgHUI8G0w23FKH62'  # Pexels API 키를 여기에 입력하세요.
-    # 한 페이지당 검색 결과 수를 10개로 설정
     url = f"https://api.pexels.com/v1/search?query={query}&per_page=80"
-    headers = {
-        "Authorization": API_KEY
-    }
     response = requests.get(url, headers=headers)
     data = response.json()
     images_urls = [photo['src']['medium'] for photo in data['photos']]
     return images_urls
 def show_search_results(query):
     images_urls = search_pexels_images(query)
     return images_urls
-with gr.Blocks() as app:
-    with gr.Column():
-        gr.Markdown("### Image SFX Generator with Pexels Image Search")
-        search_query = gr.Textbox(label="사진 검색")
-        search_btn = gr.Button("검색")
-        images_output = gr.Gallery(label="검색 결과 이미지")
-    search_btn.click(
-        fn=show_search_results,
-        inputs=search_query,
-        outputs=images_output
     )
-app.launch(debug=True)

 import gradio as gr
 import requests
+import gradio as gr
+from gradio_client import Client
+import json
+import re
+from moviepy.editor import VideoFileClip
+from moviepy.audio.AudioClip import AudioClip
+# Pexels image search function
 def search_pexels_images(query):
+    """Search Pexels for photos matching *query* and return their URLs.
+
+    Args:
+        query: Free-text search term.
+
+    Returns:
+        A list with the ``src.medium`` URL of every photo in the response
+        (one page of up to 80 results is requested). Empty when the
+        response carries no ``photos`` entry.
+
+    Raises:
+        requests.HTTPError: If the API answers with a 4xx/5xx status.
+        requests.RequestException: On connection failure or timeout.
+    """
+    # NOTE(review): this API key is committed in source. It should be
+    # rotated and loaded from configuration/secrets instead; left in place
+    # here so existing deployments keep working.
+    API_KEY = '5woz23MGx1QrSY0WHFb0BRi29JvbXPu97Hg0xnklYgHUI8G0w23FKH62'
+    url = "https://api.pexels.com/v1/search"
+    headers = {"Authorization": API_KEY}
+    # Let requests build the query string so spaces, '&', '#' and non-ASCII
+    # characters in the search term are URL-encoded rather than corrupting
+    # the request.
+    response = requests.get(
+        url,
+        headers=headers,
+        params={"query": query, "per_page": 80},
+        timeout=30,  # never hang the UI on a stalled connection
+    )
+    # Surface HTTP errors (bad key, rate limit) instead of failing later
+    # with an opaque KeyError on the error payload.
+    response.raise_for_status()
     data = response.json()
+    images_urls = [photo['src']['medium'] for photo in data.get('photos', [])]
     return images_urls
+# Pexels image search result display function
 def show_search_results(query):
+    """Return the image URLs found for *query*.
+
+    Thin adapter used as the Gradio click handler: it forwards the text of
+    the search box to ``search_pexels_images`` and hands the resulting list
+    back unchanged.
+    """
+    return search_pexels_images(query)
+def extract_audio(video_in):
+    """Extract the audio track of a video file into ``audio.wav``.
+
+    Args:
+        video_in: Path of the video file to read.
+
+    Returns:
+        The path of the written audio file (``'audio.wav'`` in the current
+        working directory; an existing file of that name is overwritten).
+
+    Raises:
+        ValueError: If the video has no audio track.
+    """
+    output_audio = 'audio.wav'
+    # Open the video file and extract the audio
+    video_clip = VideoFileClip(video_in)
+    try:
+        audio_clip = video_clip.audio
+        if audio_clip is None:
+            # Fail with a clear message instead of an AttributeError on None.
+            raise ValueError(f"No audio track found in {video_in!r}")
+        # Save the audio as a .wav file, using 44100 Hz as the sample rate
+        audio_clip.write_audiofile(output_audio, fps=44100)
+    finally:
+        # Release the underlying reader even when writing fails, so
+        # repeated requests do not accumulate open file handles.
+        video_clip.close()
+    print("Audio extraction complete.")
+    return output_audio
+def get_caption_from_kosmos(image_in):
+    """Caption an image with the hosted Kosmos-2 Space.
+
+    Requests a "Detailed" description from the ``ydshieh-kosmos-2`` Space,
+    rebuilds the sentence from the JSON file referenced by the second
+    element of the response, strips the leading
+    "Describe this image in detail:" prompt and cuts the text after its
+    last full stop.
+
+    Args:
+        image_in: File path or URL of the image to describe.
+
+    Returns:
+        The caption text. It is truncated after the last "." when one is
+        present; otherwise the whole description is returned. If the
+        expected prompt prefix is missing, the full reconstructed sentence
+        is used as the description.
+    """
+    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
+    kosmos2_result = kosmos2_client.predict(
+        image_in,  # str (filepath or URL to image) in 'Test Image' Image component
+        "Detailed",  # str in 'Description Type' Radio component
+        fn_index=4
     )
+    print(f"KOSMOS2 RETURNS: {kosmos2_result}")
+    with open(kosmos2_result[1], 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    # Each entry's first item is joined with spaces to rebuild the text
+    # (presumably entries are [fragment, label] pairs - confirm against the
+    # Space's output format).
+    full_sentence = ' '.join(sublist[0] for sublist in data)
+    # Expected format: "Describe this image in detail:" followed by optional
+    # whitespace and then the description itself.
+    pattern = r'^Describe this image in detail:\s*(.*)$'
+    # DOTALL lets the description span several lines; without it any
+    # embedded newline makes the whole match fail.
+    match = re.search(pattern, full_sentence, flags=re.DOTALL)
+    if match:
+        description = match.group(1)
+        print(description)
+    else:
+        print("Unable to locate valid description.")
+        # Fall back to the raw text; previously `description` stayed
+        # unbound here and the function crashed on the next statement.
+        description = full_sentence
+    # Drop a trailing incomplete sentence by cutting after the last ".".
+    last_period_index = description.rfind('.')
+    if last_period_index != -1:
+        truncated_caption = description[:last_period_index + 1]
+    else:
+        # No full stop at all: keep the text rather than returning "".
+        truncated_caption = description
+    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
+    return truncated_caption
+def get_caption(image_in):
+    """Ask the hosted moondream1 Space for a one-sentence image description.
+
+    Args:
+        image_in: File path of the image to describe.
+
+    Returns:
+        Whatever the Space's ``/answer_question`` endpoint returns for the
+        fixed question (also printed to stdout).
+    """
+    question = "Describe precisely the image in one sentence."
+    moondream = Client("https://vikhyatk-moondream1.hf.space/")
+    # Positional inputs: the image component first, then the question box.
+    answer = moondream.predict(image_in, question, api_name="/answer_question")
+    print(answer)
+    return answer
+def get_magnet(prompt):
+    """Generate audio for *prompt* with the hosted MAGNeT Space.
+
+    Args:
+        prompt: Text description to turn into sound.
+
+    Returns:
+        The second element of the Space's ``/predict_full`` response
+        (presumably the generated audio file - confirm against the Space).
+    """
+    amended_prompt = f"{prompt}"
+    print(amended_prompt)
+    # Positional inputs of the Space's /predict_full endpoint, in order.
+    magnet_inputs = (
+        "facebook/audio-magnet-medium",  # 'Model'
+        "",                              # 'Model Path (custom models)'
+        amended_prompt,                  # 'Input Text'
+        3,                               # 'Temperature'
+        0.9,                             # 'Top-p'
+        10,                              # 'Max CFG coefficient'
+        1,                               # 'Min CFG coefficient'
+        20,                              # 'Decoding Steps (stage 1)'
+        10,                              # 'Decoding Steps (stage 2)'
+        10,                              # 'Decoding Steps (stage 3)'
+        10,                              # 'Decoding Steps (stage 4)'
+        "prod-stride1 (new!)",           # 'Span Scoring'
+    )
+    magnet_client = Client("https://fffiloni-magnet.hf.space/")
+    response = magnet_client.predict(*magnet_inputs, api_name="/predict_full")
+    print(response)
+    return response[1]
+def get_audioldm(prompt):
+    """Generate audio for *prompt* with the hosted AudioLDM-2 Space.
+
+    The Space's response is passed to ``extract_audio``, which writes its
+    audio track to a .wav file.
+
+    Args:
+        prompt: Text description to turn into sound.
+
+    Returns:
+        The path returned by ``extract_audio``.
+    """
+    # Positional inputs of the Space endpoint selected by fn_index=1.
+    audioldm_inputs = (
+        prompt,                 # 'Input text'
+        "Low quality. Music.",  # 'Negative prompt'
+        10,                     # 'Duration (seconds)', between 5 and 15
+        3.5,                    # 'Guidance scale', between 0 and 7
+        45,                     # 'Seed'
+        3,                      # 'Number waveforms to generate', 1 to 5
+    )
+    audioldm_client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
+    generated = audioldm_client.predict(*audioldm_inputs, fn_index=1)
+    print(generated)
+    return extract_audio(generated)
+def get_audiogen(prompt):
+    """Generate audio for *prompt* with the hosted AudioGen Space.
+
+    Args:
+        prompt: Text description to turn into sound.
+
+    Returns:
+        The unmodified response of the Space's ``/infer`` endpoint.
+    """
+    audiogen_client = Client("https://fffiloni-audiogen.hf.space/")
+    # Second positional input is the constant 10 (presumably the clip
+    # duration in seconds - confirm against the Space's API).
+    return audiogen_client.predict(prompt, 10, api_name="/infer")
+def infer(image_in, chosen_model):
+    """Turn an image into a sound effect using the selected audio model.
+
+    The image is captioned first with ``get_caption``; the caption is then
+    used as the prompt for the chosen generator.
+
+    Args:
+        image_in: File path of the input image.
+        chosen_model: One of "MAGNet", "AudioLDM-2" or "AudioGen".
+
+    Returns:
+        The result of the selected generator, or ``None`` when
+        *chosen_model* is none of the three known names.
+    """
+    caption = get_caption(image_in)
+    if chosen_model == "MAGNet":
+        return get_magnet(caption)
+    if chosen_model == "AudioLDM-2":
+        return get_audioldm(caption)
+    if chosen_model == "AudioGen":
+        return get_audiogen(caption)
+    return None
+# Stylesheet that centres an element with id "col-container" and caps its
+# width at 800px.
+# NOTE(review): `css` is not passed to gr.Blocks() below, and no component
+# below sets elem_id="col-container", so these rules are currently not applied.
+css="""
+#col-container{
+    margin: 0 auto;
+    max-width: 800px;
+}
+"""
+# Two-tab Gradio UI: image-to-sound-effect generation and Pexels image search.
+with gr.Blocks() as app:
+    with gr.Tabs():
+        # Tab 1: upload an image, pick an audio model, get generated audio.
+        with gr.TabItem("Image to SFX"):
+            with gr.Column():
+                gr.Markdown("### Image to SFX")
+                # type="filepath" makes the handler receive a path string.
+                image_in = gr.Image(sources=["upload"], type="filepath", label="Image input")
+                chosen_model = gr.Radio(label="Choose a model", choices=["MAGNet", "AudioLDM-2", "AudioGen"], value="AudioLDM-2")
+                submit_btn = gr.Button("Submit")
+                audio_o = gr.Audio(label="Audio output")
+                # Clicking "Submit" runs infer(image, model) and shows its
+                # return value in the audio player.
+                submit_btn.click(
+                    fn=infer,
+                    inputs=[image_in, chosen_model],
+                    outputs=audio_o
+                )
+        # Tab 2: free-text photo search; results are shown in a gallery.
+        with gr.TabItem("Pexels Image Search"):
+            with gr.Column():
+                gr.Markdown("### Pexels Image Search")
+                search_query = gr.Textbox(label="사진 검색")
+                search_btn = gr.Button("검색")
+                images_output = gr.Gallery(label="검색 결과 이미지")
+                # Clicking the search button runs show_search_results(query)
+                # and fills the gallery with the returned image URLs.
+                search_btn.click(
+                    fn=show_search_results,
+                    inputs=search_query,
+                    outputs=images_output
+                )
+# Start the app at import time with Gradio's debug mode enabled.
+app.launch(debug=True)