ruslanmv committed on
Commit
351d597
·
verified ·
1 Parent(s): a1316a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -29
app.py CHANGED
@@ -45,10 +45,11 @@ tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
45
  model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
46
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
47
  model.to(device)
48
- print(device)
49
 
50
 
51
  def get_output_video(text):
 
52
  inputs = tokenizer(text,
53
  max_length=1024,
54
  truncation=True,
@@ -58,6 +59,7 @@ def get_output_video(text):
58
  skip_special_tokens=True,
59
  clean_up_tokenization_spaces=False)
60
  plot = list(summary[0].split('.'))
 
61
 
62
  '''
63
  The required models will be downloaded to models_root if they are not already there.
@@ -68,15 +70,16 @@ def get_output_video(text):
68
  '''
69
  @spaces.GPU(duration=60 * 3)
70
  def generate_image(
71
- is_mega: bool,
72
- text: str,
73
- seed: int,
74
- grid_size: int,
75
- top_k: int,
76
- image_path: str,
77
- models_root: str,
78
- fp16: bool,
79
  ):
 
80
  model = MinDalle(
81
  is_mega=is_mega,
82
  models_root=models_root,
@@ -94,21 +97,28 @@ def get_output_video(text):
94
  top_k=top_k,
95
  is_verbose=True
96
  )
 
97
  return image
98
 
99
 
100
  generated_images = []
101
- for senten in plot[:-1]:
102
- image = generate_image(
103
- is_mega=True,
104
- text=senten,
105
- seed=1,
106
- grid_size=1, # param {type:"integer"}
107
- top_k=256, # param {type:"integer"}
108
- image_path='generated',
109
- models_root='pretrained',
110
- fp16=True, )
111
- generated_images.append(image)
 
 
 
 
 
 
112
 
113
  # Step 4- Creation of the subtitles
114
  sentences = plot[:-1]
@@ -121,6 +131,7 @@ def get_output_video(text):
121
  for k in range(len(generated_images)):
122
  subtitles = tokenize.sent_tokenize(sentences[k])
123
  sub_names.append(subtitles)
 
124
 
125
  # Step 5- Adding Subtitles to the Images
126
  def draw_multiple_line_text(image, text, font, text_color, text_start_height):
@@ -165,6 +176,7 @@ def get_output_video(text):
165
  text_to_add = sub_names[k][0]
166
  result = add_text_to_img(text_to_add, imagenes)
167
  generated_images_sub.append(result)
 
168
 
169
  # Step 7 - Creation of audio
170
  c = 0
@@ -172,7 +184,7 @@ def get_output_video(text):
172
  mp3_lengths = []
173
  for k in range(len(generated_images)):
174
  text_to_add = sub_names[k][0]
175
- print(text_to_add)
176
  f_name = 'audio_' + str(c) + '.mp3'
177
  mp3_names.append(f_name)
178
  # The text that you want to convert to audio
@@ -190,7 +202,7 @@ def get_output_video(text):
190
  audio = AudioSegment.from_file(sound_file, format="mp3")
191
  duration = len(audio) / 1000
192
  mp3_lengths.append(duration)
193
- print(duration)
194
  c += 1
195
 
196
  # Step 8 - Merge audio files
@@ -201,16 +213,16 @@ def get_output_video(text):
201
 
202
  for n, mp3_file in enumerate(mp3_names):
203
  mp3_file = mp3_file.replace(chr(92), '/')
204
- print(n, mp3_file)
205
  # Load the current mp3 into `audio_segment`
206
  audio_segment = AudioSegment.from_mp3(mp3_file)
207
  # Just accumulate the new `audio_segment` + `silence`
208
  full_audio += audio_segment + silence
209
- print('Merging ', n)
210
  # The loop will exit once all files in the list have been used
211
  # Then export
212
  full_audio.export(export_path, format='mp3')
213
- print('\ndone!')
214
 
215
  # Step 9 - Creation of the video with adjusted times of the sound
216
  c = 0
@@ -219,18 +231,20 @@ def get_output_video(text):
219
  f_name = 'img_' + str(c) + '.jpg'
220
  file_names.append(f_name)
221
  img.save(f_name)
 
222
  c += 1
223
- print(file_names)
224
 
225
  clips = []
226
  d = 0
227
  for m in file_names:
228
  duration = mp3_lengths[d]
229
- print(d, duration)
230
  clips.append(mpe.ImageClip(m).set_duration(duration + 0.5))
231
  d += 1
232
  concat_clip = mpe.concatenate_videoclips(clips, method="compose")
233
  concat_clip.write_videofile("result_new.mp4", fps=24)
 
234
 
235
  # Step 10 - Merge Video + Audio
236
  movie_name = 'result_new.mp4'
@@ -244,6 +258,7 @@ def get_output_video(text):
244
  final_clip.write_videofile(outname, fps=fps)
245
 
246
  combine_audio(movie_name, export_path, movie_final) # create a new file
 
247
 
248
  # Cleanup intermediate files
249
  for f in file_names:
@@ -252,8 +267,9 @@ def get_output_video(text):
252
  os.remove(f)
253
  os.remove("result_new.mp4")
254
  os.remove("result.mp3")
 
255
 
256
-
257
  return 'result_final.mp4'
258
 
259
 
@@ -277,4 +293,4 @@ with demo:
277
  gr.Markdown(
278
  "This program text-to-video AI software generating videos from any prompt! AI software to build an art gallery. The future version will use Dalle-2 For more info visit [ruslanmv.com](https://ruslanmv.com/) ")
279
  button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)
280
- demo.launch(debug=False)
 
45
  model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
46
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
47
  model.to(device)
48
+ print(f"Using device: {device}")
49
 
50
 
51
  def get_output_video(text):
52
+ print("Starting get_output_video function...")
53
  inputs = tokenizer(text,
54
  max_length=1024,
55
  truncation=True,
 
59
  skip_special_tokens=True,
60
  clean_up_tokenization_spaces=False)
61
  plot = list(summary[0].split('.'))
62
+ print(f"Summarized plot: {plot}")
63
 
64
  '''
65
  The required models will be downloaded to models_root if they are not already there.
 
70
  '''
71
  @spaces.GPU(duration=60 * 3)
72
  def generate_image(
73
+ is_mega: bool,
74
+ text: str,
75
+ seed: int,
76
+ grid_size: int,
77
+ top_k: int,
78
+ image_path: str,
79
+ models_root: str,
80
+ fp16: bool,
81
  ):
82
+ print(f"Generating image for: {text}")
83
  model = MinDalle(
84
  is_mega=is_mega,
85
  models_root=models_root,
 
97
  top_k=top_k,
98
  is_verbose=True
99
  )
100
+ print(f"Image generated successfully.")
101
  return image
102
 
103
 
104
  generated_images = []
105
+ for i, senten in enumerate(plot[:-1]):
106
+ print(f"Generating image {i+1} of {len(plot)-1}...")
107
+ try:
108
+ image = generate_image(
109
+ is_mega=True,
110
+ text=senten,
111
+ seed=1,
112
+ grid_size=1, # param {type:"integer"}
113
+ top_k=256, # param {type:"integer"}
114
+ image_path='generated',
115
+ models_root='pretrained',
116
+ fp16=True, )
117
+ generated_images.append(image)
118
+ print(f"Image {i+1} generated and appended.")
119
+ except Exception as e:
120
+ print(f"Error generating image {i+1}: {e}")
121
+ raise
122
 
123
  # Step 4- Creation of the subtitles
124
  sentences = plot[:-1]
 
131
  for k in range(len(generated_images)):
132
  subtitles = tokenize.sent_tokenize(sentences[k])
133
  sub_names.append(subtitles)
134
+ print(f"Subtitles generated for image {k+1}: {subtitles}")
135
 
136
  # Step 5- Adding Subtitles to the Images
137
  def draw_multiple_line_text(image, text, font, text_color, text_start_height):
 
176
  text_to_add = sub_names[k][0]
177
  result = add_text_to_img(text_to_add, imagenes)
178
  generated_images_sub.append(result)
179
+ print(f"Subtitles added to image {k+1}.")
180
 
181
  # Step 7 - Creation of audio
182
  c = 0
 
184
  mp3_lengths = []
185
  for k in range(len(generated_images)):
186
  text_to_add = sub_names[k][0]
187
+ print(f"Generating audio for: {text_to_add}")
188
  f_name = 'audio_' + str(c) + '.mp3'
189
  mp3_names.append(f_name)
190
  # The text that you want to convert to audio
 
202
  audio = AudioSegment.from_file(sound_file, format="mp3")
203
  duration = len(audio) / 1000
204
  mp3_lengths.append(duration)
205
+ print(f"Audio duration: {duration} seconds")
206
  c += 1
207
 
208
  # Step 8 - Merge audio files
 
213
 
214
  for n, mp3_file in enumerate(mp3_names):
215
  mp3_file = mp3_file.replace(chr(92), '/')
216
+ print(f"Merging audio file: {mp3_file}")
217
  # Load the current mp3 into `audio_segment`
218
  audio_segment = AudioSegment.from_mp3(mp3_file)
219
  # Just accumulate the new `audio_segment` + `silence`
220
  full_audio += audio_segment + silence
221
+ print(f'Merging audio {n+1} completed.')
222
  # The loop will exit once all files in the list have been used
223
  # Then export
224
  full_audio.export(export_path, format='mp3')
225
+ print('\nAudio merging done!')
226
 
227
  # Step 9 - Creation of the video with adjusted times of the sound
228
  c = 0
 
231
  f_name = 'img_' + str(c) + '.jpg'
232
  file_names.append(f_name)
233
  img.save(f_name)
234
+ print(f"Saving image: {f_name}")
235
  c += 1
236
+ print(f"Image file names: {file_names}")
237
 
238
  clips = []
239
  d = 0
240
  for m in file_names:
241
  duration = mp3_lengths[d]
242
+ print(f"Creating video clip {d+1} with duration: {duration} seconds")
243
  clips.append(mpe.ImageClip(m).set_duration(duration + 0.5))
244
  d += 1
245
  concat_clip = mpe.concatenate_videoclips(clips, method="compose")
246
  concat_clip.write_videofile("result_new.mp4", fps=24)
247
+ print("Video clips concatenated and saved as result_new.mp4")
248
 
249
  # Step 10 - Merge Video + Audio
250
  movie_name = 'result_new.mp4'
 
258
  final_clip.write_videofile(outname, fps=fps)
259
 
260
  combine_audio(movie_name, export_path, movie_final) # create a new file
261
+ print("Video and audio merged successfully!")
262
 
263
  # Cleanup intermediate files
264
  for f in file_names:
 
267
  os.remove(f)
268
  os.remove("result_new.mp4")
269
  os.remove("result.mp3")
270
+ print("Intermediate files cleaned up.")
271
 
272
+ print("Finished get_output_video function.")
273
  return 'result_final.mp4'
274
 
275
 
 
293
  gr.Markdown(
294
  "This program text-to-video AI software generating videos from any prompt! AI software to build an art gallery. The future version will use Dalle-2 For more info visit [ruslanmv.com](https://ruslanmv.com/) ")
295
  button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)
296
+ demo.launch(debug=True)