|
import gradio as gr |
|
import torch |
|
import moviepy.editor as mpe |
|
from PIL import Image, ImageDraw, ImageFont |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
from min_dalle import MinDalle |
|
from gtts import gTTS |
|
from pydub import AudioSegment |
|
import nltk |
|
import textwrap |
|
import os |
|
import glob |
|
import subprocess |
|
import imageio_ffmpeg |
|
|
|
|
|
|
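# On Hugging Face ZeroGPU Spaces the `spaces` package supplies the GPU decorator;
# elsewhere we fall back to a no-op stand-in so the script also runs locally.
# The stand-in must support the @spaces.GPU(duration=...) form used below.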
if os.environ.get("SPACES_ZERO_GPU") is not None: |
|
import spaces |
|
else: |
|
    class spaces:

        @staticmethod
        def GPU(func=None, **kwargs):
            # Accept both @spaces.GPU and @spaces.GPU(duration=...) usage and
            # return the wrapped function unchanged.
            def decorator(f):
                return f

            if callable(func):
                return decorator(func)
            return decorator
|
|
|
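# Make sure the NLTK 'punkt' sentence tokenizer data is available.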
try: |
|
nltk.data.find('tokenizers/punkt') |
|
except LookupError: |
|
nltk.download('punkt') |
|
|
|
|
|
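# imageio_ffmpeg bundles an FFmpeg binary, which moviepy and pydub rely on.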
try: |
|
imageio_ffmpeg.get_ffmpeg_exe() |
|
print("FFmpeg downloaded successfully (if not already present).") |
|
except Exception as e: |
|
print(f"Error downloading FFmpeg using imageio_ffmpeg: {e}") |
|
raise |
|
|
|
description = "Video Story Generator with Audio\nGenerates a video from a story using AI: dalle-mini for images, DistilBART for summarization, and gTTS for narration."

title = "Video Story Generator with Audio (dalle-mini + DistilBART + gTTS)"
|
|
|
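# Load the DistilBART summarization model and move it to the GPU if available.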
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6") |
|
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6") |
|
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") |
|
model.to(device) |
|
print(f"Using device: {device}") |
|
|
|
|
|
def get_output_video(text): |
|
print("Starting get_output_video function...") |
|
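    # Step 1: summarize the story with DistilBART and split it into sentences.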
inputs = tokenizer(text, |
|
max_length=1024, |
|
truncation=True, |
|
return_tensors="pt").to(device) |
|
summary_ids = model.generate(inputs["input_ids"]) |
|
summary = tokenizer.batch_decode(summary_ids, |
|
skip_special_tokens=True, |
|
clean_up_tokenization_spaces=False) |
|
plot = list(summary[0].split('.')) |
|
print(f"Summarized plot: {plot}") |
|
|
|
''' |
|
The required models will be downloaded to models_root if they are not already there. |
|
Set the dtype to torch.float16 to save GPU memory. |
|
If you have an Ampere architecture GPU you can use torch.bfloat16. |
|
Set the device to either "cuda" or "cpu". Once everything has finished initializing, |
|
float32 is faster than float16 but uses more GPU memory. |
|
''' |
|
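    # Helper: generate one image with min-dalle (runs on the GPU when available).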
@spaces.GPU(duration=60 * 3) |
|
def generate_image( |
|
is_mega: bool, |
|
text: str, |
|
seed: int, |
|
grid_size: int, |
|
top_k: int, |
|
image_path: str, |
|
models_root: str, |
|
fp16: bool, |
|
): |
|
print(f"Generating image for: {text}") |
|
model = MinDalle( |
|
is_mega=is_mega, |
|
models_root=models_root, |
|
is_reusable=True, |
|
is_verbose=True, |
|
dtype=torch.float16 if fp16 else torch.float32, |
|
device=device |
|
) |
|
|
|
|
|
image = model.generate_image( |
|
text, |
|
seed, |
|
grid_size, |
|
top_k=top_k, |
|
is_verbose=True |
|
) |
|
print(f"Image generated successfully.") |
|
return image |
|
|
|
|
|
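    # Step 2: generate one image per summary sentence with dalle-mini.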
generated_images = [] |
|
for i, senten in enumerate(plot[:-1]): |
|
print(f"Generating image {i+1} of {len(plot)-1}...") |
|
try: |
|
image = generate_image( |
|
is_mega=True, |
|
text=senten, |
|
seed=1, |
|
grid_size=1, |
|
top_k=256, |
|
image_path='generated', |
|
models_root='pretrained', |
|
                fp16=True,
            )
|
generated_images.append(image) |
|
print(f"Image {i+1} generated and appended.") |
|
except Exception as e: |
|
print(f"Error generating image {i+1}: {e}") |
|
raise |
|
|
|
|
|
sentences = plot[:-1] |
|
num_sentences = len(sentences) |
|
    assert len(generated_images) == len(sentences), 'Number of generated images does not match the number of sentences'
|
|
|
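    # Step 3: split each sentence into subtitle chunks with NLTK.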
from nltk import tokenize |
|
|
sub_names = [] |
|
for k in range(len(generated_images)): |
|
subtitles = tokenize.sent_tokenize(sentences[k]) |
|
sub_names.append(subtitles) |
|
print(f"Subtitles generated for image {k+1}: {subtitles}") |
|
|
|
|
|
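    # Helper: draw word-wrapped text, centered horizontally, starting at the given height.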
def draw_multiple_line_text(image, text, font, text_color, text_start_height): |
|
draw = ImageDraw.Draw(image) |
|
image_width, image_height = image.size |
|
y_text = text_start_height |
|
lines = textwrap.wrap(text, width=40) |
|
for line in lines: |
|
line_width, line_height = font.getbbox(line)[2:4] |
|
draw.text(((image_width - line_width) / 2, y_text), |
|
line, font=font, fill=text_color) |
|
y_text += line_height |
|
|
|
def add_text_to_img(text1, image_input): |
|
        '''
        Overlay the given subtitle text onto the image using draw_multiple_line_text.
        '''
|
image = image_input |
|
fontsize = 20 |
|
path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf" |
|
if not os.path.exists(path_font): |
|
|
|
path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf" |
|
if not os.path.exists(path_font): |
|
print("Font file not found. Subtitles might not be rendered correctly.") |
|
path_font = None |
|
|
|
if path_font is not None: |
|
try: |
|
font = ImageFont.truetype(path_font, fontsize) |
|
text_color = (255, 255, 0) |
|
text_start_height = 200 |
|
draw_multiple_line_text(image, text1, font, text_color, text_start_height) |
|
except Exception as e: |
|
print(f"Error loading or using font: {e}") |
|
|
|
return image |
|
|
|
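    # Step 4: overlay the first subtitle of each sentence onto its image.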
generated_images_sub = [] |
|
for k in range(len(generated_images)): |
|
imagenes = generated_images[k].copy() |
|
text_to_add = sub_names[k][0] |
|
result = add_text_to_img(text_to_add, imagenes) |
|
generated_images_sub.append(result) |
|
print(f"Subtitles added to image {k+1}.") |
|
|
|
|
|
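    # Step 5: synthesize narration for each subtitle with gTTS and record its duration.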
c = 0 |
|
mp3_names = [] |
|
mp3_lengths = [] |
|
for k in range(len(generated_images)): |
|
text_to_add = sub_names[k][0] |
|
print(f"Generating audio for: {text_to_add}") |
|
f_name = 'audio_' + str(c) + '.mp3' |
|
mp3_names.append(f_name) |
|
|
|
mytext = text_to_add |
|
|
|
language = 'en' |
|
|
|
|
|
|
|
|
|
myobj = gTTS(text=mytext, lang=language, slow=False) |
|
|
|
sound_file = f_name |
|
myobj.save(sound_file) |
|
audio = AudioSegment.from_file(sound_file, format="mp3") |
|
duration = len(audio) / 1000 |
|
mp3_lengths.append(duration) |
|
print(f"Audio duration: {duration} seconds") |
|
c += 1 |
|
|
|
|
|
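    # Step 6: merge the per-sentence MP3s into one track, with 0.5 s of silence between them.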
|
export_path = 'result.mp3' |
|
silence = AudioSegment.silent(duration=500) |
|
full_audio = AudioSegment.empty() |
|
|
|
for n, mp3_file in enumerate(mp3_names): |
|
mp3_file = mp3_file.replace(chr(92), '/') |
|
print(f"Merging audio file: {mp3_file}") |
|
|
|
audio_segment = AudioSegment.from_mp3(mp3_file) |
|
|
|
full_audio += audio_segment + silence |
|
print(f'Merging audio {n+1} completed.') |
|
|
|
|
|
full_audio.export(export_path, format='mp3') |
|
print('\nAudio merging done!') |
|
|
|
|
|
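    # Step 7: save the subtitled images to disk so moviepy can load them.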
c = 0 |
|
file_names = [] |
|
for img in generated_images_sub: |
|
f_name = 'img_' + str(c) + '.jpg' |
|
file_names.append(f_name) |
|
img.save(f_name) |
|
print(f"Saving image: {f_name}") |
|
c += 1 |
|
print(f"Image file names: {file_names}") |
|
|
|
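    # Step 8: build one ImageClip per image, matching its narration length plus the 0.5 s pause.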
clips = [] |
|
d = 0 |
|
for m in file_names: |
|
duration = mp3_lengths[d] |
|
print(f"Creating video clip {d+1} with duration: {duration} seconds") |
|
clips.append(mpe.ImageClip(m).set_duration(duration + 0.5)) |
|
d += 1 |
|
concat_clip = mpe.concatenate_videoclips(clips, method="compose") |
|
concat_clip.write_videofile("result_new.mp4", fps=24) |
|
print("Video clips concatenated and saved as result_new.mp4") |
|
|
|
|
|
movie_name = 'result_new.mp4' |
|
export_path = 'result.mp3' |
|
movie_final = 'result_final.mp4' |
|
|
|
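    # Helper: attach the merged narration track to the silent video.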
def combine_audio(vidname, audname, outname, fps=24): |
|
my_clip = mpe.VideoFileClip(vidname) |
|
audio_background = mpe.AudioFileClip(audname) |
|
final_clip = my_clip.set_audio(audio_background) |
|
final_clip.write_videofile(outname, fps=fps) |
|
|
|
combine_audio(movie_name, export_path, movie_final) |
|
print("Video and audio merged successfully!") |
|
|
|
|
|
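    # Clean up the intermediate image, audio, and video files.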
for f in file_names: |
|
os.remove(f) |
|
for f in mp3_names: |
|
os.remove(f) |
|
os.remove("result_new.mp4") |
|
os.remove("result.mp3") |
|
print("Intermediate files cleaned up.") |
|
|
|
print("Finished get_output_video function.") |
|
return 'result_final.mp4' |
|
|
|
|
|
text = 'Once, there was a girl called Laura who went to the supermarket to buy the ingredients to make a cake, because it was her birthday and her friends were coming to her house to help her prepare it.'
|
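# Build the Gradio interface.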
demo = gr.Blocks() |
|
with demo: |
|
gr.Markdown("# Video Generator from stories with Artificial Intelligence") |
|
gr.Markdown( |
|
"A story can be input by user. The story is summarized using DistillBART model. Then, then it is generated the images by using Dalle-mini and created the subtitles and audio gtts. These are generated as a video.") |
|
with gr.Row(): |
|
|
|
with gr.Column(): |
|
input_start_text = gr.Textbox(value=text, |
|
label="Type your story here, for now a sample story is added already!") |
|
with gr.Row(): |
|
button_gen_video = gr.Button("Generate Video") |
|
|
|
with gr.Column(): |
|
output_interpolation = gr.Video(label="Generated Video") |
|
gr.Markdown("<h3>Future Works </h3>") |
|
gr.Markdown( |
|
"This program text-to-video AI software generating videos from any prompt! AI software to build an art gallery. The future version will use Dalle-2 For more info visit [ruslanmv.com](https://ruslanmv.com/) ") |
|
button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation) |
|
demo.launch(debug=True) |