File size: 10,530 Bytes
f16ac55
379b5e6
f16ac55
 
 
5de6ba9
f16ac55
 
be875c5
060072e
477f3cf
45f2ffe
379b5e6
 
943206a
f16ac55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5de6ba9
 
 
 
 
 
 
 
5b69386
 
379b5e6
5b69386
 
 
379b5e6
 
 
5b69386
379b5e6
5b69386
 
5c142c7
5de6ba9
 
eaedc31
943206a
 
 
5c142c7
 
 
5de6ba9
943206a
 
 
 
5de6ba9
 
5c142c7
 
 
 
9fd112f
eaedc31
f05d3d0
5de6ba9
 
943206a
5de6ba9
 
943206a
 
 
 
 
 
 
0575e8c
 
 
 
 
 
 
 
 
 
 
 
 
 
64b1c2d
0575e8c
 
 
 
 
 
 
 
 
 
1c93d48
0575e8c
97af337
45f2ffe
 
 
97af337
 
 
 
 
45f2ffe
97af337
 
0575e8c
45f2ffe
 
 
 
 
 
 
 
 
 
6d94bcb
45f2ffe
6d94bcb
45f2ffe
 
 
 
 
37a978c
45f2ffe
 
 
 
 
 
 
085e218
eaedc31
379b5e6
 
45f2ffe
379b5e6
45f2ffe
eaedc31
 
 
379b5e6
eaedc31
 
be875c5
eaedc31
 
be875c5
477f3cf
 
 
 
 
 
 
 
be875c5
477f3cf
 
379b5e6
97af337
379b5e6
6d94bcb
 
 
 
 
 
 
 
 
 
 
379b5e6
 
 
 
 
 
6d94bcb
379b5e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
import openai

from google.cloud import vision
import io
import google.generativeai as genai
from diffusers import CogVideoXPipeline
import torch
from diffusers.utils import export_to_video
import os
import spaces
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
from transformers import pipeline
import requests
from transformers import pipeline

def detect_text_in_image(image_path, credentials):
    """OCR a diary image with the Google Cloud Vision API and return its text.

    Uses the `text_detection` endpoint (https://cloud.google.com/vision).

    Args:
        image_path: Path to the diary image file on disk.
        credentials: Google Cloud credentials for the Vision client.

    Returns:
        The full detected text, or '' when the image contains no text.

    Raises:
        Exception: When the Vision API response carries an error message.
    """
    annotator = vision.ImageAnnotatorClient(credentials=credentials)

    # Read the raw image bytes from disk.
    with io.open(image_path, 'rb') as image_file:
        image_bytes = image_file.read()

    # Wrap the bytes for the Vision API and run text detection on them.
    response = annotator.text_detection(image=vision.Image(content=image_bytes))
    annotations = response.text_annotations

    # Surface any API-level error instead of returning partial results.
    if response.error.message:
        raise Exception(f'{response.error.message}')

    # The first annotation holds the whole detected block; later entries
    # are per-word fragments.
    return annotations[0].description if annotations else ''


# Utilize Gemini to input an image of the diary writer and output a textual
# description of the main character modeled on them,
# https://ai.google.dev/gemini-api/docs/models/gemini.
def analyze_writer_image(image_path, api_key):
    """Describe the diary writer as a cartoon cat colored like the person.

    Uploads the photo to Gemini, asks for the person's predominant color,
    and interpolates that color into a fixed cat-character description used
    downstream by the video generator.

    Args:
        image_path: Path to the writer's photo.
        api_key: Google Generative AI API key.

    Returns:
        A multi-line textual description of the main character.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")
    myfile = genai.upload_file(image_path)
    response = model.generate_content([myfile,"What is the predominant color of the person in the image?"])
    # BUG FIX: the original interpolated the GenerateContentResponse object
    # itself into the f-string, embedding its repr rather than the answer.
    # Use the response's .text payload so only the color name appears.
    color = response.text.strip()
    description = f"""
    The main character is a cartoonish, fluffy cat with large, expressive blue eyes.
    Its fur is predominantly {color}, with subtle shading on certain parts of its body in a slightly darker or lighter shade of {color}.
    The face is round with soft, slightly pointed ears that are highlighted with an inner coloring also in {color}.

    The most prominent feature of the cat is its extremely fluffy, oversized tail, which arcs gracefully above its body.
    The tail fur is thick, feathery, and has a luxurious texture that stands out against the rest of the body, showcasing
    a gradient effect from darker to lighter shades of {color} at the edges.

    The cat’s paws are small and round, with shading in a slightly darker shade of {color}.
    The overall look of the figure is cute, gentle, and meticulously detailed, emphasizing a soft and playful appearance.
    """
    return description


def break_diary_to_scenes(diary_text, api_key):
    """Split a diary entry into four cartoon-movie scene descriptions.

    Sends the diary text to GPT-4 together with three few-shot style
    examples and asks for exactly four single-action scenes.

    Args:
        diary_text: The raw diary text to dramatize.
        api_key: OpenAI API key.

    Returns:
        The model's reply: "Scene 1: , Scene 2: , Scene 3: , Scene 4: ".
    """
    client = openai.Client(api_key=api_key)

    # Few-shot examples establishing the expected cinematic description style.
    example_1 = 'Main Character head bowed and determination etched on his face, sprints through the torrential downpour as lightning crackles and thunder rumbles in the distance. The relentless rain pounds the ground, creating a chaotic dance of water droplets that mirror the dramatic sky\'s anger. In the far background, the silhouette of a cozy home beckons, a faint beacon of safety and warmth amidst the fierce weather. The scene is one of perseverance and the unyielding spirit of a child braving the elements.'
    example_2 = 'Main Character, with the red dust of Mars clinging to their boots, reaches out to shake hands with an alien being, their skin a shimmering blue, under the pink-tinged sky of the fourth planet. In the background, a sleek silver rocket, a beacon of human ingenuity, stands tall, its engines powered down, as the two representatives of different worlds exchange a historic greeting amidst the desolate beauty of the Martian landscape.'
    example_3 = 'Main Character, with a serene expression, sits at the water\'s edge, a steaming cup of tea by his side. He is engrossed in his artwork, brush in hand, as he renders an oil painting on a canvas that\'s propped up against a small, weathered table. The sea breeze whispers through his silver hair, gently billowing his loose-fitting white shirt, while the salty air adds an intangible element to his masterpiece in progress. The scene is one of tranquility and inspiration, with the artist\'s canvas capturing the vibrant hues of the setting sun reflecting off the tranquil sea.'

    # Build the full instruction prompt once; the pieces concatenate exactly
    # as in the rendered request.
    prompt = (
        f"Please break the following diary into four distinct cartoon movie scenes: {diary_text}. "
        f"Each scene should focus on one unique action and be described in vivid, animated detail. Below are some examples for the desired style: "
        f"Example 1: {example_1}. Example 2: {example_2}. Example 3: {example_3}."
        f"Use 'Main Character' as the placeholder for the subject in the scene. "
        f"Each scene should be main character is doing something."
        f"Ensure that each scene features only one action, with no combinations (e.g., avoid 'eating and teaching' in one scene). "
        f"Please use expressive, cinematic language to bring the cartoon scene to life, focusing on the character’s actions, expressions, and environment. "
        f"Return the output as a list in this format: Scene 1: , Scene 2: , Scene 3: , Scene 4: , without any quotation marks or line breaks."
    )

    # Request a single chat completion from GPT-4.
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1000,
        temperature=1,
        n=1,
    )

    return response.choices[0].message.content


def scenes_caption(scenes, api_key):
    """Generate a short first-person caption for each scene via GPT-4.

    Args:
        scenes: Iterable of scene description strings.
        api_key: OpenAI API key.

    Returns:
        All captions joined by blank lines ("\\n\\n").
    """
    client = openai.Client(api_key=api_key)

    def _caption_for(scene):
        # Ask GPT-4 for one concise "I am ..." caption per scene.
        reply = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "user",
                    "content": f"Given the scene: {scene}, "
                               f"turn this scene into a simple caption starting with I am doing something."
                               f"Be concise, keeping it under 10 words. Return without any quotation marks."
                }
            ],
            max_tokens=50,   # captions are short; cap the spend
            temperature=0.7, # mild creativity
            n=1,
        )
        return reply.choices[0].message.content

    return "\n\n".join(_caption_for(scene) for scene in scenes)


@spaces.GPU
def generate_video(scene_list, writer_description, opt, fps=24):  # Lower fps
    """Render one CogVideoX clip per scene, then stitch them with audio.

    Args:
        scene_list: List of scene prompt strings (one video per entry).
        writer_description: Character description appended to every prompt.
        opt: Audio mode; "Narration" uses narration.mp3, anything else
            uses the bundled meow track.
        fps: Passed as num_frames below — NOTE(review): this is a frame
            COUNT, not a frame rate; at the default 24 each clip is 24
            frames total. Confirm whether a true fps knob was intended.

    Returns:
        Path to the combined video with audio ("videos/combined_video_music.mp4").
    """
    pipe = CogVideoXPipeline.from_pretrained(
        "THUDM/CogVideoX-5b",
        torch_dtype=torch.bfloat16,
        cache_dir="./CogVideoX-5b"
    )

    # Memory savers for large-model inference on limited VRAM.
    pipe.enable_model_cpu_offload()
    pipe.vae.enable_tiling()

    # Check for available device: CUDA, MPS, or CPU
    # NOTE(review): `device` is only used to seed the torch.Generator below;
    # the pipeline itself stays under cpu-offload management.
    if torch.cuda.is_available():
        device = "cuda"
        print("Using CUDA backend.")
    elif torch.backends.mps.is_available():
        device = "mps"
        print("Using MPS backend.")
    else:
        device = "cpu"
        print("CUDA and MPS not available. Falling back to CPU.")

    # Truncate the prompt to fit the CLIP token limit
    os.makedirs("videos", exist_ok=True)
    video_paths = []
    for i, prompt in enumerate(scene_list):
        # Fixed seed (42) keeps generations reproducible across runs.
        video = pipe(
            prompt=prompt + f'\nThe main character is described as: {writer_description}.',
            num_videos_per_prompt=1,
            num_inference_steps=40,
            num_frames=fps,
            guidance_scale=6,
            generator=torch.Generator(device=device).manual_seed(42),
        ).frames[0]

        video_path = export_to_video(video, output_video_path=f'videos/video{i}.mp4')
        video_paths.append(video_path)

    # Concatenate the generated videos into a single video
    concatenated_video_path = "videos/combined_video_music.mp4"
    if opt == "Narration":
      concatenate_videos_music(video_paths, concatenated_video_path, audio_path="narration.mp3")
    else:
      concatenate_videos_music(video_paths, concatenated_video_path, audio_path="meow-meow-meow-tiktok.mp3")
    return concatenated_video_path


def concatenate_videos_music(video_paths, output_path, audio_path):
    """Concatenate video files into one clip, optionally overlaying audio.

    Args:
        video_paths: Ordered list of video file paths to join.
        output_path: Destination path for the combined video.
        audio_path: Optional audio file; when longer than the video it is
            trimmed to match, otherwise used as-is (falsy value skips audio).
    """
    # Load each video file as a VideoFileClip
    clips = [VideoFileClip(video) for video in video_paths]
    audio = None
    try:
        # Concatenate the clips
        final_clip = concatenate_videoclips(clips, method="compose")

        # If an audio file is provided, load it and trim if necessary
        if audio_path:
            audio = AudioFileClip(audio_path)
            # Trim the audio to match the final video's duration if audio is longer
            if audio.duration > final_clip.duration:
                audio = audio.subclip(0, final_clip.duration)
            # Set the trimmed audio to the final video clip
            final_clip = final_clip.set_audio(audio)

        # Write the concatenated video to a file
        final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
    finally:
        # FIX: moviepy clips hold ffmpeg reader subprocesses and file handles;
        # the original never closed them, leaking resources on every call.
        for clip in clips:
            clip.close()
        if audio is not None:
            audio.close()

@spaces.GPU
def summarizer_for_audio(input_text):
    """Summarize text with BART (facebook/bart-large-cnn) for narration.

    Args:
        input_text: The text to condense.

    Returns:
        A deterministic (do_sample=False) summary of 30-40 tokens.
    """
    # Pick the best available accelerator: CUDA first, then MPS, else CPU.
    if torch.cuda.is_available():
        device = "cuda"
        print("Using CUDA backend.")
    elif torch.backends.mps.is_available():
        device = "mps"
        print("Using MPS backend.")
    else:
        device = "cpu"
        print("CUDA and MPS not available. Falling back to CPU.")

    bart = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

    # Generate the summary and unwrap the single result.
    outputs = bart(input_text, max_length=40, min_length=30, do_sample=False)
    return outputs[0]["summary_text"]

def narration_generate(input, api_key):
    """Synthesize narration audio from text via the ElevenLabs TTS API.

    Streams the MP3 response to 'narration.mp3' in the working directory.

    Args:
        input: Text to narrate. (Parameter name kept for backward
            compatibility even though it shadows the builtin `input`.)
        api_key: ElevenLabs API key.

    Returns:
        The output filename 'narration.mp3' (new, backward-compatible —
        the original returned None).

    Raises:
        requests.HTTPError: When the API responds with a non-2xx status.
    """
    # Voice ID 9BWtsMINqrJLrRacOk9x is hard-coded (ElevenLabs preset voice).
    url = "https://api.elevenlabs.io/v1/text-to-speech/9BWtsMINqrJLrRacOk9x"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": api_key
    }

    data = {
        "text": input,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }

    # FIX: stream the body instead of buffering the whole MP3 in memory,
    # and fail loudly on API errors — the original silently wrote the
    # error JSON body into narration.mp3 on any failure.
    response = requests.post(url, json=data, headers=headers, stream=True)
    response.raise_for_status()
    with open('narration.mp3', 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return 'narration.mp3'