jjz5463 committed
Commit 943206a · 1 Parent(s): 2f3080f

4 videos instead of 1

Files changed (2)
  1. app.py +39 -17
  2. baseline_utils.py +54 -93
app.py CHANGED
@@ -1,9 +1,12 @@
 import gradio as gr
 import openai
 import json
-from PIL import Image
 from google.oauth2 import service_account
-from baseline_utils import detect_text_in_image, summarize_diary_text, analyze_writer_image, generate_video
+from baseline_utils import (detect_text_in_image,
+                            summarize_diary_text,
+                            analyze_writer_image,
+                            generate_video,
+                            break_summary_to_activities)
 import os
 
 # Load secrets from Hugging Face Spaces environment
@@ -14,10 +17,12 @@ gemini_api_key = os.getenv("GEMINI_API_KEY")
 # Initialize OpenAI
 openai.api_key = openai_api_key
 
+
 # Function to get Google credentials
 def get_google_credentials():
     return service_account.Credentials.from_service_account_info(google_service_account_info)
 
+
 def process_images(diary_image, writer_image):
     # Save the file-like objects as image files
     diary_image_path = "temp_upload_images/temp_diary_image.png"
@@ -30,34 +35,51 @@ def process_images(diary_image, writer_image):
     google_credentials = get_google_credentials()
     detected_text = detect_text_in_image(diary_image_path, google_credentials)
     summarized_text = summarize_diary_text(detected_text, openai_api_key)
+    activities = break_summary_to_activities(summarized_text, openai_api_key)
+    activity_list = activities.strip('[]').split(', ')
 
     # Analyze the writer's image using Gemini API
     writer_summary = analyze_writer_image(writer_image_path, gemini_api_key)
 
     # Generate the video based on the summaries
-    video_path = generate_video(summarized_text, writer_summary, fps=24)
+    video_paths = generate_video(activity_list, writer_summary, fps=24)
 
-    return video_path
+    return video_paths
 
 
 # Define the Gradio interface
 def gradio_interface(diary_image, writer_image):
-    # Process the images and generate the video
-    generated_video = process_images(diary_image, writer_image)
+    # Process the images and generate the videos
+    video_generator = process_images(diary_image, writer_image)
+
+    # Use streaming to return each video path as it's ready
+    for video_path in video_generator:
+        yield video_path
 
-    # Return the path to the generated video
-    return generated_video
 
 # Set up the Gradio interface
-interface = gr.Interface(
-    fn=gradio_interface,
-    inputs=[
-        gr.Image(label="Upload your handwritten diary image", type="pil"),
-        gr.Image(label="Upload a photo of the writer", type="pil"),
-    ],
-    outputs=gr.Video(label="Generated Video"),
-    title="Handwritten Diary to Video"
-)
+with gr.Blocks() as interface:
+    gr.Markdown("# Handwritten Diary to Video")
+
+    with gr.Row():
+        diary_image_input = gr.Image(label="Upload your handwritten diary image", type="pil")
+        writer_image_input = gr.Image(label="Upload a photo of the writer", type="pil")
+
+    submit_button = gr.Button("Generate Videos")
+
+    with gr.Row():
+        with gr.Column():
+            video_output_1 = gr.Video(label="Generated Video 1")
+            video_output_2 = gr.Video(label="Generated Video 2")
+        with gr.Column():
+            video_output_3 = gr.Video(label="Generated Video 3")
+            video_output_4 = gr.Video(label="Generated Video 4")
+
+    # Use streaming=True to display each video as soon as it's ready
+    submit_button.click(fn=gradio_interface,
+                        inputs=[diary_image_input, writer_image_input],
+                        outputs=[video_output_1, video_output_2, video_output_3, video_output_4],
+                        stream=True)
 
 # Launch the interface
 interface.launch()
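
Note on the streaming handler above: Gradio streams partial results from a generator function, but each yield is expected to supply a value for every component listed in outputs, so a handler that yields one bare path per iteration will not fill all four gr.Video players. Below is a minimal sketch of the tuple-per-yield pattern; it is illustrative only and not part of this commit, and the placeholder generator and component names are assumptions.

import gradio as gr

def fake_video_paths(diary_image, writer_image):
    # Placeholder standing in for process_images(); yields one finished path per step.
    for i in range(4):
        yield f"video_{i + 1}.mp4"  # hypothetical file names

def stream_to_four_outputs(diary_image, writer_image):
    # Keep one slot per gr.Video component and fill the slots as videos finish.
    results = [None] * 4
    for i, path in enumerate(fake_video_paths(diary_image, writer_image)):
        results[i] = path
        yield tuple(results)  # one value per declared output

with gr.Blocks() as demo:
    diary = gr.Image(label="Diary image", type="pil")
    writer = gr.Image(label="Writer photo", type="pil")
    button = gr.Button("Generate Videos")
    videos = [gr.Video(label=f"Generated Video {i + 1}") for i in range(4)]
    button.click(fn=stream_to_four_outputs, inputs=[diary, writer], outputs=videos)

demo.launch()

The committed handler instead relays one bare path per yield and passes stream=True to click(); if only the first player updates with the installed Gradio version, switching to the tuple-per-yield pattern above is one way to address it.
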
baseline_utils.py CHANGED
@@ -1,20 +1,18 @@
 import openai
 from google.cloud import vision
-from google.oauth2 import service_account
 import io
 import google.generativeai as genai
-from diffusers import AutoPipelineForText2Image, DiffusionPipeline
+from diffusers import DiffusionPipeline
 import torch
-import os
-from moviepy.editor import ImageSequenceClip
 from diffusers.utils import export_to_video
+import numpy as np
 import spaces
 
+
 # Utilize the Google Cloud Vision API to recognize text in the
 # input input_images (diary input_images), https://cloud.google.com/vision.
 def detect_text_in_image(image_path, credentials):
 
-    # Create a Vision API client using the credentials
     client = vision.ImageAnnotatorClient(credentials=credentials)
 
     # Open the image file
@@ -46,7 +44,6 @@ def summarize_diary_text(text, api_key):
     response = client.chat.completions.create(
         model="gpt-4",  # Use GPT-4
         messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Summarize the following diary entry: {text}"}
         ],
         max_tokens=150,
@@ -58,6 +55,30 @@ def summarize_diary_text(text, api_key):
     return response.choices[0].message.content
 
 
+def break_summary_to_activities(text, api_key):
+    # Initialize the OpenAI client
+    client = openai.Client(api_key=api_key)
+
+    # Use the client to call the chat completion API
+    response = client.chat.completions.create(
+        model="gpt-4",  # Use GPT-4
+        messages=[
+            {"role": "user", "content": f"Please break the following summary into four distinct activities, "
+                                        f"formatted as 'I am [activity].' Each activity should describe a unique action "
+                                        f"and be less than six words: {text}. "
+                                        f"Return the four activities as a list in this "
+                                        f"format: [activity1, activity2, activity3, activity4], "
+                                        f"without any quotation marks or extra text."}
+        ],
+        max_tokens=150,
+        temperature=0.7,
+        n=1  # Number of completions to generate
+    )
+
+    # Extract the summary from the response
+    return response.choices[0].message.content
+
+
 # Utilize the Gemini 1.0 Pro Vision to input an image of the diary writer,
 # and output a textual description of the image,
 # https://ai.google.dev/gemini-api/docs/models/gemini.
@@ -67,104 +88,44 @@ def analyze_writer_image(image_path, api_key):
     model = genai.GenerativeModel("gemini-1.5-flash")
     myfile = genai.upload_file(image_path)
     result = model.generate_content(
-        [myfile, "\n\n", "Can you give a very short description of the person in the image?"]
+        [myfile, "\n\n",
+         "Provide a description of the people in the picture, "
+         "focusing on their characteristics. Keep it under five words."]
     )
     return result.text
 
 
-# Now that you have text from the diary and text describing the diary writer,
-# you can utilize the SDXL-Turbo stable diffusion model to generate
-# input_images https://huggingface.co/stabilityai/sdxl-turbo.
-# You can try to output several input_images for a diary entry. Analyze how accurate the results,
-# and think about what could be improved.
-# def generate_comic_book(diary_text, writer_description, num_pages=4):
-#     pipe = AutoPipelineForText2Image.from_pretrained(
-#         "stabilityai/sdxl-turbo",
-#         torch_dtype=torch.float16,
-#         variant="fp16",
-#         cache_dir="./SDXL-Turbo"
-#     )
-#
-#     # Check for available device: CUDA, MPS, or CPU
-#     if torch.cuda.is_available():
-#         device = "cuda"
-#         print("Using CUDA backend.")
-#     elif torch.backends.mps.is_available():
-#         device = "mps"
-#         print("Using MPS backend.")
-#     else:
-#         device = "cpu"
-#         print("CUDA and MPS not available. Falling back to CPU.")
-#
-#     # Move the model to the selected device
-#     pipe = pipe.to(device)
-#
-#     # Create a directory to store the comic book input_images
-#     os.makedirs("comic_book", exist_ok=True)
-#
-#     # Split diary text into multiple segments/scenes for comic book pages
-#     diary_scenes = diary_text.split('.')[:num_pages]  # Split by periods, limiting to `num_pages`
-#
-#     # Iterate over each scene, generating a page for each one
-#     for i, scene in enumerate(diary_scenes):
-#         prompt = (f'Comic Book Style: \n'
-#                   f'Actor Description: {writer_description} \n'
-#                   f'Diary Scene: {scene.strip()}\n'
-#                   f'Generate an cartoon image to represent this diary scene.')
-#
-#         print(f"Generating comic page {i + 1} with prompt:\n{prompt}\n")
-#
-#         # Generate the image
-#         image = pipe(prompt=prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
-#
-#         # Save the generated image
-#         image_path = f"comic_book/page_{i + 1}.png"
-#         image.save(image_path)
-#         print(f"Page {i + 1} saved as {image_path}")
-#
-#     print("Comic book generation complete!")
-
-
-def truncate_prompt(prompt, max_tokens=77):
-    tokens = prompt.split()
-    if len(tokens) > max_tokens:
-        return " ".join(tokens[:max_tokens])
-    return prompt
-
 @spaces.GPU
-def generate_video(diary_text, writer_description, fps=15):  # Lower fps
+def generate_video(activity_list, writer_summary, fps=24):
     # Load the Zeroscope video generation model
     pipe = DiffusionPipeline.from_pretrained(
         "cerspense/zeroscope_v2_576w",  # Zeroscope model from Hugging Face
-        torch_dtype=torch.float16
+        torch_dtype=torch.float16,
+        cache_dir="./zeroscope"
     )
 
     # Check for available device: CUDA, MPS, or CPU
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    if torch.cuda.is_available():
+        device = "cuda"
+        print("Using CUDA backend.")
+    elif torch.backends.mps.is_available():
+        device = "mps"
+        print("Using MPS backend.")
+    else:
+        device = "cpu"
+        print("CUDA and MPS not available. Falling back to CPU.")
     pipe = pipe.to(device)
 
-    # Define the total number of frames needed for a 15-second video at the given fps
-    total_frames = 10 * fps
-
     # Combine the diary text and writer description for a cohesive prompt
-    prompt = (f"Actor Description: {writer_description}\n"
-              f"Diary Scene: {diary_text.strip()}\n"
-              f"Generate a 15-second video based on this scene.")
-
-    # Truncate the prompt to fit the CLIP token limit
-    prompt = truncate_prompt(prompt)
-
-    # Generate the video frames
-    video_frames = pipe(
-        prompt=prompt,
-        num_inference_steps=1,  # Minimum inference steps to reduce computation
-        height=128,  # Reduce resolution as much as possible
-        width=128,
-        num_frames=total_frames  # Number of frames stays the same to keep video length
-    ).frames
-
-    # Save the video
-    video_path = export_to_video(video_frames)
-    print(f"Video generation complete! Saved as {video_path}")
-
-    return video_path
+    prompts = []
+    for activity in activity_list:
+        prompt = writer_summary.strip('.').capitalize() + ' is' + activity[4:]
+        prompts.append(prompt)
+
+    # Stream video results as soon as they are ready
+    for i, prompt in enumerate(prompts):
+        video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=fps).frames
+        video_path = export_to_video(np.squeeze(video_frames, axis=0), output_video_path=f'video_{i + 1}.mp4')

+        # Yield the path for each video
+        yield video_path
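
Note on the prompt construction in generate_video above: it assumes break_summary_to_activities returns a bracketed, comma-separated list and that every activity starts with "I am", since activity[4:] drops those four characters. A standalone worked example of that parsing step follows; the strings are made up and it is not part of this commit.

# Worked example of the activity parsing and prompt construction used above.
raw_reply = "[I am walking the dog, I am cooking dinner, I am reading a book, I am riding a bike]"
writer_summary = "A young woman with glasses."

# Same parsing as app.py: strip the brackets, then split on ", ".
activity_list = raw_reply.strip('[]').split(', ')
# -> ['I am walking the dog', 'I am cooking dinner', 'I am reading a book', 'I am riding a bike']

# Same prompt construction as generate_video: drop the trailing period from the
# writer summary, then splice in the activity minus its leading "I am".
prompts = [writer_summary.strip('.').capitalize() + ' is' + activity[4:]
           for activity in activity_list]

print(prompts[0])  # A young woman with glasses is walking the dog

If the model wraps items in quotes or adds extra text, the simple strip-and-split produces strings that no longer start with "I am" and the slicing misfires; the prompt in break_summary_to_activities explicitly asks for no quotation marks or extra text to reduce that risk.
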