initial commit
- README.md +0 -2
- app.py +65 -0
- baseline_utils.py +169 -0
- requirements.txt +10 -0
README.md
CHANGED
@@ -8,5 +8,3 @@ sdk_version: 4.44.1
 app_file: app.py
 pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,65 @@
import gradio as gr
import openai
import json
from PIL import Image
from google.oauth2 import service_account
from baseline_utils import detect_text_in_image, summarize_diary_text, analyze_writer_image, generate_video
import os
from keys.keys import *

# Load secrets from the environment or other sources (adjust as needed)
openai_api_key = open_ai_keys
with open('keys/service_account_credentials.json') as f:
    google_service_account_info = json.load(f)
gemini_api_key = gemini_keys

# Initialize OpenAI
openai.api_key = openai_api_key

# Function to get Google credentials
def get_google_credentials():
    return service_account.Credentials.from_service_account_info(google_service_account_info)

def process_images(diary_image, writer_image):
    # Save the file-like objects as image files
    diary_image_path = "temp_upload_images/temp_diary_image.png"
    writer_image_path = "temp_upload_images/temp_writer_image.png"
    os.makedirs("temp_upload_images", exist_ok=True)
    diary_image.save(diary_image_path)
    writer_image.save(writer_image_path)

    # Detect text from the diary image
    google_credentials = get_google_credentials()
    detected_text = detect_text_in_image(diary_image_path, google_credentials)
    summarized_text = summarize_diary_text(detected_text, openai_api_key)

    # Analyze the writer's image using the Gemini API
    writer_summary = analyze_writer_image(writer_image_path, gemini_api_key)

    # Generate the video based on the summaries
    video_path = generate_video(summarized_text, writer_summary, fps=24)

    return video_path


# Define the Gradio interface
def gradio_interface(diary_image, writer_image):
    # Process the images and generate the video
    generated_video = process_images(diary_image, writer_image)

    # Return the path to the generated video
    return generated_video

# Set up the Gradio interface
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Image(label="Upload your handwritten diary image", type="pil"),
        gr.Image(label="Upload a photo of the writer", type="pil"),
    ],
    outputs=gr.Video(label="Generated Video"),
    title="Handwritten Diary to Video"
)

# Launch the interface
interface.launch()
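
A note on secrets: "from keys.keys import *" assumes a keys/ package is checked into the repo. On Hugging Face Spaces, secrets are normally injected as environment variables instead, so an alternative sketch looks like the following (the secret names here are hypothetical):

import os, json

openai_api_key = os.environ["OPENAI_API_KEY"]    # hypothetical secret name
gemini_api_key = os.environ["GEMINI_API_KEY"]    # hypothetical secret name
# A service-account JSON blob stored as a single secret string (hypothetical name)
google_service_account_info = json.loads(os.environ["GCP_SERVICE_ACCOUNT_JSON"])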
baseline_utils.py
ADDED
@@ -0,0 +1,169 @@
import openai
from google.cloud import vision
from google.oauth2 import service_account
import io
import google.generativeai as genai
from diffusers import AutoPipelineForText2Image, DiffusionPipeline
import torch
import os
from moviepy.editor import ImageSequenceClip
from diffusers.utils import export_to_video

# Utilize the Google Cloud Vision API to recognize text in the
# input images (diary images), https://cloud.google.com/vision.
def detect_text_in_image(image_path, credentials):

    # Create a Vision API client using the credentials
    client = vision.ImageAnnotatorClient(credentials=credentials)

    # Open the image file
    with io.open(image_path, 'rb') as image_file:
        content = image_file.read()

    # Create an image object for the Vision API
    image = vision.Image(content=content)

    # Use the Vision API to detect text
    response = client.text_detection(image=image)
    texts = response.text_annotations

    # Check for errors in the response
    if response.error.message:
        raise Exception(f'{response.error.message}')

    # Return the detected text or an empty string
    return texts[0].description if texts else ''
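
# Example usage (a sketch; the credential file and image path are hypothetical):
#   creds = service_account.Credentials.from_service_account_file(
#       "keys/service_account_credentials.json")
#   print(detect_text_in_image("diary_page.png", creds))
# Note: text_annotations[0] holds the full detected text block; later entries
# are the individual words.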


# Summarize and condense the diary text. The original plan referenced the
# PaLM 2 Bison for Text model, https://ai.google.dev/palm_docs/palm;
# the implementation below uses the OpenAI chat completions API instead.
def summarize_diary_text(text, api_key):
    # Initialize the OpenAI client
    client = openai.Client(api_key=api_key)

    # Use the client to call the chat completion API
    response = client.chat.completions.create(
        model="gpt-4",  # Use GPT-4
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Summarize the following diary entry: {text}"}
        ],
        max_tokens=150,
        temperature=0.7,
        n=1  # Number of completions to generate
    )

    # Extract the summary from the response
    return response.choices[0].message.content


# Use a Gemini multimodal model to take an image of the diary writer as input
# and output a textual description of the image,
# https://ai.google.dev/gemini-api/docs/models/gemini.
# (The original comment referenced Gemini 1.0 Pro Vision; the code below
# uses gemini-1.5-flash.)
def analyze_writer_image(image_path, api_key):
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")
    myfile = genai.upload_file(image_path)
    result = model.generate_content(
        [myfile, "\n\n", "Can you give a very short description of the person in the image?"]
    )
    return result.text
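
# Example usage (a sketch; the image path and key value are hypothetical):
#   print(analyze_writer_image("writer.png", api_key="<your-gemini-api-key>"))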


# Now that you have text from the diary and text describing the diary writer,
# you can utilize the SDXL-Turbo stable diffusion model to generate
# images, https://huggingface.co/stabilityai/sdxl-turbo.
# You can try to output several images for a diary entry, analyze how accurate
# the results are, and think about what could be improved.
# def generate_comic_book(diary_text, writer_description, num_pages=4):
#     pipe = AutoPipelineForText2Image.from_pretrained(
#         "stabilityai/sdxl-turbo",
#         torch_dtype=torch.float16,
#         variant="fp16",
#         cache_dir="./SDXL-Turbo"
#     )
#
#     # Check for available device: CUDA, MPS, or CPU
#     if torch.cuda.is_available():
#         device = "cuda"
#         print("Using CUDA backend.")
#     elif torch.backends.mps.is_available():
#         device = "mps"
#         print("Using MPS backend.")
#     else:
#         device = "cpu"
#         print("CUDA and MPS not available. Falling back to CPU.")
#
#     # Move the model to the selected device
#     pipe = pipe.to(device)
#
#     # Create a directory to store the comic book images
#     os.makedirs("comic_book", exist_ok=True)
#
#     # Split diary text into multiple segments/scenes for comic book pages
#     diary_scenes = diary_text.split('.')[:num_pages]  # Split by periods, limiting to `num_pages`
#
#     # Iterate over each scene, generating a page for each one
#     for i, scene in enumerate(diary_scenes):
#         prompt = (f'Comic Book Style: \n'
#                   f'Actor Description: {writer_description} \n'
#                   f'Diary Scene: {scene.strip()}\n'
#                   f'Generate a cartoon image to represent this diary scene.')
#
#         print(f"Generating comic page {i + 1} with prompt:\n{prompt}\n")
#
#         # Generate the image
#         image = pipe(prompt=prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
#
#         # Save the generated image
#         image_path = f"comic_book/page_{i + 1}.png"
#         image.save(image_path)
#         print(f"Page {i + 1} saved as {image_path}")
#
#     print("Comic book generation complete!")
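
# Note on the sketch above: per the SDXL-Turbo model card, the model is
# distilled for very few inference steps (1-4) with guidance_scale=0.0; the
# 30-step, 7.5-guidance settings follow the standard SDXL recipe and may not
# suit the Turbo checkpoint.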


def truncate_prompt(prompt, max_tokens=77):
    # Whitespace-split words only approximate CLIP tokens, so this is a rough
    # guard against prompts that exceed the 77-token CLIP limit.
    tokens = prompt.split()
    if len(tokens) > max_tokens:
        return " ".join(tokens[:max_tokens])
    return prompt


def generate_video(diary_text, writer_description, fps=15):  # Lower fps
    # Load the Zeroscope video generation model
    pipe = DiffusionPipeline.from_pretrained(
        "cerspense/zeroscope_v2_576w",  # Zeroscope model from Hugging Face
        torch_dtype=torch.float16
    )

    # Check for available device: CUDA or CPU (the fp16 weights expect a GPU)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe = pipe.to(device)

    # Define the total number of frames needed for a 15-second video at the given fps
    total_frames = 15 * fps

    # Combine the diary text and writer description for a cohesive prompt
    prompt = (f"Actor Description: {writer_description}\n"
              f"Diary Scene: {diary_text.strip()}\n"
              f"Generate a 15-second video based on this scene.")

    # Truncate the prompt to fit the CLIP token limit
    prompt = truncate_prompt(prompt)

    # Generate the video frames
    video_frames = pipe(
        prompt=prompt,
        num_inference_steps=25,  # Reduce inference steps
        height=320,  # Lower the resolution to save memory
        width=576,
        num_frames=total_frames
    ).frames

    # Save the video
    video_path = export_to_video(video_frames)
    print(f"Video generation complete! Saved as {video_path}")

    return video_path
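
# Note: with recent diffusers releases, the pipeline output's .frames is a list
# of per-video frame arrays, so export_to_video(video_frames[0]) may be needed;
# the call above matches the older API this file was written against. Also,
# app.py passes fps=24, making num_frames=360, far longer than the short clips
# Zeroscope is typically run with, and likely to exhaust GPU memory.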
requirements.txt
ADDED
@@ -0,0 +1,10 @@
openai
google-cloud-vision
google-auth
google-generativeai
diffusers
torch
streamlit
transformers
accelerate
moviepy
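
Note that gradio itself is not listed: for a Space configured with the Gradio SDK (the README pins sdk_version: 4.44.1), the platform supplies gradio. The streamlit entry appears to be unused by app.py.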