jjz5463 committed on
Commit
060072e
·
1 Parent(s): 1778d8f

better video quality, simplify prompts, text-to-video

Browse files
Files changed (2) hide show
  1. app.py +1 -2
  2. baseline_utils.py +13 -31
app.py CHANGED
@@ -34,8 +34,7 @@ def process_images(diary_image, writer_image):
34
  # Detect text from the diary image
35
  google_credentials = get_google_credentials()
36
  detected_text = detect_text_in_image(diary_image_path, google_credentials)
37
- summarized_text = summarize_diary_text(detected_text, openai_api_key)
38
- activities = break_summary_to_activities(summarized_text, openai_api_key)
39
  activity_list = activities.strip('[]').split(', ')
40
 
41
  # Analyze the writer's image using Gemini API
 
34
  # Detect text from the diary image
35
  google_credentials = get_google_credentials()
36
  detected_text = detect_text_in_image(diary_image_path, google_credentials)
37
+ activities = break_summary_to_activities(detected_text, openai_api_key)
 
38
  activity_list = activities.strip('[]').split(', ')
39
 
40
  # Analyze the writer's image using Gemini API
baseline_utils.py CHANGED
@@ -1,4 +1,3 @@
1
- import spaces
2
  import openai
3
  from google.cloud import vision
4
  import io
@@ -8,6 +7,7 @@ import torch
8
  from diffusers.utils import export_to_video
9
  import numpy as np
10
  import os
 
11
 
12
 
13
  # Utilize the Google Cloud Vision API to recognize text in the
@@ -35,27 +35,6 @@ def detect_text_in_image(image_path, credentials):
35
  return texts[0].description if texts else ''
36
 
37
 
38
- # Utilize the PaLM 2 Bison for Text model to conduct NLP tasks such as
39
- # text summarization and condensing on the diary text, https://ai.google.dev/palm_docs/palm.
40
- def summarize_diary_text(text, api_key):
41
- # Initialize the OpenAI client
42
- client = openai.Client(api_key=api_key)
43
-
44
- # Use the client to call the chat completion API
45
- response = client.chat.completions.create(
46
- model="gpt-4", # Use GPT-4
47
- messages=[
48
- {"role": "user", "content": f"Summarize the following diary entry: {text}"}
49
- ],
50
- max_tokens=150,
51
- temperature=0.7,
52
- n=1 # Number of completions to generate
53
- )
54
-
55
- # Extract the summary from the response
56
- return response.choices[0].message.content
57
-
58
-
59
  def break_summary_to_activities(text, api_key):
60
  # Initialize the OpenAI client
61
  client = openai.Client(api_key=api_key)
@@ -64,12 +43,13 @@ def break_summary_to_activities(text, api_key):
64
  response = client.chat.completions.create(
65
  model="gpt-4", # Use GPT-4
66
  messages=[
67
- {"role": "user", "content": f"Please break the following summary into four distinct activities, "
68
- f"formatted as 'I am [activity].' Each activity should describe a unique action "
69
- f"and be less than six words: {text}. "
70
- f"Return the four activities as a list in this "
71
- f"format: [activity1, activity2, activity3, activity4], "
72
- f"without any quotation marks or extra text."}
 
73
  ],
74
  max_tokens=150,
75
  temperature=0.7,
@@ -89,9 +69,10 @@ def analyze_writer_image(image_path, api_key):
89
  model = genai.GenerativeModel("gemini-1.5-flash")
90
  myfile = genai.upload_file(image_path)
91
  result = model.generate_content(
92
- [myfile, "\n\n",
93
  "Provide a description of the people in the picture, "
94
- "focusing on their characteristics. Keep it under five words."]
 
95
  )
96
  return result.text
97
 
@@ -127,7 +108,7 @@ def generate_video(activity_list, writer_summary, fps=24): # Lower fps
127
  os.makedirs("videos", exist_ok=True)
128
  video_paths = []
129
  for i, prompt in enumerate(prompts):
130
- video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=fps).frames
131
  video_path = export_to_video(np.squeeze(video_frames, axis=0), output_video_path=f'videos/video{i}.mp4')
132
  video_paths.append(video_path)
133
 
@@ -137,3 +118,4 @@ def generate_video(activity_list, writer_summary, fps=24): # Lower fps
137
 
138
 
139
 
 
 
 
1
  import openai
2
  from google.cloud import vision
3
  import io
 
7
  from diffusers.utils import export_to_video
8
  import numpy as np
9
  import os
10
+ import spaces
11
 
12
 
13
  # Utilize the Google Cloud Vision API to recognize text in the
 
35
  return texts[0].description if texts else ''
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def break_summary_to_activities(text, api_key):
39
  # Initialize the OpenAI client
40
  client = openai.Client(api_key=api_key)
 
43
  response = client.chat.completions.create(
44
  model="gpt-4", # Use GPT-4
45
  messages=[
46
+ {"role": "user",
47
+ "content": f"Please break the following diary into exactly four most important activities. "
48
+ f"Each activity must be formatted as 'I am [activity]' and must describe only one specific action. "
49
+ f"Make sure each activity is distinct and only contains a single action (e.g., no combinations like 'eating and teaching'). "
50
+ f"Additionally, each activity should be no more than six words: {text}. "
51
+ f"Return the four activities as a list in the following format: "
52
+ f"[activity1, activity2, activity3, activity4], without any quotation marks, extra text, or explanations."}
53
  ],
54
  max_tokens=150,
55
  temperature=0.7,
 
69
  model = genai.GenerativeModel("gemini-1.5-flash")
70
  myfile = genai.upload_file(image_path)
71
  result = model.generate_content(
72
+ [myfile,
73
  "Provide a description of the people in the picture, "
74
+ "focusing on their characteristics. Keep it under five words "
75
+ "and ensure the description does not contain any line breaks, extra spaces, or unnecessary characters at the end."]
76
  )
77
  return result.text
78
 
 
108
  os.makedirs("videos", exist_ok=True)
109
  video_paths = []
110
  for i, prompt in enumerate(prompts):
111
+ video_frames = pipe(prompt, num_inference_steps=60, height=320, width=576, num_frames=fps).frames
112
  video_path = export_to_video(np.squeeze(video_frames, axis=0), output_video_path=f'videos/video{i}.mp4')
113
  video_paths.append(video_path)
114
 
 
118
 
119
 
120
 
121
+