better video quality, simplify prompts, text-to-video
- app.py +1 -2
- baseline_utils.py +13 -31
app.py CHANGED
@@ -34,8 +34,7 @@ def process_images(diary_image, writer_image):
     # Detect text from the diary image
     google_credentials = get_google_credentials()
     detected_text = detect_text_in_image(diary_image_path, google_credentials)
-
-    activities = break_summary_to_activities(summarized_text, openai_api_key)
+    activities = break_summary_to_activities(detected_text, openai_api_key)
     activity_list = activities.strip('[]').split(', ')

     # Analyze the writer's image using Gemini API
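Note: this one-line app.py change works because the OCR text is now fed straight to GPT-4; the intermediate summarize_diary_text() step is deleted from baseline_utils.py below. A minimal sketch of the resulting flow (helper names taken from this repo; where get_google_credentials() is defined is not shown in the diff):

    # Sketch of the simplified flow after this commit (error handling omitted).
    def extract_activities(diary_image_path, openai_api_key):
        google_credentials = get_google_credentials()  # assumed helper, as called in app.py
        detected_text = detect_text_in_image(diary_image_path, google_credentials)
        # OCR text goes directly to GPT-4 -- no summarization pass anymore.
        activities = break_summary_to_activities(detected_text, openai_api_key)
        return activities.strip('[]').split(', ')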
baseline_utils.py CHANGED
@@ -1,4 +1,3 @@
-import spaces
 import openai
 from google.cloud import vision
 import io
@@ -8,6 +7,7 @@ import torch
 from diffusers.utils import export_to_video
 import numpy as np
 import os
+import spaces


 # Utilize the Google Cloud Vision API to recognize text in the
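The spaces import is moved to the end of the import block rather than dropped; on ZeroGPU Spaces this package provides the @spaces.GPU decorator that reserves a GPU for a decorated call. A minimal usage sketch (whether this Space actually decorates generate_video is not visible in the diff):

    import spaces

    @spaces.GPU  # requests a ZeroGPU slot for the duration of this call
    def generate_video(activity_list, writer_summary, fps=24):
        ...  # GPU-bound diffusion work happens here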
@@ -35,27 +35,6 @@ def detect_text_in_image(image_path, credentials):
     return texts[0].description if texts else ''


-# Utilize the PaLM 2 Bison for Text model to conduct NLP tasks such as
-# text summarization and condensing on the diary text, https://ai.google.dev/palm_docs/palm.
-def summarize_diary_text(text, api_key):
-    # Initialize the OpenAI client
-    client = openai.Client(api_key=api_key)
-
-    # Use the client to call the chat completion API
-    response = client.chat.completions.create(
-        model="gpt-4",  # Use GPT-4
-        messages=[
-            {"role": "user", "content": f"Summarize the following diary entry: {text}"}
-        ],
-        max_tokens=150,
-        temperature=0.7,
-        n=1  # Number of completions to generate
-    )
-
-    # Extract the summary from the response
-    return response.choices[0].message.content
-
-
 def break_summary_to_activities(text, api_key):
     # Initialize the OpenAI client
     client = openai.Client(api_key=api_key)
@@ -64,12 +43,13 @@ def break_summary_to_activities(text, api_key):
     response = client.chat.completions.create(
         model="gpt-4",  # Use GPT-4
         messages=[
-            {"role": "user",
-
-
-
-
-
+            {"role": "user",
+             "content": f"Please break the following diary into exactly four most important activities. "
+                        f"Each activity must be formatted as 'I am [activity]' and must describe only one specific action. "
+                        f"Make sure each activity is distinct and only contains a single action (e.g., no combinations like 'eating and teaching'). "
+                        f"Additionally, each activity should be no more than six words: {text}. "
+                        f"Return the four activities as a list in the following format: "
+                        f"[activity1, activity2, activity3, activity4], without any quotation marks, extra text, or explanations."}
         ],
         max_tokens=150,
         temperature=0.7,
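The rewritten prompt pins the reply to a bare [activity1, activity2, activity3, activity4] string because app.py parses it with activities.strip('[]').split(', '). That parse is brittle if the model adds quotes or uneven spacing; a more defensive variant (hypothetical helper, not part of this commit) could look like:

    def parse_activities(raw, expected=4):
        # Tolerate stray whitespace and quotation marks around each item.
        items = [item.strip().strip('"\'') for item in raw.strip().strip('[]').split(',')]
        items = [item for item in items if item]
        if len(items) != expected:
            raise ValueError(f'expected {expected} activities, got {len(items)}: {raw!r}')
        return items

    # parse_activities('[I am running, I am cooking, I am reading, I am sleeping]')
    # -> ['I am running', 'I am cooking', 'I am reading', 'I am sleeping']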
@@ -89,9 +69,10 @@ def analyze_writer_image(image_path, api_key):
     model = genai.GenerativeModel("gemini-1.5-flash")
     myfile = genai.upload_file(image_path)
     result = model.generate_content(
-        [myfile,
+        [myfile,
          "Provide a description of the people in the picture, "
-         "focusing on their characteristics. Keep it under five words
+         "focusing on their characteristics. Keep it under five words "
+         "and ensure the description does not contain any line breaks, extra spaces, or unnecessary characters at the end."]
     )
     return result.text

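Here the cleanup is pushed into the prompt (no line breaks, extra spaces, or trailing characters). A belt-and-braces alternative (not in this commit) is to normalize the model output in code, since generated text can still violate formatting instructions:

    def clean_description(text):
        # Collapse newlines and runs of whitespace, then trim the ends.
        return ' '.join(text.split())

    # clean_description('a tall man \nwith glasses  ') -> 'a tall man with glasses'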
@@ -127,7 +108,7 @@ def generate_video(activity_list, writer_summary, fps=24):  # Lower fps
     os.makedirs("videos", exist_ok=True)
     video_paths = []
     for i, prompt in enumerate(prompts):
-        video_frames = pipe(prompt, num_inference_steps=
+        video_frames = pipe(prompt, num_inference_steps=60, height=320, width=576, num_frames=fps).frames
         video_path = export_to_video(np.squeeze(video_frames, axis=0), output_video_path=f'videos/video{i}.mp4')
         video_paths.append(video_path)

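The diff shows only the pipe() call. With num_frames=fps, each clip has 24 frames at a fixed 320x576 resolution, and the 60 inference steps account for the "better video quality" in the commit title. A sketch of the surrounding setup, assuming a standard diffusers text-to-video checkpoint (the actual model loaded by this Space is not shown here):

    import numpy as np
    import torch
    from diffusers import DiffusionPipeline
    from diffusers.utils import export_to_video

    # Checkpoint is an assumption; damo-vilab/text-to-video-ms-1.7b is a common choice.
    pipe = DiffusionPipeline.from_pretrained(
        'damo-vilab/text-to-video-ms-1.7b', torch_dtype=torch.float16
    ).to('cuda')

    frames = pipe('I am running', num_inference_steps=60,
                  height=320, width=576, num_frames=24).frames
    # frames carries a leading batch axis; squeeze it before encoding to mp4.
    export_to_video(np.squeeze(frames, axis=0), output_video_path='videos/video0.mp4')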
@@ -137,3 +118,4 @@ def generate_video(activity_list, writer_summary, fps=24):  # Lower fps



+