jjz5463 committed on
Commit
060072e
·
1 Parent(s): 1778d8f

better video quality, simplify prompts, text-to-video

Browse files
Files changed (2) hide show
  1. app.py +1 -2
  2. baseline_utils.py +13 -31
app.py CHANGED
@@ -34,8 +34,7 @@ def process_images(diary_image, writer_image):
34
  # Detect text from the diary image
35
  google_credentials = get_google_credentials()
36
  detected_text = detect_text_in_image(diary_image_path, google_credentials)
37
- summarized_text = summarize_diary_text(detected_text, openai_api_key)
38
- activities = break_summary_to_activities(summarized_text, openai_api_key)
39
  activity_list = activities.strip('[]').split(', ')
40
 
41
  # Analyze the writer's image using Gemini API
 
34
  # Detect text from the diary image
35
  google_credentials = get_google_credentials()
36
  detected_text = detect_text_in_image(diary_image_path, google_credentials)
37
+ activities = break_summary_to_activities(detected_text, openai_api_key)
 
38
  activity_list = activities.strip('[]').split(', ')
39
 
40
  # Analyze the writer's image using Gemini API
baseline_utils.py CHANGED
@@ -1,4 +1,3 @@
1
- import spaces
2
  import openai
3
  from google.cloud import vision
4
  import io
@@ -8,6 +7,7 @@ import torch
8
  from diffusers.utils import export_to_video
9
  import numpy as np
10
  import os
 
11
 
12
 
13
  # Utilize the Google Cloud Vision API to recognize text in the
@@ -35,27 +35,6 @@ def detect_text_in_image(image_path, credentials):
35
  return texts[0].description if texts else ''
36
 
37
 
38
- # Utilize the PaLM 2 Bison for Text model to conduct NLP tasks such as
39
- # text summarization and condensing on the diary text, https://ai.google.dev/palm_docs/palm.
40
- def summarize_diary_text(text, api_key):
41
- # Initialize the OpenAI client
42
- client = openai.Client(api_key=api_key)
43
-
44
- # Use the client to call the chat completion API
45
- response = client.chat.completions.create(
46
- model="gpt-4", # Use GPT-4
47
- messages=[
48
- {"role": "user", "content": f"Summarize the following diary entry: {text}"}
49
- ],
50
- max_tokens=150,
51
- temperature=0.7,
52
- n=1 # Number of completions to generate
53
- )
54
-
55
- # Extract the summary from the response
56
- return response.choices[0].message.content
57
-
58
-
59
  def break_summary_to_activities(text, api_key):
60
  # Initialize the OpenAI client
61
  client = openai.Client(api_key=api_key)
@@ -64,12 +43,13 @@ def break_summary_to_activities(text, api_key):
64
  response = client.chat.completions.create(
65
  model="gpt-4", # Use GPT-4
66
  messages=[
67
- {"role": "user", "content": f"Please break the following summary into four distinct activities, "
68
- f"formatted as 'I am [activity].' Each activity should describe a unique action "
69
- f"and be less than six words: {text}. "
70
- f"Return the four activities as a list in this "
71
- f"format: [activity1, activity2, activity3, activity4], "
72
- f"without any quotation marks or extra text."}
 
73
  ],
74
  max_tokens=150,
75
  temperature=0.7,
@@ -89,9 +69,10 @@ def analyze_writer_image(image_path, api_key):
89
  model = genai.GenerativeModel("gemini-1.5-flash")
90
  myfile = genai.upload_file(image_path)
91
  result = model.generate_content(
92
- [myfile, "\n\n",
93
  "Provide a description of the people in the picture, "
94
- "focusing on their characteristics. Keep it under five words."]
 
95
  )
96
  return result.text
97
 
@@ -127,7 +108,7 @@ def generate_video(activity_list, writer_summary, fps=24): # Lower fps
127
  os.makedirs("videos", exist_ok=True)
128
  video_paths = []
129
  for i, prompt in enumerate(prompts):
130
- video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=fps).frames
131
  video_path = export_to_video(np.squeeze(video_frames, axis=0), output_video_path=f'videos/video{i}.mp4')
132
  video_paths.append(video_path)
133
 
@@ -137,3 +118,4 @@ def generate_video(activity_list, writer_summary, fps=24): # Lower fps
137
 
138
 
139
 
 
 
 
1
  import openai
2
  from google.cloud import vision
3
  import io
 
7
  from diffusers.utils import export_to_video
8
  import numpy as np
9
  import os
10
+ import spaces
11
 
12
 
13
  # Utilize the Google Cloud Vision API to recognize text in the
 
35
  return texts[0].description if texts else ''
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def break_summary_to_activities(text, api_key):
39
  # Initialize the OpenAI client
40
  client = openai.Client(api_key=api_key)
 
43
  response = client.chat.completions.create(
44
  model="gpt-4", # Use GPT-4
45
  messages=[
46
+ {"role": "user",
47
+ "content": f"Please break the following diary into exactly four most important activities. "
48
+ f"Each activity must be formatted as 'I am [activity]' and must describe only one specific action. "
49
+ f"Make sure each activity is distinct and only contains a single action (e.g., no combinations like 'eating and teaching'). "
50
+ f"Additionally, each activity should be no more than six words: {text}. "
51
+ f"Return the four activities as a list in the following format: "
52
+ f"[activity1, activity2, activity3, activity4], without any quotation marks, extra text, or explanations."}
53
  ],
54
  max_tokens=150,
55
  temperature=0.7,
 
69
  model = genai.GenerativeModel("gemini-1.5-flash")
70
  myfile = genai.upload_file(image_path)
71
  result = model.generate_content(
72
+ [myfile,
73
  "Provide a description of the people in the picture, "
74
+ "focusing on their characteristics. Keep it under five words "
75
+ "and ensure the description does not contain any line breaks, extra spaces, or unnecessary characters at the end."]
76
  )
77
  return result.text
78
 
 
108
  os.makedirs("videos", exist_ok=True)
109
  video_paths = []
110
  for i, prompt in enumerate(prompts):
111
+ video_frames = pipe(prompt, num_inference_steps=60, height=320, width=576, num_frames=fps).frames
112
  video_path = export_to_video(np.squeeze(video_frames, axis=0), output_video_path=f'videos/video{i}.mp4')
113
  video_paths.append(video_path)
114
 
 
118
 
119
 
120
 
121
+