jjz5463 committed
Commit 943206a · 1 Parent(s): 2f3080f

4 videos instead of 1

Files changed (2)
  1. app.py +39 -17
  2. baseline_utils.py +54 -93
app.py CHANGED
@@ -1,9 +1,12 @@
 import gradio as gr
 import openai
 import json
-from PIL import Image
 from google.oauth2 import service_account
-from baseline_utils import detect_text_in_image, summarize_diary_text, analyze_writer_image, generate_video
+from baseline_utils import (detect_text_in_image,
+                            summarize_diary_text,
+                            analyze_writer_image,
+                            generate_video,
+                            break_summary_to_activities)
 import os
 
 # Load secrets from Hugging Face Spaces environment
@@ -14,10 +17,12 @@ gemini_api_key = os.getenv("GEMINI_API_KEY")
 # Initialize OpenAI
 openai.api_key = openai_api_key
 
+
 # Function to get Google credentials
 def get_google_credentials():
     return service_account.Credentials.from_service_account_info(google_service_account_info)
 
+
 def process_images(diary_image, writer_image):
     # Save the file-like objects as image files
     diary_image_path = "temp_upload_images/temp_diary_image.png"
@@ -30,34 +35,51 @@ def process_images(diary_image, writer_image):
     google_credentials = get_google_credentials()
     detected_text = detect_text_in_image(diary_image_path, google_credentials)
     summarized_text = summarize_diary_text(detected_text, openai_api_key)
+    activities = break_summary_to_activities(summarized_text, openai_api_key)
+    activity_list = activities.strip('[]').split(', ')
 
     # Analyze the writer's image using Gemini API
     writer_summary = analyze_writer_image(writer_image_path, gemini_api_key)
 
     # Generate the video based on the summaries
-    video_path = generate_video(summarized_text, writer_summary, fps=24)
+    video_paths = generate_video(activity_list, writer_summary, fps=24)
 
-    return video_path
+    return video_paths
 
 
 # Define the Gradio interface
 def gradio_interface(diary_image, writer_image):
-    # Process the images and generate the video
-    generated_video = process_images(diary_image, writer_image)
+    # Process the images and generate the videos
+    video_generator = process_images(diary_image, writer_image)
+
+    # Use streaming to return each video path as it's ready
+    for video_path in video_generator:
+        yield video_path
 
-    # Return the path to the generated video
-    return generated_video
 
 # Set up the Gradio interface
-interface = gr.Interface(
-    fn=gradio_interface,
-    inputs=[
-        gr.Image(label="Upload your handwritten diary image", type="pil"),
-        gr.Image(label="Upload a photo of the writer", type="pil"),
-    ],
-    outputs=gr.Video(label="Generated Video"),
-    title="Handwritten Diary to Video"
-)
+with gr.Blocks() as interface:
+    gr.Markdown("# Handwritten Diary to Video")
+
+    with gr.Row():
+        diary_image_input = gr.Image(label="Upload your handwritten diary image", type="pil")
+        writer_image_input = gr.Image(label="Upload a photo of the writer", type="pil")
+
+    submit_button = gr.Button("Generate Videos")
+
+    with gr.Row():
+        with gr.Column():
+            video_output_1 = gr.Video(label="Generated Video 1")
+            video_output_2 = gr.Video(label="Generated Video 2")
+        with gr.Column():
+            video_output_3 = gr.Video(label="Generated Video 3")
+            video_output_4 = gr.Video(label="Generated Video 4")
+
+    # Use streaming=True to display each video as soon as it's ready
+    submit_button.click(fn=gradio_interface,
+                        inputs=[diary_image_input, writer_image_input],
+                        outputs=[video_output_1, video_output_2, video_output_3, video_output_4],
+                        stream=True)
 
 # Launch the interface
 interface.launch()
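
Note on the streaming handler above: Gradio streams partial results from a generator function, but each yield is expected to supply a value for every component listed in outputs, so a handler that yields one bare path per iteration will not fill all four gr.Video players. Below is a minimal sketch of the tuple-per-yield pattern; it is illustrative only and not part of this commit, and the placeholder generator and component names are assumptions.

import gradio as gr

def fake_video_paths(diary_image, writer_image):
    # Placeholder standing in for process_images(); yields one finished path per step.
    for i in range(4):
        yield f"video_{i + 1}.mp4"  # hypothetical file names

def stream_to_four_outputs(diary_image, writer_image):
    # Keep one slot per gr.Video component and fill the slots as videos finish.
    results = [None] * 4
    for i, path in enumerate(fake_video_paths(diary_image, writer_image)):
        results[i] = path
        yield tuple(results)  # one value per declared output

with gr.Blocks() as demo:
    diary = gr.Image(label="Diary image", type="pil")
    writer = gr.Image(label="Writer photo", type="pil")
    button = gr.Button("Generate Videos")
    videos = [gr.Video(label=f"Generated Video {i + 1}") for i in range(4)]
    button.click(fn=stream_to_four_outputs, inputs=[diary, writer], outputs=videos)

demo.launch()

The committed handler instead relays one bare path per yield and passes stream=True to click(); if only the first player updates with the installed Gradio version, switching to the tuple-per-yield pattern above is one way to address it.
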
baseline_utils.py CHANGED
@@ -1,20 +1,18 @@
 import openai
 from google.cloud import vision
-from google.oauth2 import service_account
 import io
 import google.generativeai as genai
-from diffusers import AutoPipelineForText2Image, DiffusionPipeline
+from diffusers import DiffusionPipeline
 import torch
-import os
-from moviepy.editor import ImageSequenceClip
 from diffusers.utils import export_to_video
+import numpy as np
 import spaces
 
+
 # Utilize the Google Cloud Vision API to recognize text in the
 # input input_images (diary input_images), https://cloud.google.com/vision.
 def detect_text_in_image(image_path, credentials):
 
-    # Create a Vision API client using the credentials
     client = vision.ImageAnnotatorClient(credentials=credentials)
 
     # Open the image file
@@ -46,7 +44,6 @@ def summarize_diary_text(text, api_key):
     response = client.chat.completions.create(
         model="gpt-4",  # Use GPT-4
         messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Summarize the following diary entry: {text}"}
         ],
         max_tokens=150,
@@ -58,6 +55,30 @@ def summarize_diary_text(text, api_key):
     return response.choices[0].message.content
 
 
+def break_summary_to_activities(text, api_key):
+    # Initialize the OpenAI client
+    client = openai.Client(api_key=api_key)
+
+    # Use the client to call the chat completion API
+    response = client.chat.completions.create(
+        model="gpt-4",  # Use GPT-4
+        messages=[
+            {"role": "user", "content": f"Please break the following summary into four distinct activities, "
+                                        f"formatted as 'I am [activity].' Each activity should describe a unique action "
+                                        f"and be less than six words: {text}. "
+                                        f"Return the four activities as a list in this "
+                                        f"format: [activity1, activity2, activity3, activity4], "
+                                        f"without any quotation marks or extra text."}
+        ],
+        max_tokens=150,
+        temperature=0.7,
+        n=1  # Number of completions to generate
+    )
+
+    # Extract the summary from the response
+    return response.choices[0].message.content
+
+
 # Utilize the Gemini 1.0 Pro Vision to input an image of the diary writer,
 # and output a textual description of the image,
 # https://ai.google.dev/gemini-api/docs/models/gemini.
@@ -67,104 +88,44 @@ def analyze_writer_image(image_path, api_key):
     model = genai.GenerativeModel("gemini-1.5-flash")
     myfile = genai.upload_file(image_path)
     result = model.generate_content(
-        [myfile, "\n\n", "Can you give a very short description of the person in the image?"]
+        [myfile, "\n\n",
+         "Provide a description of the people in the picture, "
+         "focusing on their characteristics. Keep it under five words."]
     )
     return result.text
 
 
-# Now that you have text from the diary and text describing the diary writer,
-# you can utilize the SDXL-Turbo stable diffusion model to generate
-# input_images https://huggingface.co/stabilityai/sdxl-turbo.
-# You can try to output several input_images for a diary entry. Analyze how accurate the results,
-# and think about what could be improved.
-# def generate_comic_book(diary_text, writer_description, num_pages=4):
-#     pipe = AutoPipelineForText2Image.from_pretrained(
-#         "stabilityai/sdxl-turbo",
-#         torch_dtype=torch.float16,
-#         variant="fp16",
-#         cache_dir="./SDXL-Turbo"
-#     )
-#
-#     # Check for available device: CUDA, MPS, or CPU
-#     if torch.cuda.is_available():
-#         device = "cuda"
-#         print("Using CUDA backend.")
-#     elif torch.backends.mps.is_available():
-#         device = "mps"
-#         print("Using MPS backend.")
-#     else:
-#         device = "cpu"
-#         print("CUDA and MPS not available. Falling back to CPU.")
-#
-#     # Move the model to the selected device
-#     pipe = pipe.to(device)
-#
-#     # Create a directory to store the comic book input_images
-#     os.makedirs("comic_book", exist_ok=True)
-#
-#     # Split diary text into multiple segments/scenes for comic book pages
-#     diary_scenes = diary_text.split('.')[:num_pages]  # Split by periods, limiting to `num_pages`
-#
-#     # Iterate over each scene, generating a page for each one
-#     for i, scene in enumerate(diary_scenes):
-#         prompt = (f'Comic Book Style: \n'
-#                   f'Actor Description: {writer_description} \n'
-#                   f'Diary Scene: {scene.strip()}\n'
-#                   f'Generate an cartoon image to represent this diary scene.')
-#
-#         print(f"Generating comic page {i + 1} with prompt:\n{prompt}\n")
-#
-#         # Generate the image
-#         image = pipe(prompt=prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
-#
-#         # Save the generated image
-#         image_path = f"comic_book/page_{i + 1}.png"
-#         image.save(image_path)
-#         print(f"Page {i + 1} saved as {image_path}")
-#
-#     print("Comic book generation complete!")
-
-
-def truncate_prompt(prompt, max_tokens=77):
-    tokens = prompt.split()
-    if len(tokens) > max_tokens:
-        return " ".join(tokens[:max_tokens])
-    return prompt
-
 @spaces.GPU
-def generate_video(diary_text, writer_description, fps=15):  # Lower fps
+def generate_video(activity_list, writer_summary, fps=24):
     # Load the Zeroscope video generation model
     pipe = DiffusionPipeline.from_pretrained(
         "cerspense/zeroscope_v2_576w",  # Zeroscope model from Hugging Face
-        torch_dtype=torch.float16
+        torch_dtype=torch.float16,
+        cache_dir="./zeroscope"
     )
 
     # Check for available device: CUDA, MPS, or CPU
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    if torch.cuda.is_available():
+        device = "cuda"
+        print("Using CUDA backend.")
+    elif torch.backends.mps.is_available():
+        device = "mps"
+        print("Using MPS backend.")
+    else:
+        device = "cpu"
+        print("CUDA and MPS not available. Falling back to CPU.")
     pipe = pipe.to(device)
 
-    # Define the total number of frames needed for a 15-second video at the given fps
-    total_frames = 10 * fps
-
     # Combine the diary text and writer description for a cohesive prompt
-    prompt = (f"Actor Description: {writer_description}\n"
-              f"Diary Scene: {diary_text.strip()}\n"
-              f"Generate a 15-second video based on this scene.")
-
-    # Truncate the prompt to fit the CLIP token limit
-    prompt = truncate_prompt(prompt)
-
-    # Generate the video frames
-    video_frames = pipe(
-        prompt=prompt,
-        num_inference_steps=1,  # Minimum inference steps to reduce computation
-        height=128,  # Reduce resolution as much as possible
-        width=128,
-        num_frames=total_frames  # Number of frames stays the same to keep video length
-    ).frames
-
-    # Save the video
-    video_path = export_to_video(video_frames)
-    print(f"Video generation complete! Saved as {video_path}")
-
-    return video_path
+    prompts = []
+    for activity in activity_list:
+        prompt = writer_summary.strip('.').capitalize() + ' is' + activity[4:]
+        prompts.append(prompt)
+
+    # Stream video results as soon as they are ready
+    for i, prompt in enumerate(prompts):
+        video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=fps).frames
+        video_path = export_to_video(np.squeeze(video_frames, axis=0), output_video_path=f'video_{i + 1}.mp4')

+        # Yield the path for each video
+        yield video_path
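
Note on the prompt construction in generate_video above: it assumes break_summary_to_activities returns a bracketed, comma-separated list and that every activity starts with "I am", since activity[4:] drops those four characters. A standalone worked example of that parsing step follows; the strings are made up and it is not part of this commit.

# Worked example of the activity parsing and prompt construction used above.
raw_reply = "[I am walking the dog, I am cooking dinner, I am reading a book, I am riding a bike]"
writer_summary = "A young woman with glasses."

# Same parsing as app.py: strip the brackets, then split on ", ".
activity_list = raw_reply.strip('[]').split(', ')
# -> ['I am walking the dog', 'I am cooking dinner', 'I am reading a book', 'I am riding a bike']

# Same prompt construction as generate_video: drop the trailing period from the
# writer summary, then splice in the activity minus its leading "I am".
prompts = [writer_summary.strip('.').capitalize() + ' is' + activity[4:]
           for activity in activity_list]

print(prompts[0])  # A young woman with glasses is walking the dog

If the model wraps items in quotes or adds extra text, the simple strip-and-split produces strings that no longer start with "I am" and the slicing misfires; the prompt in break_summary_to_activities explicitly asks for no quotation marks or extra text to reduce that risk.
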