jjz5463 committed
Commit f16ac55 · 1 Parent(s): f921536

initial commit

Files changed (4)
  1. README.md +0 -2
  2. app.py +65 -0
  3. baseline_utils.py +169 -0
  4. requirements.txt +10 -0
README.md CHANGED
@@ -8,5 +8,3 @@ sdk_version: 4.44.1
  app_file: app.py
  pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py ADDED
@@ -0,0 +1,65 @@
+ import gradio as gr
+ import openai
+ import json
+ from PIL import Image
+ from google.oauth2 import service_account
+ from baseline_utils import detect_text_in_image, summarize_diary_text, analyze_writer_image, generate_video
+ import os
+ from keys.keys import *
+
+ # Load secrets from the environment or other sources (adjust as needed)
+ openai_api_key = open_ai_keys
+ with open('keys/service_account_credentials.json') as f:
+     google_service_account_info = json.load(f)
+ gemini_api_key = gemini_keys
+
+ # Initialize OpenAI
+ openai.api_key = openai_api_key
+
+ # Function to get Google credentials
+ def get_google_credentials():
+     return service_account.Credentials.from_service_account_info(google_service_account_info)
+
+ def process_images(diary_image, writer_image):
+     # Save the file-like objects as image files
+     diary_image_path = "temp_upload_images/temp_diary_image.png"
+     writer_image_path = "temp_upload_images/temp_writer_image.png"
+     os.makedirs("temp_upload_images", exist_ok=True)
+     diary_image.save(diary_image_path)
+     writer_image.save(writer_image_path)
+
+     # Detect text from the diary image
+     google_credentials = get_google_credentials()
+     detected_text = detect_text_in_image(diary_image_path, google_credentials)
+     summarized_text = summarize_diary_text(detected_text, openai_api_key)
+
+     # Analyze the writer's image using the Gemini API
+     writer_summary = analyze_writer_image(writer_image_path, gemini_api_key)
+
+     # Generate the video based on the summaries
+     video_path = generate_video(summarized_text, writer_summary, fps=24)
+
+     return video_path
+
+
+ # Define the Gradio interface
+ def gradio_interface(diary_image, writer_image):
+     # Process the images and generate the video
+     generated_video = process_images(diary_image, writer_image)
+
+     # Return the path to the generated video
+     return generated_video
+
+ # Set up the Gradio interface
+ interface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[
+         gr.Image(label="Upload your handwritten diary image", type="pil"),
+         gr.Image(label="Upload a photo of the writer", type="pil"),
+     ],
+     outputs=gr.Video(label="Generated Video"),
+     title="Handwritten Diary to Video"
+ )
+
+ # Launch the interface
+ interface.launch()
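
Note: app.py reads `open_ai_keys` and `gemini_keys` from a `keys` package that is not part of this commit (see `from keys.keys import *` above). A minimal sketch of what `keys/keys.py` could look like, assuming the secrets are supplied via environment variables; only the two variable names are taken from the code above, everything else is hypothetical:

import os

# Hypothetical keys/keys.py -- the real module is not included in this commit.
# Reading from environment variables keeps the secrets out of the repository.
open_ai_keys = os.environ.get("OPENAI_API_KEY", "")   # consumed as openai_api_key in app.py
gemini_keys = os.environ.get("GEMINI_API_KEY", "")    # consumed as gemini_api_key in app.py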
baseline_utils.py ADDED
@@ -0,0 +1,169 @@
+ import openai
+ from google.cloud import vision
+ from google.oauth2 import service_account
+ import io
+ import google.generativeai as genai
+ from diffusers import AutoPipelineForText2Image, DiffusionPipeline
+ import torch
+ import os
+ from moviepy.editor import ImageSequenceClip
+ from diffusers.utils import export_to_video
+
+ # Utilize the Google Cloud Vision API to recognize text in the
+ # input images (diary pages), https://cloud.google.com/vision.
+ def detect_text_in_image(image_path, credentials):
+
+     # Create a Vision API client using the credentials
+     client = vision.ImageAnnotatorClient(credentials=credentials)
+
+     # Open the image file
+     with io.open(image_path, 'rb') as image_file:
+         content = image_file.read()
+
+     # Create an image object for the Vision API
+     image = vision.Image(content=content)
+
+     # Use the Vision API to detect text
+     response = client.text_detection(image=image)
+     texts = response.text_annotations
+
+     # Check for errors in the response
+     if response.error.message:
+         raise Exception(f'{response.error.message}')
+
+     # Return the detected text or an empty string
+     return texts[0].description if texts else ''
+
+
+ # Utilize the PaLM 2 Bison for Text model to conduct NLP tasks such as
+ # text summarization and condensing on the diary text, https://ai.google.dev/palm_docs/palm.
+ # (In practice, the function below calls the OpenAI chat completions API with GPT-4.)
+ def summarize_diary_text(text, api_key):
+     # Initialize the OpenAI client
+     client = openai.Client(api_key=api_key)
+
+     # Use the client to call the chat completions API
+     response = client.chat.completions.create(
+         model="gpt-4",  # Use GPT-4
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant."},
+             {"role": "user", "content": f"Summarize the following diary entry: {text}"}
+         ],
+         max_tokens=150,
+         temperature=0.7,
+         n=1  # Number of completions to generate
+     )
+
+     # Extract the summary from the response
+     return response.choices[0].message.content
+
+
+ # Utilize Gemini to input an image of the diary writer
+ # and output a textual description of the image,
+ # https://ai.google.dev/gemini-api/docs/models/gemini.
+ # (The function below calls the gemini-1.5-flash model via google-generativeai.)
+ def analyze_writer_image(image_path, api_key):
+     genai.configure(api_key=api_key)
+     model = genai.GenerativeModel("gemini-1.5-flash")
+     myfile = genai.upload_file(image_path)
+     result = model.generate_content(
+         [myfile, "\n\n", "Can you give a very short description of the person in the image?"]
+     )
+     return result.text
+
+
+ # Now that you have text from the diary and text describing the diary writer,
+ # you can utilize the SDXL-Turbo stable diffusion model to generate images,
+ # https://huggingface.co/stabilityai/sdxl-turbo.
+ # You can try to output several images for a diary entry, analyze how accurate
+ # the results are, and think about what could be improved.
+ # def generate_comic_book(diary_text, writer_description, num_pages=4):
+ #     pipe = AutoPipelineForText2Image.from_pretrained(
+ #         "stabilityai/sdxl-turbo",
+ #         torch_dtype=torch.float16,
+ #         variant="fp16",
+ #         cache_dir="./SDXL-Turbo"
+ #     )
+ #
+ #     # Check for available device: CUDA, MPS, or CPU
+ #     if torch.cuda.is_available():
+ #         device = "cuda"
+ #         print("Using CUDA backend.")
+ #     elif torch.backends.mps.is_available():
+ #         device = "mps"
+ #         print("Using MPS backend.")
+ #     else:
+ #         device = "cpu"
+ #         print("CUDA and MPS not available. Falling back to CPU.")
+ #
+ #     # Move the model to the selected device
+ #     pipe = pipe.to(device)
+ #
+ #     # Create a directory to store the comic book images
+ #     os.makedirs("comic_book", exist_ok=True)
+ #
+ #     # Split diary text into multiple segments/scenes for comic book pages
+ #     diary_scenes = diary_text.split('.')[:num_pages]  # Split by periods, limiting to `num_pages`
+ #
+ #     # Iterate over each scene, generating a page for each one
+ #     for i, scene in enumerate(diary_scenes):
+ #         prompt = (f'Comic Book Style: \n'
+ #                   f'Actor Description: {writer_description} \n'
+ #                   f'Diary Scene: {scene.strip()}\n'
+ #                   f'Generate a cartoon image to represent this diary scene.')
+ #
+ #         print(f"Generating comic page {i + 1} with prompt:\n{prompt}\n")
+ #
+ #         # Generate the image
+ #         image = pipe(prompt=prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+ #
+ #         # Save the generated image
+ #         image_path = f"comic_book/page_{i + 1}.png"
+ #         image.save(image_path)
+ #         print(f"Page {i + 1} saved as {image_path}")
+ #
+ #     print("Comic book generation complete!")
+
+
+ # Trim the prompt to roughly the CLIP text encoder's 77-token limit (approximated by words).
+ def truncate_prompt(prompt, max_tokens=77):
+     tokens = prompt.split()
+     if len(tokens) > max_tokens:
+         return " ".join(tokens[:max_tokens])
+     return prompt
+
+
+ def generate_video(diary_text, writer_description, fps=15):  # Lower fps
+     # Load the Zeroscope video generation model
+     pipe = DiffusionPipeline.from_pretrained(
+         "cerspense/zeroscope_v2_576w",  # Zeroscope model from Hugging Face
+         torch_dtype=torch.float16
+     )
+
+     # Check for available device: CUDA or CPU
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     pipe = pipe.to(device)
+
+     # Define the total number of frames needed for a 15-second video at the given fps
+     total_frames = 15 * fps
+
+     # Combine the diary text and writer description for a cohesive prompt
+     prompt = (f"Actor Description: {writer_description}\n"
+               f"Diary Scene: {diary_text.strip()}\n"
+               f"Generate a 15-second video based on this scene.")
+
+     # Truncate the prompt to fit the CLIP token limit
+     prompt = truncate_prompt(prompt)
+
+     # Generate the video frames
+     video_frames = pipe(
+         prompt=prompt,
+         num_inference_steps=25,  # Reduce inference steps
+         height=320,  # Lower the resolution to save memory
+         width=576,
+         num_frames=total_frames
+     ).frames
+
+     # Save the video
+     video_path = export_to_video(video_frames)
+     print(f"Video generation complete! Saved as {video_path}")
+
+     return video_path
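
For reference, a hypothetical standalone invocation of these utilities outside the Gradio app; the file paths and API key placeholders below are illustrative, not part of the commit:

from google.oauth2 import service_account
from baseline_utils import detect_text_in_image, summarize_diary_text, analyze_writer_image, generate_video

# Placeholder credential and input paths -- adjust to your environment.
credentials = service_account.Credentials.from_service_account_file("keys/service_account_credentials.json")
diary_text = detect_text_in_image("temp_upload_images/temp_diary_image.png", credentials)
summary = summarize_diary_text(diary_text, "YOUR_OPENAI_API_KEY")
writer_description = analyze_writer_image("temp_upload_images/temp_writer_image.png", "YOUR_GEMINI_API_KEY")
print(generate_video(summary, writer_description, fps=15))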
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ openai
+ google-cloud-vision
+ google-auth
+ google-generativeai
+ diffusers
+ torch
+ streamlit
+ transformers
+ accelerate
+ moviepy
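
For local use outside the Space, the dependencies would typically be installed with `pip install -r requirements.txt` before launching `app.py`; note that `gradio` and `Pillow`, which app.py imports, are not listed here, presumably because the Space's Gradio SDK image provides them.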