jjz5463 committed
Commit 7907d0c · 1 Parent(s): 78263be
Experiments/Baseline/GUI.py CHANGED
@@ -1,24 +1,49 @@
 import streamlit as st
+import openai
+import json
 from PIL import Image
+from google.oauth2 import service_account
+from baseline_utils import detect_text_in_image, summarize_diary_text, analyze_writer_image, generate_comic_book
 
-# You can create a web or mobile-based GUI so that users can experience your solution. Suggested libraries include https://www.gradio.app/ or https://streamlit.io/.
-st.title('Handwritten Diary to Cartoon Book')
-uploaded_diary = st.file_uploader("Upload your diary image", type=["png", "jpg", "jpeg"])
-uploaded_writer_image = st.file_uploader("Upload your photo", type=["png", "jpg", "jpeg"])
+# Load secrets
+openai_api_key = st.secrets["general"]["openai_api_key"]
+google_service_account_info = json.loads(st.secrets["general"]["google_service_account"])
+gemini_api_key = st.secrets["general"]["gemini_api_key"]
+
+# Initialize OpenAI
+openai.api_key = openai_api_key
+
+# Function to get Google credentials
+def get_google_credentials():
+    return service_account.Credentials.from_service_account_info(google_service_account_info)
+
+st.title('Handwritten Diary to Comic Book')
+uploaded_diary = st.file_uploader("Upload your handwritten diary image", type=["png", "jpg", "jpeg"])
+uploaded_writer_image = st.file_uploader("Upload a photo of the writer", type=["png", "jpg", "jpeg"])
 
 if uploaded_diary and uploaded_writer_image:
-    st.write("Analyzing your diary...")
+    st.write("Analyzing your diary and writer...")
+
+    # Read the uploaded images
+    diary_image = Image.open(uploaded_diary)
+    writer_image = Image.open(uploaded_writer_image)
 
-    diary_text = detect_text_in_image(uploaded_diary)
-    summarized_text = summarize_diary_text(diary_text)
+    # Save uploaded images temporarily (Streamlit does this automatically with file objects)
+    diary_image_path = uploaded_diary.name
+    writer_image_path = uploaded_writer_image.name
 
+    # Detect text from the diary image
+    google_credentials = get_google_credentials()
+    detected_text = detect_text_in_image(diary_image_path, google_credentials)
+    summarized_text = summarize_diary_text(detected_text, openai_api_key)
     st.write(f"Summarized Diary Text: {summarized_text}")
 
-    writer_description = analyze_writer_image(uploaded_writer_image)
-    st.write(f"Diary Writer Description: {writer_description}")
+    # Analyze the writer's image using Gemini API
+    writer_summary = analyze_writer_image(writer_image_path, gemini_api_key)
+    st.write(f"Writer Description: {writer_summary}")
 
-    # Generate cartoon image
-    prompt = f"{summarized_text}, featuring a person who {writer_description}"
-    generated_image = generate_image(prompt)
+    # Generate the comic book based on the summaries
+    st.write("Generating comic book images...")
+    generate_comic_book(summarized_text, writer_summary, num_pages=5)
 
-    st.image(generated_image, caption="Generated Cartoon Image")
+    st.write("Comic book generated successfully!")
Experiments/Baseline/baseline.py CHANGED
@@ -1,12 +1,12 @@
 from baseline_utils import *
 from keys.keys import *
 
-diary_image_path = "images/test_sample.jpeg"
-writer_image_path = "images/writer.jpg"
+diary_image_path = "input_images/test_sample.jpeg"
+writer_image_path = "input_images/writer.jpeg"
 credentials_path = "keys/service_account_credentials.json"
 
 # Detect text from the image using the provided credentials
 detected_text = detect_text_in_image(diary_image_path, credentials_path)
 diary_summary = summarize_diary_text(detected_text, open_ai_keys)
 writer_summary = analyze_writer_image(writer_image_path, gemini_keys)
-generate_image(diary_summary, writer_summary)
+generate_comic_book(diary_summary, writer_summary)
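
Note: this script pulls its API keys from keys/keys.py, which is not included in the commit. Judging only from the two names used above, a hypothetical layout could be:

# keys/keys.py -- hypothetical placeholder; the real file is not part of this diff
open_ai_keys = "sk-..."              # OpenAI API key consumed by summarize_diary_text
gemini_keys = "your-gemini-api-key"  # Gemini API key consumed by analyze_writer_image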
Experiments/Baseline/baseline_utils.py CHANGED
@@ -5,10 +5,10 @@ import io
 import google.generativeai as genai
 from diffusers import AutoPipelineForText2Image
 import torch
-
+import os
 
 # Utilize the Google Cloud Vision API to recognize text in the
-# input images (diary images), https://cloud.google.com/vision.
+# input input_images (diary input_images), https://cloud.google.com/vision.
 def detect_text_in_image(image_path, credentials_path):
     # Load the service account key from the credentials JSON file
     credentials = service_account.Credentials.from_service_account_file(credentials_path)
@@ -66,22 +66,23 @@ def analyze_writer_image(image_path, api_key):
     model = genai.GenerativeModel("gemini-1.5-flash")
     myfile = genai.upload_file(image_path)
     result = model.generate_content(
-        [myfile, "\n\n", "Can you give a textual description of the image?"]
+        [myfile, "\n\n", "Can you give a very short description of the person in the image?"]
     )
     return result.text
 
 
 # Now that you have text from the diary and text describing the diary writer,
 # you can utilize the SDXL-Turbo stable diffusion model to generate
-# images https://huggingface.co/stabilityai/sdxl-turbo.
-# You can try to output several images for a diary entry. Analyze how accurate the results,
+# input_images https://huggingface.co/stabilityai/sdxl-turbo.
+# You can try to output several input_images for a diary entry. Analyze how accurate the results,
 # and think about what could be improved.
-def generate_image(diary_text, writer_description):
+def generate_comic_book(diary_text, writer_description, num_pages=4):
     pipe = AutoPipelineForText2Image.from_pretrained(
         "stabilityai/sdxl-turbo",
         torch_dtype=torch.float16,
         variant="fp16",
-        cache_dir="./SDXL-Turbo")
+        cache_dir="./SDXL-Turbo"
+    )
 
     # Check for available device: CUDA, MPS, or CPU
     if torch.cuda.is_available():
@@ -97,11 +98,28 @@ def generate_image(diary_text, writer_description):
     # Move the model to the selected device
     pipe = pipe.to(device)
 
-    # Generate the image with a simple prompt
-    prompt = f'Writer Description: {writer_description} \n\n Diary: {diary_text}'
-    print(prompt)
-    image = pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]
+    # Create a directory to store the comic book input_images
+    os.makedirs("comic_book", exist_ok=True)
+
+    # Split diary text into multiple segments/scenes for comic book pages
+    diary_scenes = diary_text.split('.')[:num_pages]  # Split by periods, limiting to `num_pages`
+
+    # Iterate over each scene, generating a page for each one
+    for i, scene in enumerate(diary_scenes):
+        prompt = (f'Comic Book Style: \n'
+                  f'Actor Description: {writer_description} \n'
+                  f'Diary Scene: {scene.strip()}\n'
+                  f'Generate an cartoon image to represent this diary scene.')
+
+        print(f"Generating comic page {i + 1} with prompt:\n{prompt}\n")
+
+        # Generate the image
+        image = pipe(prompt=prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+
+        # Save the generated image
+        image_path = f"comic_book/page_{i + 1}.png"
+        image.save(image_path)
+        print(f"Page {i + 1} saved as {image_path}")
 
-    # Save the generated image
-    image.save("generated_image.png")
+    print("Comic book generation complete!")
 
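Note: the first hunk above stops at the credentials line of detect_text_in_image, so the Vision API call itself is not visible in this diff. A rough sketch only, using the standard google-cloud-vision client and the path-based signature shown in the hunk (GUI.py above passes a Credentials object rather than a path, so the two sides of this commit may still need reconciling); everything beyond the credentials line is an assumption:

# Hypothetical completion of detect_text_in_image; the committed body below the
# credentials line is not shown in this diff and may differ.
import io
from google.cloud import vision
from google.oauth2 import service_account

def detect_text_in_image(image_path, credentials_path):
    # Load the service account key from the credentials JSON file
    credentials = service_account.Credentials.from_service_account_file(credentials_path)
    # Build a Vision client authenticated with those credentials
    client = vision.ImageAnnotatorClient(credentials=credentials)
    # Read the diary image bytes and run text detection (OCR)
    with io.open(image_path, "rb") as image_file:
        content = image_file.read()
    image = vision.Image(content=content)
    response = client.text_detection(image=image)
    if response.error.message:
        raise RuntimeError(response.error.message)
    annotations = response.text_annotations
    # The first annotation carries the full block of detected text
    return annotations[0].description if annotations else ""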
Experiments/Baseline/images/writer.jpg DELETED
Binary file (364 kB)
 
Experiments/Baseline/{images → input_images}/test_sample.jpeg RENAMED
File without changes
Experiments/Baseline/input_images/writer.jpeg ADDED