import io
import os

import google.generativeai as genai
import openai
import spaces
import torch
from diffusers import AutoPipelineForText2Image
from google.cloud import vision
from google.oauth2 import service_account
# Utilize the Google Cloud Vision API to recognize text in the
# input images (diary images), https://cloud.google.com/vision.
def detect_text_in_image(image_path, credentials):
    # Create a Vision API client using the credentials
    client = vision.ImageAnnotatorClient(credentials=credentials)

    # Open the image file and read its raw bytes
    with io.open(image_path, 'rb') as image_file:
        content = image_file.read()

    # Create an image object for the Vision API
    image = vision.Image(content=content)

    # Use the Vision API to detect text
    response = client.text_detection(image=image)
    texts = response.text_annotations

    # Check for errors in the response
    if response.error.message:
        raise Exception(f'{response.error.message}')

    # Return the detected text or an empty string
    return texts[0].description if texts else ''
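
# A minimal usage sketch (hypothetical file names; assumes a Google Cloud
# service-account key saved as "service_account.json" and a photographed
# diary page saved as "diary_page.jpg"):
#
#   creds = service_account.Credentials.from_service_account_file(
#       "service_account.json")
#   diary_text = detect_text_in_image("diary_page.jpg", creds)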
# Conduct NLP tasks such as text summarization and condensing on the diary
# text. The task points to the PaLM 2 Bison for Text model,
# https://ai.google.dev/palm_docs/palm, but this implementation uses the
# OpenAI chat completion API (GPT-4) for the same purpose.
def summarize_diary_text(text, api_key):
    # Initialize the OpenAI client
    client = openai.Client(api_key=api_key)

    # Use the client to call the chat completion API
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Summarize the following diary entry: {text}"}
        ],
        max_tokens=150,
        temperature=0.7,
        n=1  # Number of completions to generate
    )

    # Extract the summary from the response
    return response.choices[0].message.content
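
# A minimal usage sketch (assumes the key is exposed via the standard
# OPENAI_API_KEY environment variable):
#
#   summary = summarize_diary_text(diary_text, os.environ["OPENAI_API_KEY"])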
# Utilize the Gemini API to input an image of the diary writer,
# and output a textual description of the image,
# https://ai.google.dev/gemini-api/docs/models/gemini.
# Only the predominant color is asked of Gemini; the rest of the
# description is a fixed template built around that color.
def analyze_writer_image(image_path, api_key):
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")

    # Upload the image and ask for its predominant color in one word
    myfile = genai.upload_file(image_path)
    result = model.generate_content(
        [myfile, "\n\n", "What is the predominant color of the person in the image? Answer in 1 word."]
    )
    color = result.text.strip()

    description = f"""
    The writer is a cartoonish, fluffy cat with large, expressive blue eyes.
    Its fur is predominantly {color}, with subtle shading on certain parts of its body in a slightly darker or lighter shade of {color}.
    The face is round with soft, slightly pointed ears that are highlighted with an inner coloring also in {color}.
    The most prominent feature of the cat is its extremely fluffy, oversized tail, which arcs gracefully above its body.
    The tail fur is thick, feathery, and has a luxurious texture that stands out against the rest of the body, showcasing
    a gradient effect from darker to lighter shades of {color} at the edges.
    The cat’s paws are small and round, with shading in a slightly darker shade of {color}.
    The overall look of the figure is cute, gentle, and meticulously detailed, emphasizing a soft and playful appearance.
    """
    return description
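
# A minimal usage sketch (hypothetical file name; assumes the Gemini key is
# in the GOOGLE_API_KEY environment variable):
#
#   writer_description = analyze_writer_image("writer.jpg",
#                                             os.environ["GOOGLE_API_KEY"])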
# Now that you have text from the diary and text describing the diary writer,
# you can utilize the SDXL-Turbo stable diffusion model to generate
# images, https://huggingface.co/stabilityai/sdxl-turbo.
# You can try to output several images for a diary entry, analyze how
# accurate the results are, and think about what could be improved.
@spaces.GPU
def generate_comic_book(diary_text, writer_description, num_pages=4):
    # Load the SDXL-Turbo text-to-image pipeline in half precision
    pipe = AutoPipelineForText2Image.from_pretrained(
        "stabilityai/sdxl-turbo",
        torch_dtype=torch.float16,
        variant="fp16",
        cache_dir="./SDXL-Turbo"
    )

    # Check for available device: CUDA, MPS, or CPU. (On Hugging Face Spaces
    # the @spaces.GPU decorator provisions a CUDA device.)
    if torch.cuda.is_available():
        device = "cuda"
        print("Using CUDA backend.")
    elif torch.backends.mps.is_available():
        device = "mps"
        print("Using MPS backend.")
    else:
        device = "cpu"
        print("CUDA and MPS not available. Falling back to CPU.")

    # Move the model to the selected device
    pipe.to(device)

    # Create a directory to store the comic book images
    os.makedirs("comic_book", exist_ok=True)

    # Split diary text into multiple segments/scenes for comic book pages
    diary_scenes = diary_text.split('.')[:num_pages]  # Split by periods, limiting to `num_pages`

    # Iterate over each scene, generating a page for each one
    for i, scene in enumerate(diary_scenes):
        prompt = (f'Comic Book Style: \n'
                  f'Actor Description: {writer_description} \n'
                  f'Diary Scene: {scene.strip()}\n'
                  f'Generate a cartoon image to represent this diary scene.')
        print(f"Generating comic page {i + 1} with prompt:\n{prompt}\n")

        # Generate the image. SDXL-Turbo is distilled for 1-4 inference steps
        # and was trained without classifier-free guidance, so the model card
        # recommends guidance_scale=0.0.
        image = pipe(prompt=prompt, num_inference_steps=4, guidance_scale=0.0).images[0]

        # Save the generated image
        image_path = f"comic_book/page_{i + 1}.png"
        image.save(image_path)
        print(f"Page {i + 1} saved as {image_path}")

    print("Comic book generation complete!")