File size: 5,490 Bytes
78263be
 
 
 
 
 
 
7907d0c
adad2af
78263be
 
7907d0c
868e527
78263be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebe5082
 
78263be
ebe5082
 
 
 
 
 
 
 
 
 
 
 
 
78263be
 
 
 
7907d0c
 
78263be
adad2af
7907d0c
78263be
 
48fe872
 
80db327
7907d0c
78263be
bef0550
 
 
 
 
 
 
 
 
 
78263be
 
adad2af
78263be
7907d0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78263be
7907d0c
78263be
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import openai
from google.cloud import vision
from google.oauth2 import service_account
import io
import google.generativeai as genai
from diffusers import AutoPipelineForText2Image
import torch
import os
import spaces

# Utilize the Google Cloud Vision API to recognize text in the
# input images (diary images), https://cloud.google.com/vision.
def detect_text_in_image(image_path, credentials):
    """Run Google Cloud Vision text detection on an image file.

    Args:
        image_path: Path of the image file whose bytes are sent to the API.
        credentials: Credentials object passed to
            ``vision.ImageAnnotatorClient``.

    Returns:
        The ``description`` of the first text annotation in the response,
        or ``''`` when the response contains no text annotations.

    Raises:
        Exception: If the response carries a non-empty error message.
    """
    # Build the Vision client from the supplied credentials
    annotator = vision.ImageAnnotatorClient(credentials=credentials)

    # Read the raw image bytes
    with open(image_path, 'rb') as fh:
        raw_bytes = fh.read()

    # Wrap the bytes and request text detection in one go
    result = annotator.text_detection(image=vision.Image(content=raw_bytes))
    annotations = result.text_annotations

    # Surface any API-side error to the caller
    error_message = result.error.message
    if error_message:
        raise Exception(f'{error_message}')

    # Nothing recognised: report an empty string
    if not annotations:
        return ''
    return annotations[0].description


# Conduct NLP tasks such as text summarization and condensing on the diary text.
# Originally planned around the PaLM 2 Bison for Text model (https://ai.google.dev/palm_docs/palm);
# the implementation below calls OpenAI's GPT-4 chat completion API instead.
def summarize_diary_text(text, api_key):
    """Summarize a diary entry through the OpenAI chat completion API.

    Args:
        text: Diary text to be summarized.
        api_key: API key used to construct the OpenAI client.

    Returns:
        The message content of the first completion choice.
    """
    # Set up the OpenAI client
    llm = openai.Client(api_key=api_key)

    # Conversation sent to the model: a system primer plus the user request
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Summarize the following diary entry: {text}"},
    ]

    # Request a single GPT-4 completion
    request_options = {
        "model": "gpt-4",
        "messages": conversation,
        "max_tokens": 150,
        "temperature": 0.7,
        "n": 1,
    }
    completion = llm.chat.completions.create(**request_options)

    # Pull the summary text out of the first choice
    first_choice = completion.choices[0]
    return first_choice.message.content


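# Utilize a Gemini vision-capable model (the code below uses gemini-1.5-flash;
# the original plan named Gemini 1.0 Pro Vision) to input an image of the diary writer,
# and output a textual description of the image,
# https://ai.google.dev/gemini-api/docs/models/gemini.
# Simplified example: only the predominant color is taken from Gemini; the rest
# of the description is a fixed template.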
def analyze_writer_image(image_path, api_key):
    """Describe the diary writer as a cartoon cat coloured after the photo.

    The image is uploaded to Gemini, which is asked for the predominant
    color of the person in it. That single word is substituted into a fixed
    template describing a fluffy cartoon cat; nothing else from the image
    influences the returned description.

    Args:
        image_path: Path of the writer's image, passed to
            ``genai.upload_file``.
        api_key: API key passed to ``genai.configure``.

    Returns:
        A multi-line description string with the detected color filled in.

    Raises:
        ValueError: Propagated from ``response.text`` when the model returned
            no usable text (e.g. the response was blocked).
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")
    myfile = genai.upload_file(image_path)
    response = model.generate_content(
        [myfile, "\n\n", "What is the predominant color of the person in the image? Answer in 1 word."]
    )

    # Interpolate the answer text, not the response object: formatting the
    # response itself would embed its repr in the description. The model
    # often answers like "White.\n", so trim whitespace and a trailing period.
    color = response.text.strip().rstrip('.')

    description = f"""
    The writer is a cartoonish, fluffy cat with large, expressive blue eyes. 
    Its fur is predominantly {color}, with subtle shading on certain parts of its body in a slightly darker or lighter shade of {color}.
    The face is round with soft, slightly pointed ears that are highlighted with an inner coloring also in {color}.

    The most prominent feature of the cat is its extremely fluffy, oversized tail, which arcs gracefully above its body. 
    The tail fur is thick, feathery, and has a luxurious texture that stands out against the rest of the body, showcasing 
    a gradient effect from darker to lighter shades of {color} at the edges. 

    The cat’s paws are small and round, with shading in a slightly darker shade of {color}. 
    The overall look of the figure is cute, gentle, and meticulously detailed, emphasizing a soft and playful appearance.
    """
    return description


# Now that you have text from the diary and text describing the diary writer,
# you can utilize the SDXL-Turbo stable diffusion model to generate
# images, https://huggingface.co/stabilityai/sdxl-turbo.
# You can try to output several images for a diary entry. Analyze how accurate the results are,
# and think about what could be improved.
def _split_into_scenes(diary_text, num_pages):
    """Return at most ``num_pages`` non-empty, stripped sentences of the text."""
    stripped = (fragment.strip() for fragment in diary_text.split('.'))
    return [fragment for fragment in stripped if fragment][:num_pages]


@spaces.GPU
def generate_comic_book(diary_text, writer_description, num_pages=4, *,
                        num_inference_steps=30, guidance_scale=7.5):
    """Generate one comic-book page image per diary sentence with SDXL-Turbo.

    The diary text is split on periods; each non-empty sentence (up to
    ``num_pages``) becomes a prompt combined with ``writer_description``.
    Every generated image is saved as ``comic_book/page_<n>.png`` relative to
    the current working directory, and progress is printed to stdout.

    Args:
        diary_text: Diary text to illustrate.
        writer_description: Textual description of the diary writer, inserted
            into every prompt.
        num_pages: Maximum number of pages (sentences) to illustrate.
        num_inference_steps: Denoising steps passed to the pipeline. The
            default keeps the previously hard-coded value.
            NOTE(review): the SDXL-Turbo model card recommends 1-4 steps.
        guidance_scale: Guidance scale passed to the pipeline. The default
            keeps the previously hard-coded value.
            NOTE(review): the SDXL-Turbo model card recommends 0.0.

    Returns:
        None. The results are the PNG files written to ``comic_book/``.
    """
    # Create a directory to store the comic book images
    os.makedirs("comic_book", exist_ok=True)

    # Split diary text into scenes, dropping blank fragments (e.g. the empty
    # piece after a trailing period) so they do not consume a page slot.
    diary_scenes = _split_into_scenes(diary_text, num_pages)
    if not diary_scenes:
        # Avoid loading the model when there is nothing to draw.
        print("No diary scenes to illustrate; nothing generated.")
        return

    pipe = AutoPipelineForText2Image.from_pretrained(
        "stabilityai/sdxl-turbo",
        torch_dtype=torch.float16,
        variant="fp16",
        cache_dir="./SDXL-Turbo"
    )

    # Move the model to the GPU
    pipe.to('cuda')

    # Iterate over each scene, generating a page for each one
    for page_number, scene in enumerate(diary_scenes, start=1):
        prompt = (f'Comic Book Style: \n'
                  f'Actor Description: {writer_description} \n'
                  f'Diary Scene: {scene}\n'
                  f'Generate an cartoon image to represent this diary scene.')

        print(f"Generating comic page {page_number} with prompt:\n{prompt}\n")

        # Generate the image
        image = pipe(
            prompt=prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
        ).images[0]

        # Save the generated image
        image_path = f"comic_book/page_{page_number}.png"
        image.save(image_path)
        print(f"Page {page_number} saved as {image_path}")

    print("Comic book generation complete!")