Create app.py
app.py
ADDED
@@ -0,0 +1,594 @@
import os
import uuid

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
from torchvision import transforms

from diffusers import FluxFillPipeline, FluxTransformer2DModel
from diffusers.utils import check_min_version, load_image

# Hugging Face repo id; the transformer weights live in its "transformer" subfolder.
# (A repo id may contain only one slash, so the subfolder is passed separately below.)
WEIGHT_PATH = "dielz/textfux-test"
# scheduler = "overshoot"  # overshoot or default
scheduler = "default"


def read_words_from_text(input_text):
    """
    Reads a word list:
    - If input_text is a file path, reads all non-empty lines from the file.
    - Otherwise, splits the input directly by newlines into a list.
    """
    if isinstance(input_text, str) and os.path.exists(input_text):
        with open(input_text, 'r', encoding='utf-8') as f:
            words = [line.strip() for line in f if line.strip()]
    else:
        words = [line.strip() for line in input_text.splitlines() if line.strip()]
    return words

def generate_prompt(words):
    words_str = ', '.join(f"'{word}'" for word in words)
    prompt_template = (
        "The pair of images highlights some white words on a black background, as well as their style on a real-world scene image. "
        "[IMAGE1] is a template image rendering the text, with the words {words}; "
        "[IMAGE2] shows the text content {words} naturally and correspondingly integrated into the image."
    )
    return prompt_template.format(words=words_str)
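# Example (hypothetical input): generate_prompt(["OPEN", "24H"]) fills the word
# list into the template twice, yielding "... with the words 'OPEN', '24H'; ...
# [IMAGE2] shows the text content 'OPEN', '24H' naturally and correspondingly
# integrated into the image."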

prompt_template2 = (
    "The pair of images highlights some white words on a black background, as well as their style on a real-world scene image. "
    "[IMAGE1] is a template image rendering the text, with the words; "
    "[IMAGE2] shows the text content naturally and correspondingly integrated into the image."
)

PIPE = None
def load_flux_pipeline():
    global PIPE
    if PIPE is None:
        transformer = FluxTransformer2DModel.from_pretrained(
            WEIGHT_PATH,
            subfolder="transformer",
            torch_dtype=torch.bfloat16
        )
        PIPE = FluxFillPipeline.from_pretrained(
            "black-forest-labs/FLUX.1-Fill-dev",
            transformer=transformer,
            torch_dtype=torch.bfloat16
        ).to("cuda")
        PIPE.transformer.to(torch.bfloat16)
    return PIPE

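# A minimal sketch of an alternative loading path for smaller GPUs (an
# assumption, not part of the original app): diffusers pipelines support
# enable_model_cpu_offload(), which keeps each sub-model on the CPU until it
# is needed instead of moving everything to CUDA up front:
#
#     pipe = FluxFillPipeline.from_pretrained(
#         "black-forest-labs/FLUX.1-Fill-dev",
#         transformer=transformer,
#         torch_dtype=torch.bfloat16,
#     )
#     pipe.enable_model_cpu_offload()  # replaces .to("cuda")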
def run_inference(image_input, mask_input, words_input, num_steps=50, guidance_scale=30, seed=42):
    """
    Invokes the Flux model pipeline for inference:
    - Both image_input and mask_input must be the concatenated composite images.
    - Image dimensions are adjusted automatically to multiples of 32 to meet model input requirements.
    - A prompt is generated from the word list and passed to the pipeline.
    """
    if isinstance(image_input, str):
        inpaint_image = load_image(image_input).convert("RGB")
    else:
        inpaint_image = image_input.convert("RGB")
    if isinstance(mask_input, str):
        extended_mask = load_image(mask_input).convert("RGB")
    else:
        extended_mask = mask_input.convert("RGB")
    width, height = inpaint_image.size
    new_width = (width // 32) * 32
    new_height = (height // 32) * 32
    inpaint_image = inpaint_image.resize((new_width, new_height))
    extended_mask = extended_mask.resize((new_width, new_height))
    words = read_words_from_text(words_input)
    prompt = generate_prompt(words)
    print("Generated prompt:", prompt)
    # Note: these tensors are built but never used; the pipeline call below
    # consumes the PIL images directly.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5])
    ])
    mask_transform = transforms.Compose([
        transforms.ToTensor()
    ])
    image_tensor = transform(inpaint_image)
    mask_tensor = mask_transform(extended_mask)
    generator = torch.Generator(device="cuda").manual_seed(int(seed))
    pipe = load_flux_pipeline()

    if scheduler == "overshoot":
        try:
            from diffusers import StochasticRFOvershotDiscreteScheduler
            # Bind the new scheduler to a distinct name: assigning to `scheduler`
            # here would shadow the module-level flag and make the check above
            # raise UnboundLocalError.
            overshoot_scheduler = StochasticRFOvershotDiscreteScheduler.from_config(pipe.scheduler.config)
            overshot_func = lambda t, dt: t + dt

            pipe.scheduler = overshoot_scheduler
            pipe.scheduler.set_c(2.0)
            pipe.scheduler.set_overshot_func(overshot_func)
        except ImportError:
            print("StochasticRFOvershotDiscreteScheduler not found. Please ensure you are using the repo's diffusers fork.")

    result = pipe(
        height=new_height,
        width=new_width,
        image=inpaint_image,
        mask_image=extended_mask,
        num_inference_steps=num_steps,
        generator=generator,
        max_sequence_length=512,
        guidance_scale=guidance_scale,
        prompt=prompt_template2,
        prompt_2=prompt,
    ).images[0]

    return result

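# A minimal usage sketch (hypothetical file names), assuming composite image
# and mask files prepared as described in the docstring:
#
#     out = run_inference("composite.png", "composite_mask.png", "hello\nworld",
#                         num_steps=30, guidance_scale=30, seed=42)
#     out.save("result.png")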
# =============================================================================
# Normal Mode: Direct Inference Call
# =============================================================================
def flux_demo_normal(image, mask, words, steps, guidance_scale, seed):
    """
    Gradio main function for normal mode:
    - Passes the input image, mask, and word list directly to run_inference.
    - Returns the generated result image.
    """
    result = run_inference(image, mask, words, num_steps=steps, guidance_scale=guidance_scale, seed=seed)
    return result

# =============================================================================
# Helper functions for both single-line and multi-line rendering
# =============================================================================
def extract_mask(original, drawn, threshold=30):
    """
    Extracts a binary mask from the original image and the user-drawn image:
    - If 'drawn' is a dictionary containing a non-empty "mask" entry, that mask is binarized directly.
    - Otherwise, the mask is recovered by inverting the drawn image and
      thresholding its per-pixel difference against the original.
    """
    if isinstance(drawn, dict):
        if "mask" in drawn and drawn["mask"] is not None:
            drawn_mask = np.array(drawn["mask"]).astype(np.uint8)
            if drawn_mask.ndim == 3:
                drawn_mask = cv2.cvtColor(drawn_mask, cv2.COLOR_RGB2GRAY)
            _, binary_mask = cv2.threshold(drawn_mask, 50, 255, cv2.THRESH_BINARY)
            return Image.fromarray(binary_mask).convert("RGB")
        else:
            drawn_img = np.array(drawn["image"]).astype(np.uint8)
            drawn = 255 - drawn_img  # fall through to the difference method below
    orig_arr = np.array(original).astype(np.int16)
    drawn_arr = np.array(drawn).astype(np.int16)
    diff = np.abs(drawn_arr - orig_arr)
    diff_gray = np.mean(diff, axis=-1)
    binary_mask = (diff_gray > threshold).astype(np.uint8) * 255
    return Image.fromarray(binary_mask).convert("RGB")

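# Example (hypothetical objects): Gradio's sketch tool passes a dict, so the
# fast path applies:
#
#     mask = extract_mask(original_pil, {"image": drawn_pil, "mask": sketch_mask_pil})
#
# Sketch-mask pixels brighter than 50 become white (255) in the returned RGB
# mask; everything else becomes black.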
def get_next_seq_number():
    """
    Finds the next available sequential number (format: 0001, 0002, ...) in the
    'outputs_my' directory: the first XXXX for which 'result_XXXX.png' does not
    yet exist is returned as a formatted string.
    """
    counter = 1
    while True:
        seq_str = f"{counter:04d}"
        result_path = os.path.join("outputs_my", f"result_{seq_str}.png")
        if not os.path.exists(result_path):
            return seq_str
        counter += 1

# =============================================================================
# Single-line text rendering functions
# =============================================================================
def draw_glyph_flexible(font, text, width, height, max_font_size=140):
    """
    Renders text horizontally centered on a canvas of the given size and returns a PIL Image.
    The font size is scaled automatically to fit the canvas, capped at max_font_size.
    """
    img = Image.new(mode='RGB', size=(width, height), color='black')
    if not text or not text.strip():
        return img
    draw = ImageDraw.Draw(img)

    # Measure the text at a reference size to derive the scale ratio.
    g_size = 50
    try:
        new_font = font.font_variant(size=g_size)
    except AttributeError:  # bitmap fallback fonts have no font_variant
        new_font = font

    left, top, right, bottom = new_font.getbbox(text)
    text_width_initial = max(right - left, 1)
    text_height_initial = max(bottom - top, 1)

    # Scale so the text fills at most 90% of the canvas in each dimension.
    width_ratio = width * 0.9 / text_width_initial
    height_ratio = height * 0.9 / text_height_initial
    ratio = min(width_ratio, height_ratio)

    # Allow a larger cap for wide source images.
    if width > 1280:
        max_font_size = 200
    final_font_size = int(g_size * ratio)
    final_font_size = min(final_font_size, max_font_size)  # apply upper limit

    # Use the final calculated font size.
    try:
        final_font = font.font_variant(size=max(final_font_size, 10))
    except AttributeError:
        final_font = font

    draw.text((width / 2, height / 2), text, font=final_font, fill='white', anchor='mm')
    return img

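# Worked example of the sizing math (hypothetical numbers): on a 1024x160
# canvas, text measuring 500x40 px at the reference size of 50 gives
# width_ratio = 1024*0.9/500 ≈ 1.84 and height_ratio = 160*0.9/40 = 3.6, so
# ratio ≈ 1.84 and the final font size is int(50 * 1.84) = 92, under the
# 140 px cap.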
# =============================================================================
# Multi-line text rendering functions
# =============================================================================
def insert_spaces(text, num_spaces):
    """
    Inserts a specified number of spaces between each character to adjust the spacing during text rendering.
    """
    if len(text) <= 1:
        return text
    return (' ' * num_spaces).join(list(text))

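# Example: insert_spaces("abc", 2) returns "a  b  c"; single characters are
# returned unchanged.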

def draw_glyph2(
    font,
    text,
    polygon,
    vertAng=10,
    scale=1,
    width=512,
    height=512,
    add_space=True,
    scale_factor=2,
    rotate_resample=Image.Resampling.BICUBIC,
    downsample_resample=Image.Resampling.LANCZOS
):
    big_w = width * scale_factor
    big_h = height * scale_factor

    # Fit a rotated rectangle around the (upscaled) mask polygon.
    big_polygon = polygon * scale_factor * scale
    rect = cv2.minAreaRect(big_polygon.astype(np.float32))
    box = cv2.boxPoints(rect)
    box = np.intp(box)

    w, h = rect[1]
    angle = rect[2]
    if angle < -45:
        angle += 90
    angle = -angle
    if w < h:
        angle += 90

    # Treat near-axis-aligned tall regions as vertical text.
    vert = False
    if (abs(angle) % 90 < vertAng or abs(90 - abs(angle) % 90) % 90 < vertAng):
        _w = max(box[:, 0]) - min(box[:, 0])
        _h = max(box[:, 1]) - min(box[:, 1])
        if _h >= _w:
            vert = True
            angle = 0

    big_img = Image.new("RGBA", (big_w, big_h), (0, 0, 0, 0))
    tmp = Image.new("RGB", big_img.size, "white")
    tmp_draw = ImageDraw.Draw(tmp)

    _, _, _tw, _th = tmp_draw.textbbox((0, 0), text, font=font)
    if _th == 0:
        text_w = 0
    else:
        w_f, h_f = float(w), float(h)
        text_w = min(w_f, h_f) * (_tw / _th)

    if text_w <= max(w, h):
        # Text fits: pad it with spaces until it roughly spans the region.
        if len(text) > 1 and not vert and add_space:
            for i in range(1, 100):
                text_sp = insert_spaces(text, i)
                _, _, tw2, th2 = tmp_draw.textbbox((0, 0), text_sp, font=font)
                if th2 != 0:
                    if min(w, h) * (tw2 / th2) > max(w, h):
                        break
            text = insert_spaces(text, i - 1)
        font_size = min(w, h) * 0.80
    else:
        # Text is too long: shrink it to fit.
        shrink = 0.75 if vert else 0.85
        if text_w != 0:
            font_size = min(w, h) / (text_w / max(w, h)) * shrink
        else:
            font_size = min(w, h) * 0.80

    new_font = font.font_variant(size=int(font_size))
    left, top, right, bottom = new_font.getbbox(text)
    text_width = right - left
    text_height = bottom - top

    layer = Image.new("RGBA", big_img.size, (0, 0, 0, 0))
    draw_layer = ImageDraw.Draw(layer)
    cx, cy = rect[0]
    if not vert:
        draw_layer.text(
            (cx - text_width // 2, cy - text_height // 2 - top),
            text,
            font=new_font,
            fill=(255, 255, 255, 255)
        )
    else:
        # Vertical text: draw one character at a time, top to bottom.
        _w_ = max(box[:, 0]) - min(box[:, 0])
        x_s = min(box[:, 0]) + _w_ // 2 - text_height // 2
        y_s = min(box[:, 1])
        for c in text:
            draw_layer.text((x_s, y_s), c, font=new_font, fill=(255, 255, 255, 255))
            _, _t, _, _b = new_font.getbbox(c)
            y_s += _b

    rotated_layer = layer.rotate(
        angle,
        expand=True,
        center=(cx, cy),
        resample=rotate_resample
    )

    xo = int((big_img.width - rotated_layer.width) // 2)
    yo = int((big_img.height - rotated_layer.height) // 2)
    big_img.paste(rotated_layer, (xo, yo), rotated_layer)

    # Downsample the supersampled canvas back to the target size.
    final_img = big_img.resize((width, height), downsample_resample)
    final_np = np.array(final_img)
    return final_np

def render_glyph_multi(original, computed_mask, texts):
    # One mask region per text line, sorted top-to-bottom, left-to-right.
    mask_np = np.array(computed_mask.convert("L"))
    contours, _ = cv2.findContours(mask_np, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    regions = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w * h < 50:
            continue
        regions.append((x, y, w, h, cnt))
    regions = sorted(regions, key=lambda r: (r[1], r[0]))

    render_img = Image.new("RGBA", original.size, (0, 0, 0, 0))
    try:
        base_font = ImageFont.truetype("resource/font/Arial-Unicode-Regular.ttf", 40)
    except OSError:
        base_font = ImageFont.load_default()

    for i, region in enumerate(regions):
        if i >= len(texts):
            break
        text = texts[i].strip()
        if not text:
            continue
        cnt = region[4]
        polygon = cnt.reshape(-1, 2)
        rendered_np = draw_glyph2(
            font=base_font,
            text=text,
            polygon=polygon,
            vertAng=10,
            scale=1,
            width=original.size[0],
            height=original.size[1],
            add_space=True,
            scale_factor=1,
            rotate_resample=Image.Resampling.BICUBIC,
            downsample_resample=Image.Resampling.LANCZOS
        )
        rendered_img = Image.fromarray(rendered_np, mode="RGBA")
        render_img = Image.alpha_composite(render_img, rendered_img)
    return render_img.convert("RGB")


def choose_concat_direction(height, width):
    """
    Selects the concatenation direction based on the original image's aspect ratio:
    - If height is greater than width, horizontal concatenation is used.
    - Otherwise, vertical concatenation is used.
    """
    return 'horizontal' if height > width else 'vertical'

def is_multiline_text(text):
    """
    Determines whether the input text should be treated as multi-line, based on line breaks.
    """
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    return len(lines) > 1

# =============================================================================
# Custom Mode: Unified function that handles both single-line and multi-line
# =============================================================================
def flux_demo_custom(original_image, drawn_mask, words, steps, guidance_scale, seed):
    """
    Unified custom-mode Gradio main function:
    - Automatically picks single-line or multi-line rendering based on the input text.
    - Text with line breaks uses multi-line rendering; a single line uses single-line rendering.
    """
    computed_mask = extract_mask(original_image, drawn_mask)

    # Determine rendering mode based on text input
    if is_multiline_text(words):
        print("Using multi-line text rendering mode")
        return flux_demo_custom_multiline(original_image, computed_mask, words, steps, guidance_scale, seed)
    else:
        print("Using single-line text rendering mode")
        return flux_demo_custom_singleline(original_image, computed_mask, words, steps, guidance_scale, seed)

def flux_demo_custom_multiline(original_image, computed_mask, words, steps, guidance_scale, seed):
    """
    Multi-line rendering mode:
    1. Splits the user input into lines, each corresponding to one mask region.
    2. Calls render_glyph_multi to render skewed/curved text into each region.
    3. Chooses the concatenation direction from the original image's dimensions.
    4. Passes the concatenated images to run_inference and returns the generated and cropped results.
    """
    texts = read_words_from_text(words)
    render_img = render_glyph_multi(original_image, computed_mask, texts)
    width, height = original_image.size
    empty_mask = np.zeros((height, width), dtype=np.uint8)
    direction = choose_concat_direction(height, width)
    if direction == 'horizontal':
        combined_image = np.hstack((np.array(render_img), np.array(original_image)))
        combined_mask = np.hstack((empty_mask, np.array(computed_mask.convert("L"))))
    else:
        combined_image = np.vstack((np.array(render_img), np.array(original_image)))
        combined_mask = np.vstack((empty_mask, np.array(computed_mask.convert("L"))))
    combined_mask = cv2.cvtColor(combined_mask, cv2.COLOR_GRAY2RGB)
    composite_image = Image.fromarray(combined_image)
    composite_mask = Image.fromarray(combined_mask)
    result = run_inference(composite_image, composite_mask, words, num_steps=steps, guidance_scale=guidance_scale, seed=seed)

    # Crop the result, keeping only the scene-image portion.
    width, height = result.size
    if direction == 'horizontal':
        cropped_result = result.crop((width // 2, 0, width, height))
    else:
        cropped_result = result.crop((0, height // 2, width, height))

    save_results(result, cropped_result, computed_mask, original_image, composite_image, words)
    return cropped_result, composite_image, composite_mask

def flux_demo_custom_singleline(original_image, computed_mask, words, steps, guidance_scale, seed):
    """
    Single-line rendering mode:
    1. Joins the user input into a single line of text.
    2. Renders that line above the original image.
    3. Runs model inference and crops the result proportionally.
    """
    # Process text, concatenating it into a single line
    text_lines = read_words_from_text(words)
    single_line_text = ' '.join(text_lines)

    # Compute dimensions for the concatenated image and mask
    w, h = original_image.size
    text_height_ratio = 0.15625
    text_render_height = int(w * text_height_ratio)

    # Load font
    try:
        font = ImageFont.truetype("resource/font/Arial-Unicode-Regular.ttf", 60)
    except IOError:
        font = ImageFont.load_default()
        print("Warning: Font not found, using default font.")

    # Render the single-line text image
    text_render_pil = draw_glyph_flexible(font, single_line_text, width=w, height=text_render_height)
    # Create a pure black mask the same size as the text rendering
    text_mask_pil = Image.new("RGB", text_render_pil.size, "black")

    # Always use vertical concatenation
    composite_image = Image.fromarray(np.vstack((np.array(text_render_pil), np.array(original_image))))
    composite_mask = Image.fromarray(np.vstack((np.array(text_mask_pil), np.array(computed_mask))))

    # Call model inference
    full_result = run_inference(composite_image, composite_mask, words, num_steps=steps, guidance_scale=guidance_scale, seed=seed)

    # Crop the result proportionally, keeping only the scene-image portion
    res_w, res_h = full_result.size
    orig_h = h  # original scene-image height
    # Top edge of the crop line
    crop_top_edge = int(res_h * (text_render_height / (orig_h + text_render_height)))
    cropped_result = full_result.crop((0, crop_top_edge, res_w, res_h))

    save_results(full_result, cropped_result, computed_mask, original_image, composite_image, words)
    return cropped_result, composite_image, composite_mask

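# Worked example of the crop math (hypothetical numbers): for a 1000x600 scene,
# text_render_height = int(1000 * 0.15625) = 156, so the composite is 1000x756.
# run_inference snaps this to 992x736, and crop_top_edge =
# int(736 * 156 / 756) = 151, so rows 151..736 of the result are kept as the
# scene portion.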
def save_results(result, cropped_result, computed_mask, original_image, composite_image, words):
    """
    Saves all related images and text files.
    """
    os.makedirs("outputs_my", exist_ok=True)
    os.makedirs("outputs_my/crop", exist_ok=True)
    os.makedirs("outputs_my/mask", exist_ok=True)
    os.makedirs("outputs_my/ori", exist_ok=True)
    os.makedirs("outputs_my/composite", exist_ok=True)
    os.makedirs("outputs_my/txt", exist_ok=True)

    seq = get_next_seq_number()
    result_filename = os.path.join("outputs_my", f"result_{seq}.png")
    crop_filename = os.path.join("outputs_my", "crop", f"crop_{seq}.png")
    mask_filename = os.path.join("outputs_my", "mask", f"mask_{seq}.png")
    ori_filename = os.path.join("outputs_my", "ori", f"ori_{seq}.png")
    composite_filename = os.path.join("outputs_my", "composite", f"composite_{seq}.png")
    txt_filename = os.path.join("outputs_my", "txt", f"words_{seq}.txt")

    # Save images
    result.save(result_filename)
    cropped_result.save(crop_filename)
    computed_mask.save(mask_filename)
    original_image.save(ori_filename)
    composite_image.save(composite_filename)
    with open(txt_filename, "w", encoding="utf-8") as f:
        f.write(words)

# =============================================================================
# Gradio Interface
# =============================================================================
with gr.Blocks(title="Flux Inference Demo") as demo:
    gr.Markdown("## Flux Inference Demo")
    with gr.Tabs():
        with gr.TabItem("Custom Mode"):
            with gr.Row():
                with gr.Column(scale=1, min_width=350):
                    gr.Markdown("### Image Input")
                    original_image_custom = gr.Image(type="pil", label="Upload Original Image")
                    gr.Markdown("### Draw Mask on Image")
                    mask_drawing_custom = gr.Image(type="pil", label="Draw Mask on Original Image", tool="sketch")

                with gr.Column(scale=1, min_width=350):
                    gr.Markdown("### Parameter Settings")
                    words_custom = gr.Textbox(
                        lines=5,
                        placeholder="Enter text here (a single line is recommended: faster and stronger).\nMultiple lines are supported, with each line rendered in the corresponding mask region.",
                        label="Text Input"
                    )
                    steps_custom = gr.Slider(minimum=10, maximum=100, step=1, value=30, label="Inference Steps")
                    guidance_scale_custom = gr.Slider(minimum=1, maximum=50, step=1, value=30, label="Guidance Scale")
                    seed_custom = gr.Number(value=42, label="Random Seed")
                    run_custom = gr.Button("Generate Results")

            with gr.Tabs():
                with gr.TabItem("Generated Results"):
                    output_result_custom = gr.Image(type="pil", label="Generated Results")
                with gr.TabItem("Input Preview"):
                    output_composite_custom = gr.Image(type="pil", label="Concatenated Original Image")
                    output_mask_custom = gr.Image(type="pil", label="Concatenated Mask")

            original_image_custom.change(fn=lambda x: x, inputs=original_image_custom, outputs=mask_drawing_custom)
            run_custom.click(fn=flux_demo_custom,
                             inputs=[original_image_custom, mask_drawing_custom, words_custom, steps_custom, guidance_scale_custom, seed_custom],
                             outputs=[output_result_custom, output_composite_custom, output_mask_custom])

        with gr.TabItem("Normal Mode"):
            with gr.Row():
                with gr.Column(scale=1, min_width=350):
                    gr.Markdown("### Image Input")
                    image_normal = gr.Image(type="pil", label="Image Input")
                    gr.Markdown("### Mask Input")
                    mask_normal = gr.Image(type="pil", label="Mask Input")
                with gr.Column(scale=1, min_width=350):
                    gr.Markdown("### Parameter Settings")
                    words_normal = gr.Textbox(lines=5, placeholder="Please enter words here, one per line", label="Text List")
                    steps_normal = gr.Slider(minimum=10, maximum=100, step=1, value=30, label="Inference Steps")
                    guidance_scale_normal = gr.Slider(minimum=1, maximum=50, step=1, value=30, label="Guidance Scale")
                    seed_normal = gr.Number(value=42, label="Random Seed")
                    run_normal = gr.Button("Generate Results")
                    output_normal = gr.Image(type="pil", label="Generated Results")
            run_normal.click(fn=flux_demo_normal,
                             inputs=[image_normal, mask_normal, words_normal, steps_normal, guidance_scale_normal, seed_normal],
                             outputs=output_normal)

    gr.Markdown(
        """
        ### Instructions
        - **Custom Mode**:
            - Upload an original image, then draw a mask on it
            - **Single-line mode**: Enter text without line breaks; all text is joined and rendered as one line above the image
            - **Multi-line mode**: Enter text with line breaks; each line is rendered in the corresponding mask region with skewed/curved effects
            - The system automatically detects which mode to use based on your text input
        - **Normal Mode**: Directly upload an image, mask, and a list of words to generate the result image.
        """
    )

if __name__ == "__main__":
    check_min_version("0.30.1")
    demo.launch()
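# Note on dependencies (an assumption; this file pins nothing itself): the app
# relies on torch, torchvision, diffusers (>= 0.30.1, per check_min_version),
# gradio, opencv-python, numpy, and Pillow, plus a CUDA device for the
# .to("cuda") calls. The optional "overshoot" scheduler additionally requires
# the project's patched diffusers fork providing StochasticRFOvershotDiscreteScheduler.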