Spaces:

arj7192
/

NativDemo

Building on Zero

App Files Files Community

arj7192 committed on Feb 10

Commit

36f24b2

verified ·

1 Parent(s): 264c783

Delete fonts/app.py

Browse files

Files changed (1) hide show

fonts/app.py +0 -575

fonts/app.py DELETED Viewed

@@ -1,575 +0,0 @@
-import spaces
-import os
-import gradio as gr
-import easyocr
-import numpy as np
-import cv2
-import base64
-import torch
-from shapely import Polygon
-from ultralytics import YOLO
-from io import BytesIO
-from openai import OpenAI
-from PIL import Image, ImageDraw, ImageFont
-from diffusers.utils import load_image, check_min_version
-from controlnet_flux import FluxControlNetModel
-from transformer_flux import FluxTransformer2DModel
-from pipeline_flux_controlnet_inpaint import FluxControlNetInpaintingPipeline
-import huggingface_hub
# Log in to the Hugging Face Hub with the token from HF_TOKEN_FLUX before any
# model weights are requested below.
huggingface_hub.login(os.getenv('HF_TOKEN_FLUX'))

# Speech-bubble detector, loaded from a local weights file.
bubble_detection_model = YOLO("speech_bubble_model.pt")

# Maps the source-language names shown in the UI to EasyOCR language codes.
language_to_ocr = {
    'Simplified Chinese': 'ch_sim',
    'Traditional Chinese': 'ch_tra',
    'Korean': 'ko',
    'Japanese': 'ja',
    'English': 'en',
}

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

MARKDOWN = """
# Made by Nativ
"""

check_min_version("0.30.2")

# The keyword must be spelled `torch_dtype`; the previous `torch_dytpe` typo
# meant the requested bfloat16 dtype was never passed to the loader.
transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev", subfolder='transformer', torch_dtype=torch.bfloat16
)
cuda_device = torch.device("cuda")

# Build pipeline
controlnet = FluxControlNetModel.from_pretrained(
    "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
    torch_dtype=torch.bfloat16,
)
pipe = FluxControlNetInpaintingPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    controlnet=controlnet,
    transformer=transformer,
    torch_dtype=torch.bfloat16
).to(cuda_device)
pipe.transformer.to(torch.bfloat16)
pipe.controlnet.to(torch.bfloat16)
def localize_boxes(merged_results, img_boxes, source_language, target_language):
    """Translate the detected text boxes with GPT-4o.

    Args:
        merged_results: list of ``{'box': ..., 'text': ...}`` dictionaries; it
            is embedded verbatim in the prompt.
        img_boxes: PIL image sent along with the prompt as visual context.
        source_language: name of the language the text is written in.
        target_language: name of the language to translate into.

    Returns:
        The parsed list of ``{'box', 'text'}`` dictionaries from the model, or
        ``merged_results`` unchanged when the reply cannot be parsed.
    """
    # Local imports: only this function needs them.
    import ast
    import json

    # Convert image to base64
    buffered = BytesIO()
    img_boxes.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode()
    print(merged_results)
    prompt = f"""You are an expert translator and localization specialist with deep understanding of both {source_language} and {target_language} cultures.
    Task: Translate the detected text while preserving the cultural context and maintaining visual harmony. Make the results in capital letters.
    Source Text and Coordinates:
    {merged_results}
    Requirements:
    1. Maintain the original meaning and tone while adapting to {target_language} cultural context
    2. Keep translations concise and visually balanced (similar character length when possible)
    3. Preserve any:
    - Brand names
    - Product names
    - Technical terms
    - Numbers and units
    4. Consider the visual context from the provided image
    5. Use appropriate formality level for {target_language}
    6. Maintain any special formatting (if present)
    Format your response EXACTLY as a JSON-like list of dictionaries. Keep the box coordinates EXACTLY as they are, do not change them, only translate the text.
    [{{'box': [[x0, y0], [x1, y0], [x1, y1], [x0, y1]], 'text': 'translated_text'}}]
    Important: Only output the JSON format above, no explanations or additional text."""
    client = OpenAI(api_key=OPENAI_API_KEY)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{img_str}"
                        }
                    }
                ]
            }
        ],
        max_tokens=1000,
        temperature=0
    )
    try:
        translation_text = response.choices[0].message.content
        translation_text = translation_text.replace("```json", "").replace("```", "").strip()
        # The reply is model-generated text and must never be executed:
        # literal_eval only accepts Python literals (the prompt asks for
        # single-quoted dictionaries); strict JSON is tried as a fallback.
        try:
            translated_results = ast.literal_eval(translation_text)
        except (ValueError, SyntaxError):
            translated_results = json.loads(translation_text)
        # Callers index every entry with ['box'] and ['text'], so reject any
        # other shape here and fall back to the untranslated input.
        well_formed = isinstance(translated_results, list) and all(
            isinstance(item, dict) and 'box' in item and 'text' in item
            for item in translated_results
        )
        if not well_formed:
            raise ValueError("response is not a list of {'box', 'text'} dictionaries")
        return translated_results
    except Exception as e:
        # Deliberate best-effort: any parsing problem keeps the original text.
        print(f"Error parsing GPT-4o response: {e}")
        return merged_results
def merge_boxes(boxes, image_shape, distance_threshold=10):
    """Merge boxes that are close to each other and return their associated text.

    Args:
        boxes: detections where ``item[0]`` is the polygon (list of [x, y]
            points) and ``item[1]`` is the recognised text.
        image_shape: shape of the source image; only ``(height, width)`` from
            the first two entries is used.
        distance_threshold: side length, in pixels, of the square dilation
            kernel used to connect neighbouring boxes.

    Returns:
        A list of ``{'box': [[x0, y0], [x1, y0], [x1, y1], [x0, y1]], 'text': str}``
        dictionaries, one per connected group. Coordinates are plain ``int``
        values; texts are joined with spaces in the input order of the boxes.
    """
    if not boxes:
        return []

    height, width = image_shape[:2]
    polygons = [np.array(box[0], dtype=np.int32) for box in boxes]
    texts = [box[1] for box in boxes]

    # Rasterise every box onto one mask, then dilate so nearby boxes touch.
    mask = np.zeros((height, width), dtype=np.uint8)
    for pts in polygons:
        cv2.fillPoly(mask, [pts], 255)
    kernel = np.ones((distance_threshold, distance_threshold), np.uint8)
    dilated = cv2.dilate(mask, kernel, iterations=1)
    num_labels, labels = cv2.connectedComponents(dilated)

    # Assign each original box to the component that covers it. Each box is
    # rasterised on its own so its pixels are never confused with those of
    # other boxes (the shared, cumulative mask used to cause exactly that).
    members = {}
    for idx, pts in enumerate(polygons):
        box_mask = np.zeros((height, width), dtype=np.uint8)
        cv2.fillPoly(box_mask, [pts], 255)
        covered = labels[box_mask > 0]
        covered = covered[covered > 0]
        if covered.size:  # Empty when the box lies entirely outside the image
            label = int(np.bincount(covered).argmax())
            members.setdefault(label, []).append(idx)

    merged_results = []
    for label in range(1, num_labels):  # Skip background (0)
        ys, xs = np.where(labels == label)
        if not len(ys):
            continue
        # Bounding rectangle with a small padding, as plain ints so the result
        # prints as ordinary numbers.
        x0 = max(0, int(xs.min()) - 2)
        y0 = max(0, int(ys.min()) - 2)
        x1 = min(width, int(xs.max()) + 2)
        y1 = min(height, int(ys.max()) + 2)
        combined_text = ' '.join(texts[idx] for idx in members.get(label, []))
        merged_results.append({
            'box': [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
            'text': combined_text
        })
    return merged_results
def is_box_inside_yolo(box, yolo_boxes, overlap_threshold=0.5):
    """
    Check if a text box is inside any of the YOLO-detected speech bubbles.

    box: [[x0,y0], [x1,y0], [x1,y1], [x0,y1]]
    yolo_boxes: sequence of YOLO boxes in xywh format (centre x, centre y,
        width, height)
    overlap_threshold: minimum fraction of the text box's area that must lie
        inside a bubble for the text to count as inside it

    Returns True as soon as one bubble reaches the threshold, otherwise False.
    """
    # Nothing to compare against (len() also works for numpy arrays).
    if len(yolo_boxes) == 0:
        return False

    text_poly = Polygon(box)
    text_area = text_poly.area
    # A degenerate box has no area; avoid dividing by zero below.
    if text_area <= 0:
        return False

    for yolo_box in yolo_boxes:
        x_center, y_center, width, height = yolo_box
        x1, y1 = x_center - width / 2, y_center - height / 2
        x2, y2 = x_center + width / 2, y_center + height / 2
        bubble_poly = Polygon([[x1, y1], [x2, y1], [x2, y2], [x1, y2]])
        if not text_poly.intersects(bubble_poly):
            continue
        overlap_ratio = text_poly.intersection(bubble_poly).area / text_area
        if overlap_ratio >= overlap_threshold:
            return True
    return False
def remove_text_regions(image, boxes, yolo_boxes):
    """Blank out text inside speech bubbles and build a mask of all text.

    Every detected box is painted onto a 4-channel mask. On a copy of the
    image, only boxes that ``is_box_inside_yolo`` accepts are painted white.
    Returns ``(image_copy_after_BGR2RGB_swap, mask)``.
    """
    white = (255, 255, 255, 255)
    rows, cols = image.shape[0], image.shape[1]
    cleaned = image.copy()
    mask = np.zeros((rows, cols, 4), dtype=np.uint8)
    for detection in boxes:
        quad = detection[0]
        polygon = [np.array(quad, dtype=np.int32)]
        # The mask records every text region, bubble or not.
        cv2.fillPoly(mask, polygon, white)
        if is_box_inside_yolo(quad, yolo_boxes):
            cv2.fillPoly(cleaned, polygon, white)
    return cv2.cvtColor(cleaned, cv2.COLOR_BGR2RGB), mask
def _wrap_words_to_width(words, font, draw, max_width):
    """Greedily pack words into lines whose rendered width stays within max_width."""
    lines = []
    current = []
    for word in words:
        candidate = " ".join(current + [word])
        if current and draw.textlength(candidate, font=font) > max_width:
            lines.append(" ".join(current))
            current = [word]
        else:
            # Also taken for the first word of a line, so a single over-wide
            # word sits alone instead of being preceded by an empty line.
            current.append(word)
    if current:
        lines.append(" ".join(current))
    return lines


def fit_text_to_box(text, merged_coordinates, font_path, angle=0):
    """
    Render text at the largest font size that fits inside the given box.

    Args:
        text (str): The text to fit; it is word-wrapped on whitespace.
        merged_coordinates: Four [x, y] corners ordered top-left, top-right,
            bottom-right, bottom-left; only the box width and height are used.
        font_path (str): Path to the TrueType font file to be used.
        angle (float): Counter-clockwise rotation in degrees applied to the
            rendered image (default 0, i.e. no rotation).
    Returns:
        PIL.Image: A transparent RGBA image of the box size (expanded if
        rotated) with the text drawn centred in black.
    """
    # `font_path` must come before the defaulted `angle`; the previous order
    # was a SyntaxError and did not match the positional call site.
    # Plain, positive ints: PIL needs a valid size even for degenerate boxes.
    width = max(1, int(merged_coordinates[1][0] - merged_coordinates[0][0]))
    height = max(1, int(merged_coordinates[2][1] - merged_coordinates[1][1]))

    # Transparent canvas; its drawing context is also used for measuring.
    image = Image.new('RGBA', (width, height), (255, 255, 255, 0))
    draw = ImageDraw.Draw(image)

    words = str(text).split()
    if not words:
        # Nothing to draw. Without this guard the size search below would
        # never terminate, because an empty layout always "fits".
        return image.rotate(angle, expand=True)

    # Grow the font one point at a time until the wrapped text overflows.
    font_size = 1
    best = None
    while True:
        font = ImageFont.truetype(font_path, font_size)
        lines = _wrap_words_to_width(words, font, draw, width)
        line_height = font.getbbox('A')[3] + 5  # Add line spacing
        fits = (
            len(lines) * line_height <= height
            and all(draw.textlength(line, font=font) <= width for line in lines)
        )
        if not fits:
            break
        best = (font, lines, line_height)
        font_size += 1

    if best is None:
        # Even size 1 overflows; draw at size 1 rather than requesting size 0.
        best = (font, lines, line_height)
    font, lines, line_height = best

    # Center the text vertically and horizontally
    y_offset = (height - len(lines) * line_height) // 2
    for line in lines:
        x_offset = (width - draw.textlength(line, font=font)) // 2
        draw.text((x_offset, y_offset), line, font=font, fill="black")
        y_offset += line_height

    return image.rotate(angle, expand=True)
def shorten_box(merged_coordinates, pct=0):
    """Shrink a rectangular box around its centre by ``pct`` percent.

    Args:
        merged_coordinates: four [x, y] corners ordered top-left, top-right,
            bottom-right, bottom-left.
        pct: percentage by which width and height are reduced (0 keeps the
            size unchanged).

    Returns:
        A 4x2 integer numpy array of the new corners in the same order
        (fractional coordinates are truncated by the integer conversion).
    """
    # Calculate the center of the box
    center_x = (merged_coordinates[0][0] + merged_coordinates[2][0]) / 2
    center_y = (merged_coordinates[0][1] + merged_coordinates[2][1]) / 2
    # Calculate the width and height of the box
    width = merged_coordinates[1][0] - merged_coordinates[0][0]
    height = merged_coordinates[2][1] - merged_coordinates[1][1]
    # Scale both sides by the same factor. The parentheses matter:
    # `width * 1-pct/100.` subtracted a constant instead of scaling.
    scale = 1 - pct / 100.
    half_width = width * scale / 2
    half_height = height * scale / 2
    return np.array([
        [center_x - half_width, center_y - half_height],  # Top-left
        [center_x + half_width, center_y - half_height],  # Top-right
        [center_x + half_width, center_y + half_height],  # Bottom-right
        [center_x - half_width, center_y + half_height],  # Bottom-left
    ], dtype=int)
def detect_and_show_text(reader, image, bubble_conf=0.7):
    """Detect text and speech bubbles and build the visualisation images.

    Args:
        reader: OCR reader exposing ``readtext(image, text_threshold=...)``.
        image: PIL image or numpy array.
        bubble_conf: confidence threshold passed to the bubble detector.
            NOTE(review): this used to be the literal ``7``, which is outside
            the 0-1 range a confidence threshold takes; 0.7 is assumed to be
            the intended value - confirm.

    Returns:
        ``(img_boxes_pil, bubbles_img_pil, img_removed_pil, merged_results,
        mask_pil)``: the image with detection outlines, the detector's own
        plot, the image with bubble text blanked, the merged boxes with their
        text, and the text mask.
    """
    if isinstance(image, Image.Image):
        img_array = np.array(image)
    else:
        img_array = image
    # Get YOLO results first
    yolo_results = bubble_detection_model(img_array, conf=bubble_conf)[0]
    yolo_boxes = yolo_results.boxes.xywh.cpu().numpy()  # Get YOLO boxes in xywh format
    # Detect text
    results = reader.readtext(img_array, text_threshold=0.6)
    # Create visualization
    img_boxes = img_array.copy()
    # Swap the channel order of 3-channel images before drawing; the swap is
    # undone after drawing (see below).
    if len(img_array.shape) == 3:
        if img_array.shape[2] == 3:  # If it's a 3-channel image
            img_boxes = cv2.cvtColor(img_boxes, cv2.COLOR_BGR2RGB)
    # Draw original EasyOCR boxes on img_boxes
    for result in results:
        pts = np.array(result[0], dtype=np.int32)
        cv2.polylines(img_boxes, [pts], isClosed=True, color=(0, 255, 0), thickness=2)  # Draw original boxes in green
    # Remove text and merge boxes for visualization
    img_removed, mask = remove_text_regions(img_array, results, yolo_boxes)
    merged_results = merge_boxes(results, img_array.shape)
    # Draw merged detection boxes and text (if needed)
    for merged_result in merged_results:
        pts = np.array(merged_result['box'], dtype=np.int32)
        # Color the box red if inside bubble, blue if outside
        color = (0, 0, 255) if is_box_inside_yolo(merged_result['box'], yolo_boxes) else (255, 0, 0)
        cv2.polylines(img_boxes, [pts], True, color, 2)  # Draw merged boxes in red or blue
    # Swap channels back. remove_text_regions already swapped its image once,
    # so this second swap restores the input's channel order there too.
    img_boxes_rgb = cv2.cvtColor(img_boxes, cv2.COLOR_BGR2RGB)
    img_removed_rgb = cv2.cvtColor(img_removed, cv2.COLOR_BGR2RGB)
    mask_rgba = cv2.cvtColor(mask, cv2.COLOR_RGB2RGBA)
    # Get YOLO visualization without labels
    bubbles_img = yolo_results.plot(labels=False)
    # Convert to PIL Images
    img_boxes_pil = Image.fromarray(img_boxes_rgb)
    img_removed_pil = Image.fromarray(img_removed_rgb)
    bubbles_img_pil = Image.fromarray(bubbles_img)
    mask_pil = Image.fromarray(mask_rgba)
    return img_boxes_pil, bubbles_img_pil, img_removed_pil, merged_results, mask_pil
def position_text_back(text, merged_coordinates, inpainted_image, font_path):
    """Render ``text`` into its box and composite it onto the image.

    Args:
        text: the string to draw.
        merged_coordinates: four [x, y] corners of the target box.
        inpainted_image: PIL image to draw on; it is not modified.
        font_path: path of the TrueType font used for rendering.

    Returns:
        A new RGBA PIL image with the text blended in.
    """
    coords = shorten_box(merged_coordinates)
    # Pass the font by keyword so it can never land in the `angle` slot,
    # whatever the order of fit_text_to_box's trailing parameters.
    text_image = fit_text_to_box(text, coords, font_path=font_path)
    # Plain ints for the paste offset (coords holds numpy integers).
    top_left = (int(coords[0][0]), int(coords[0][1]))
    # Paste the text onto a fully transparent layer at the box position,
    # using the text image's own alpha channel as the paste mask.
    layer = Image.new("RGBA", inpainted_image.size, (0, 0, 0, 0))
    layer.paste(text_image, top_left, mask=text_image)
    # alpha_composite needs both images in RGBA; the layer already is.
    if inpainted_image.mode != "RGBA":
        inpainted_image = inpainted_image.convert("RGBA")
    return Image.alpha_composite(inpainted_image, layer)
@spaces.GPU()
def process(image, mask,
            prompt="background",
            negative_prompt="text",
            controlnet_conditioning_scale=0.9,
            guidance_scale=3.5,
            seed=124,
            num_inference_steps=10,
            true_guidance_scale=3.5
            ):
    """Inpaint the masked regions of ``image`` with the module-level pipeline.

    ``image`` is a numpy array and ``mask`` a PIL image. Both are converted to
    RGB and resized to 768x768 for inference; the generated image is resized
    back to the input image's original size before being returned.
    """
    target_size = (768, 768)
    target_width, target_height = target_size

    source = Image.fromarray(image)
    original_size = source.size[:2]

    control_image = source.convert("RGB").resize(target_size)
    control_mask = mask.convert("RGB").resize(target_size)

    # Seeded generator so the same inputs give a reproducible sampling run.
    rng = torch.Generator(device="cuda").manual_seed(seed)

    output = pipe(
        prompt=prompt,
        height=target_height,
        width=target_width,
        control_image=control_image,
        control_mask=control_mask,
        num_inference_steps=num_inference_steps,
        generator=rng,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        guidance_scale=guidance_scale,
        negative_prompt=negative_prompt,
        true_guidance_scale=true_guidance_scale
    )
    inpainted = output.images[0]
    return inpainted.resize(original_size)
@spaces.GPU()
def process_image(image, source_language, target_language, mode, font):
    """Main processing function for Gradio.

    Args:
        image: input image as a numpy array, or None.
        source_language: UI name of the source language (selects the OCR model).
        target_language: language to translate into.
        mode: "Basic" reuses the image with bubble text painted white; any
            other value runs the inpainting pipeline.
        font: font name; the file ``fonts/<font>.ttf`` is used for rendering.

    Returns:
        A 6-tuple ``(img_with_boxes, img_bubbles, img_inpainted, final_result,
        translations, mask_array)``. When ``image`` is None the tuple is
        ``(None, None, None, None, [], None)``.
    """
    if image is None:
        # Same arity as the normal return so callers can always unpack six
        # values (this path used to return only four).
        return None, None, None, None, [], None
    # Initialize reader (equivalent to what handle_localization did)
    easy_ocr_lan = language_to_ocr.get(source_language, 'en')
    reader = easyocr.Reader([easy_ocr_lan], model_storage_directory='.', gpu=False)
    # Detect text and get results
    img_with_boxes, img_bubbles, img_removed_text, merged_results, mask = detect_and_show_text(reader, image)
    if mode == "Basic":
        img_inpainted = img_removed_text
    else:
        img_inpainted = process(image, mask)
    # Get translations
    translations = localize_boxes(
        merged_results,
        img_with_boxes,
        source_language,
        target_language
    )
    # Create initial result with translations
    font_path = f"fonts/{font}.ttf"
    final_result = img_inpainted.copy()
    for translation in translations:
        final_result = position_text_back(
            translation['text'], translation['box'], final_result, font_path=font_path
        )
    # Return all results directly (no need to store in session state)
    return img_with_boxes, img_bubbles, img_inpainted, final_result, translations, np.array(mask)
def update_translations(image, edited_texts, translations_list, img_removed_text, font):
    """Update the image with edited translations.

    Args:
        image: the uploaded image; only checked for None.
        edited_texts: edited translations, one per line, in the same order as
            ``translations_list``.
        translations_list: list of ``{'box', 'text'}`` dictionaries; each
            entry's 'text' is updated in place with the edited line.
        img_removed_text: numpy array of the text-free image to draw on.
        font: font name; the file ``fonts/<font>.ttf`` is used for rendering.

    Returns:
        The re-rendered image as a numpy array, or None when either image is
        missing.
    """
    if image is None or img_removed_text is None:
        return None
    # Convert numpy array back to PIL Image
    final_result = Image.fromarray(img_removed_text).copy()
    font_path = f"fonts/{font}.ttf"
    # Extra lines or extra boxes are ignored: zip stops at the shorter input.
    for trans, new_text in zip(translations_list, (edited_texts or "").split('\n')):
        cleaned = new_text.strip()
        trans['text'] = cleaned
        if not cleaned:
            # A blank line means "draw nothing in this box".
            continue
        # Draw the same stripped text that is stored in the state.
        final_result = position_text_back(cleaned, trans['box'], final_result, font_path=font_path)
    return np.array(final_result)
# Choices shared by the dropdowns below. The font names are later expanded to
# "fonts/<name>.ttf" by the processing functions.
source_language_choices = ['Simplified Chinese', 'Traditional Chinese', 'Korean', 'Japanese', 'English']
target_language_choices = ['English', 'Spanish', 'Chinese', 'Korean', 'French', 'Japanese']
font_choices = ['Arial', 'Ldfcomicsansbold', 'Times New Roman', 'georgia', 'calibri', 'Verdana', 'omniscript_bold', 'helvetica']  # Add more fonts as needed

with gr.Blocks(title="Nativ - Demo") as demo:
    # Per-session list of {'box', 'text'} translations, reused when editing.
    translations_state = gr.State([])
    gr.Markdown("# Nativ - Demo")

    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            input_image = gr.Image(type="numpy", label="Upload Image")
            source_language = gr.Dropdown(
                choices=list(source_language_choices),
                value='Simplified Chinese',
                label="Source Language",
            )
            target_language = gr.Dropdown(
                choices=list(target_language_choices),
                value='English',
                label="Target Language",
            )
            # "Basic" vs. "Advanced" selects how removed text is filled in.
            localization_mode = gr.Radio(
                choices=["Basic", "Advanced"],
                value="Basic",
                label="Localization Mode",
            )
            font_selector_i = gr.Dropdown(
                choices=list(font_choices),
                value='omniscript_bold',
                label="Select Font",
            )
            process_btn = gr.Button("Localize")

        # Right column: read-only outputs.
        with gr.Column():
            speech_bubbles = gr.Image(type="numpy", label="Detected Speech Bubbles", interactive=False)
            detected_boxes = gr.Image(type="numpy", label="Detected Text Regions", interactive=False)
            removed_text = gr.Image(type="numpy", label="Removed Text", interactive=False)
            final_output = gr.Image(type="numpy", label="Final Result", interactive=False)

    # Translation editing section
    with gr.Row():
        translations_text = gr.Textbox(
            label="Edit Translations (one per line)",
            lines=5,
            placeholder="Edit translations here...",
        )
        font_selector_f = gr.Dropdown(
            choices=list(font_choices),
            value='Arial',
            label="Select Font",
        )
        update_btn = gr.Button("Apply Changes")

    def process_and_show_translations(image, source_lang, target_lang, mode, font):
        """Run the full pipeline and expose the translations as editable text."""
        outputs = process_image(image, source_lang, target_lang, mode, font)
        # The mask (last item) is not shown in the UI.
        boxes_img, bubbles_img, removed_img, final_img, translated, _mask = outputs
        joined_texts = '\n'.join(entry['text'] for entry in translated)
        return boxes_img, bubbles_img, removed_img, final_img, joined_texts, translated

    # "Localize": run detection, text removal and translation.
    process_btn.click(
        fn=process_and_show_translations,
        inputs=[input_image, source_language, target_language, localization_mode, font_selector_i],
        outputs=[detected_boxes, speech_bubbles, removed_text, final_output, translations_text, translations_state],
    )

    # "Apply Changes": redraw the final image from the edited translations.
    update_btn.click(
        fn=update_translations,
        inputs=[input_image, translations_text, translations_state, removed_text, font_selector_f],
        outputs=final_output,
    )

demo.launch(debug=False, show_error=True, share=True)