Futuretop committed (verified)
Commit f92c4cd · 1 Parent(s): aa79ca0

Update app.py

Files changed (1)
  1. app.py +89 -150
app.py CHANGED
@@ -1,154 +1,93 @@
- from PIL import Image
- from transformers import BlipProcessor, BlipForConditionalGeneration
- import numpy as np
- import cv2
- from deepface import DeepFace
  import gradio as gr
-
- # Load BLIP model
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
- model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-
- # Clothing extractor
- def extract_clothing(text):
-     colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
-     patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
-     items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
-              'suit', 'sneakers', 'hat', 'scarf', 'uniform']
-
-     found_colors = [c for c in colors if c in text.lower()]
-     found_patterns = [p for p in patterns if p in text.lower()]
-     found_items = [i for i in items if i in text.lower()]
-
-     return found_colors, found_patterns, found_items
-
- # Main function
- def analyze_image(image_pil):
-     image_pil = image_pil.convert("RGB")
-     image_np = np.array(image_pil)
-
-     # Caption generation
-     inputs = processor(image_pil, return_tensors="pt")
-     out = model.generate(**inputs)
-     caption = processor.decode(out[0], skip_special_tokens=True)
-
-     # Convert to BGR for DeepFace
-     image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
-
-     # Face detection using DeepFace with RetinaFace backend
-     try:
-         faces = DeepFace.extract_faces(img_path=image_bgr, detector_backend="retinaface", enforce_detection=False)
-         print(f"DeepFace detected {len(faces)} face(s)")
-     except Exception as e:
-         print("DeepFace error:", e)
-         faces = []
-
-     face_infos = []
-     for face_data in faces:
-         face_crop = face_data["face"]
-         try:
-             analysis = DeepFace.analyze(face_crop, actions=['age', 'gender', 'emotion'], enforce_detection=False)
-             age = analysis[0]['age']
-             gender = analysis[0]['gender']
-             emotion = analysis[0]['dominant_emotion']
-
-             if age < 13:
-                 age_group = "child"
-             elif age < 20:
-                 age_group = "teen"
-             elif age < 60:
-                 age_group = "adult"
-             else:
-                 age_group = "senior"
-
-             face_infos.append({
-                 "age": age,
-                 "gender": gender,
-                 "age_group": age_group,
-                 "emotion": emotion
-             })
-         except Exception:
-             continue
-
-     # Summary stats
-     num_faces = len(face_infos)
-     gender_counts = {"Man": 0, "Woman": 0}
-     age_summary = {}
-     emotion_summary = {}
-
-     for face in face_infos:
-         gender = face['gender']
-         age_group = face['age_group']
-         emotion = face['emotion']
-
-         gender_counts[gender] += 1
-         age_summary[age_group] = age_summary.get(age_group, 0) + 1
-         emotion_summary[emotion] = emotion_summary.get(emotion, 0) + 1
-
-     # Clothing info from caption
-     colors, patterns, items = extract_clothing(caption)
-
-     # Generate 15 sentences
-     sentences = []
-     sentences.append(f"According to the BLIP model, the scene can be described as: \"{caption}\".")
-     sentences.append(f"The image contains {num_faces} visible face(s) detected using DeepFace (RetinaFace backend).")
-
-     gender_desc = []
-     if gender_counts["Man"] > 0:
-         gender_desc.append(f"{gender_counts['Man']} male(s)")
-     if gender_counts["Woman"] > 0:
-         gender_desc.append(f"{gender_counts['Woman']} female(s)")
-     if gender_desc:
-         sentences.append("Gender distribution shows " + " and ".join(gender_desc) + ".")
-     else:
-         sentences.append("Gender analysis was inconclusive.")
-
-     if age_summary:
-         age_list = [f"{count} {group}(s)" for group, count in age_summary.items()]
-         sentences.append("Age groups represented include " + ", ".join(age_list) + ".")
-     else:
-         sentences.append("No conclusive age groupings found.")
-
-     if emotion_summary:
-         emo_list = [f"{count} showing {emo}" for emo, count in emotion_summary.items()]
-         sentences.append("Facial expressions include " + ", ".join(emo_list) + ".")
-     else:
-         sentences.append("Emotion detection yielded limited results.")
-
-     if colors or patterns or items:
-         cloth_parts = []
-         if colors:
-             cloth_parts.append(f"colors like {', '.join(colors)}")
-         if patterns:
-             cloth_parts.append(f"patterns such as {', '.join(patterns)}")
-         if items:
-             cloth_parts.append(f"items like {', '.join(items)}")
-         sentences.append("The clothing observed includes " + " and ".join(cloth_parts) + ".")
-     else:
-         sentences.append("Clothing details were not clearly identified.")
-
-     if num_faces > 0:
-         sentences.append("Faces are distributed naturally across the image.")
-         sentences.append("Differences in face size suggest variation in distance from the camera.")
-         sentences.append("Hairstyles appear diverse, from short to tied-back styles.")
-         sentences.append("Lighting emphasizes certain facial features and expressions.")
-         sentences.append("Some individuals face the camera while others look away.")
-         sentences.append("Mood diversity is reflected in the variety of facial expressions.")
-         sentences.append("The clothing style appears casual or semi-formal.")
-     else:
-         sentences.append("No visible faces were found to analyze further visual characteristics.")
-
-     sentences.append("Overall, the image integrates facial, emotional, and clothing features into a cohesive scene.")
-
-     return "\n".join([f"{i+1}. {s}" for i, s in enumerate(sentences)])
-
- # Gradio Interface
- demo = gr.Interface(
-     fn=analyze_image,
-     inputs=gr.Image(type="pil"),
-     outputs=gr.Textbox(label="📝 15-Sentence Detailed Description"),
-     title="🖼️ Image Analysis with BLIP + DeepFace",
-     description="Upload an image to get a detailed 15-sentence description of facial features, age, gender, clothing, and more."
  )

  demo.launch()
  import gradio as gr
+ import torch
+ import random
+ from PIL import Image
+ from transformers import AutoProcessor, AutoModelForCausalLM
+ from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler
+
+ # Load Florence-2 (image captioning model)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ florence_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True).to(device).eval()
+ florence_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True)
+
+ # Load Stable Diffusion 3.5 Large TurboX
+ model_repo = "tensorart/stable-diffusion-3.5-large-TurboX"
+ pipe = DiffusionPipeline.from_pretrained(
+     model_repo,
+     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
  )
+ pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(model_repo, subfolder="scheduler", shift=5)
+ pipe = pipe.to(device)
+
+ MAX_SEED = 2**31 - 1
+
+ def pseudo_translate_to_korean_style(en_prompt: str) -> str:
+     # Wrap the English caption in a Korean style instruction (no actual translation).
+     # Roughly: "This scene is {en_prompt}. Please draw it in a bright, cute cartoon
+     # style, like a digital illustration."
+     return f"이 장면은 {en_prompt} 장면입니다. 밝고 귀여운 카툰 스타일로 그려주세요. 디지털 일러스트 느낌으로 묘사해 주세요."
+
+ def generate_prompt(image):
+     """Image -> English caption -> Korean-style prompt."""
+     if not isinstance(image, Image.Image):
+         image = Image.fromarray(image)
+
+     inputs = florence_processor(text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
+     generated_ids = florence_model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=512,
+         num_beams=3
+     )
+     generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+     parsed_answer = florence_processor.post_process_generation(
+         generated_text,
+         task="<MORE_DETAILED_CAPTION>",
+         image_size=(image.width, image.height)
+     )
+     prompt_en = parsed_answer["<MORE_DETAILED_CAPTION>"]
+
+     # Apply the cartoon style wrapper (no translator involved)
+     cartoon_prompt = pseudo_translate_to_korean_style(prompt_en)
+     return cartoon_prompt
+
+ def generate_image(prompt, seed=42, randomize_seed=False):
+     """Text prompt -> generated image."""
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+     generator = torch.Generator().manual_seed(seed)
+     image = pipe(
+         prompt=prompt,
+         negative_prompt="왜곡된 손, 흐림, 이상한 얼굴",  # "distorted hands, blur, strange faces"
+         guidance_scale=1.5,
+         num_inference_steps=8,
+         width=768,
+         height=768,
+         generator=generator
+     ).images[0]
+     return image, seed
+
+ # Build the Gradio UI (labels and instructions are shown in Korean)
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🖼 이미지 → 설명 생성 → 카툰 이미지 자동 생성기")  # "Image -> caption -> automatic cartoon image generator"
+
+     gr.Markdown("**📌 사용법 안내 (한국어)**\n"  # Usage guide (Korean):
+                 "- 왼쪽에 이미지를 업로드하세요.\n"  # upload an image on the left;
+                 "- AI가 영어 설명을 만들고, 내부에서 한국어 스타일 프롬프트로 재구성합니다.\n"  # the AI writes an English caption and rewraps it as a Korean-style prompt;
+                 "- 오른쪽에 결과 이미지가 생성됩니다.")  # the resulting image appears on the right.
+
+     with gr.Row():
+         with gr.Column():
+             input_img = gr.Image(label="🎨 원본 이미지 업로드")  # "Upload the source image"
+             run_button = gr.Button("✨ 생성 시작")  # "Start generation"
+
+         with gr.Column():
+             prompt_out = gr.Textbox(label="📝 스타일 적용된 프롬프트", lines=3, show_copy_button=True)  # "Styled prompt"
+             output_img = gr.Image(label="🎉 생성된 이미지")  # "Generated image"
+
+     def full_process(img):
+         prompt = generate_prompt(img)
+         image, seed = generate_image(prompt, randomize_seed=True)
+         return prompt, image
+
+     run_button.click(fn=full_process, inputs=[input_img], outputs=[prompt_out, output_img])

  demo.launch()
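
For reviewers who want to sanity-check the new pipeline without launching the Gradio UI, the sketch below mirrors the two steps introduced in this commit (a Florence-2 caption followed by Stable Diffusion 3.5 TurboX generation) as a standalone script. It is not part of the commit; the local file names test.jpg and cartoon.png, the fixed seed, and the smaller 512x512 output are illustrative assumptions, and the same large model downloads still apply.

# Standalone smoke test mirroring app.py's flow (assumptions: test.jpg exists locally,
# output is saved to cartoon.png, 512x512 output, fixed seed 42).
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler

device = "cuda" if torch.cuda.is_available() else "cpu"

# Step 1: caption the input image with Florence-2, as in generate_prompt().
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True).to(device).eval()

image = Image.open("test.jpg").convert("RGB")  # assumed local test image
inputs = processor(text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
ids = model.generate(input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"],
                     max_new_tokens=512, num_beams=3)
text = processor.batch_decode(ids, skip_special_tokens=False)[0]
caption = processor.post_process_generation(
    text, task="<MORE_DETAILED_CAPTION>", image_size=(image.width, image.height)
)["<MORE_DETAILED_CAPTION>"]
print("Caption:", caption)

# Step 2: wrap the caption in the Korean cartoon-style instruction and generate,
# using the same scheduler and sampling settings as app.py.
repo = "tensorart/stable-diffusion-3.5-large-TurboX"
pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16 if device == "cuda" else torch.float32)
pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(repo, subfolder="scheduler", shift=5)
pipe = pipe.to(device)

prompt = f"이 장면은 {caption} 장면입니다. 밝고 귀여운 카툰 스타일로 그려주세요."
result = pipe(prompt=prompt, guidance_scale=1.5, num_inference_steps=8,
              width=512, height=512, generator=torch.Generator().manual_seed(42)).images[0]
result.save("cartoon.png")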