Futuretop committed (verified)
Commit f92c4cd · 1 Parent(s): aa79ca0

Update app.py

Files changed (1)
  1. app.py +89 -150
app.py CHANGED
@@ -1,154 +1,93 @@
- from PIL import Image
- from transformers import BlipProcessor, BlipForConditionalGeneration
- import numpy as np
- import cv2
- from deepface import DeepFace
  import gradio as gr
-
- # Load BLIP model
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
- model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-
- # Clothing extractor
- def extract_clothing(text):
-     colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
-     patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
-     items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
-              'suit', 'sneakers', 'hat', 'scarf', 'uniform']
-
-     found_colors = [c for c in colors if c in text.lower()]
-     found_patterns = [p for p in patterns if p in text.lower()]
-     found_items = [i for i in items if i in text.lower()]
-
-     return found_colors, found_patterns, found_items
-
- # Main function
- def analyze_image(image_pil):
-     image_pil = image_pil.convert("RGB")
-     image_np = np.array(image_pil)
-
-     # Caption generation
-     inputs = processor(image_pil, return_tensors="pt")
-     out = model.generate(**inputs)
-     caption = processor.decode(out[0], skip_special_tokens=True)
-
-     # Convert to BGR for DeepFace
-     image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
-
-     # Face detection using DeepFace with RetinaFace backend
-     try:
-         faces = DeepFace.extract_faces(img_path=image_bgr, detector_backend="retinaface", enforce_detection=False)
-         print(f"DeepFace detected {len(faces)} face(s)")
-     except Exception as e:
-         print("DeepFace error:", e)
-         faces = []
-
-     face_infos = []
-     for face_data in faces:
-         face_crop = face_data["face"]
-         try:
-             analysis = DeepFace.analyze(face_crop, actions=['age', 'gender', 'emotion'], enforce_detection=False)
-             age = analysis[0]['age']
-             gender = analysis[0]['gender']
-             emotion = analysis[0]['dominant_emotion']
-
-             if age < 13:
-                 age_group = "child"
-             elif age < 20:
-                 age_group = "teen"
-             elif age < 60:
-                 age_group = "adult"
-             else:
-                 age_group = "senior"
-
-             face_infos.append({
-                 "age": age,
-                 "gender": gender,
-                 "age_group": age_group,
-                 "emotion": emotion
-             })
-         except Exception:
-             continue
-
-     # Summary stats
-     num_faces = len(face_infos)
-     gender_counts = {"Man": 0, "Woman": 0}
-     age_summary = {}
-     emotion_summary = {}
-
-     for face in face_infos:
-         gender = face['gender']
-         age_group = face['age_group']
-         emotion = face['emotion']
-
-         gender_counts[gender] += 1
-         age_summary[age_group] = age_summary.get(age_group, 0) + 1
-         emotion_summary[emotion] = emotion_summary.get(emotion, 0) + 1
-
-     # Clothing info from caption
-     colors, patterns, items = extract_clothing(caption)
-
-     # Generate 15 sentences
-     sentences = []
-     sentences.append(f"According to the BLIP model, the scene can be described as: \"{caption}\".")
-     sentences.append(f"The image contains {num_faces} visible face(s) detected using DeepFace (RetinaFace backend).")
-
-     gender_desc = []
-     if gender_counts["Man"] > 0:
-         gender_desc.append(f"{gender_counts['Man']} male(s)")
-     if gender_counts["Woman"] > 0:
-         gender_desc.append(f"{gender_counts['Woman']} female(s)")
-     if gender_desc:
-         sentences.append("Gender distribution shows " + " and ".join(gender_desc) + ".")
-     else:
-         sentences.append("Gender analysis was inconclusive.")
-
-     if age_summary:
-         age_list = [f"{count} {group}(s)" for group, count in age_summary.items()]
-         sentences.append("Age groups represented include " + ", ".join(age_list) + ".")
-     else:
-         sentences.append("No conclusive age groupings found.")
-
-     if emotion_summary:
-         emo_list = [f"{count} showing {emo}" for emo, count in emotion_summary.items()]
-         sentences.append("Facial expressions include " + ", ".join(emo_list) + ".")
-     else:
-         sentences.append("Emotion detection yielded limited results.")
-
-     if colors or patterns or items:
-         cloth_parts = []
-         if colors:
-             cloth_parts.append(f"colors like {', '.join(colors)}")
-         if patterns:
-             cloth_parts.append(f"patterns such as {', '.join(patterns)}")
-         if items:
-             cloth_parts.append(f"items like {', '.join(items)}")
-         sentences.append("The clothing observed includes " + " and ".join(cloth_parts) + ".")
-     else:
-         sentences.append("Clothing details were not clearly identified.")
-
-     if num_faces > 0:
-         sentences.append("Faces are distributed naturally across the image.")
-         sentences.append("Differences in face size suggest variation in distance from the camera.")
-         sentences.append("Hairstyles appear diverse, from short to tied-back styles.")
-         sentences.append("Lighting emphasizes certain facial features and expressions.")
-         sentences.append("Some individuals face the camera while others look away.")
-         sentences.append("Mood diversity is reflected in the variety of facial expressions.")
-         sentences.append("The clothing style appears casual or semi-formal.")
-     else:
-         sentences.append("No visible faces were found to analyze further visual characteristics.")
-
-     sentences.append("Overall, the image integrates facial, emotional, and clothing features into a cohesive scene.")
-
-     return "\n".join([f"{i+1}. {s}" for i, s in enumerate(sentences)])
-
- # Gradio Interface
- demo = gr.Interface(
-     fn=analyze_image,
-     inputs=gr.Image(type="pil"),
-     outputs=gr.Textbox(label="📝 15-Sentence Detailed Description"),
-     title="🖼️ Image Analysis with BLIP + DeepFace",
-     description="Upload an image to get a detailed 15-sentence description of facial features, age, gender, clothing, and more."
  )

  demo.launch()
  import gradio as gr
+ import torch
+ import random
+ from PIL import Image
+ from transformers import AutoProcessor, AutoModelForCausalLM
+ from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler
+
+ # Load Florence-2 (image captioning model)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ florence_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True).to(device).eval()
+ florence_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True)
+
+ # Load Stable Diffusion 3.5 Large TurboX
+ model_repo = "tensorart/stable-diffusion-3.5-large-TurboX"
+ pipe = DiffusionPipeline.from_pretrained(
+     model_repo,
+     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
  )
+ pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(model_repo, subfolder="scheduler", shift=5)
+ pipe = pipe.to(device)
+
+ MAX_SEED = 2**31 - 1
+
+ def pseudo_translate_to_korean_style(en_prompt: str) -> str:
+     # Wrap the English caption in a Korean style instruction (no actual translation).
+     # Roughly: "This scene is {en_prompt}. Please draw it in a bright, cute cartoon
+     # style, like a digital illustration."
+     return f"이 장면은 {en_prompt} 장면입니다. 밝고 귀여운 카툰 스타일로 그려주세요. 디지털 일러스트 느낌으로 묘사해 주세요."
+
+ def generate_prompt(image):
+     """Image -> English caption -> Korean-style prompt."""
+     if not isinstance(image, Image.Image):
+         image = Image.fromarray(image)
+
+     inputs = florence_processor(text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
+     generated_ids = florence_model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=512,
+         num_beams=3
+     )
+     generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+     parsed_answer = florence_processor.post_process_generation(
+         generated_text,
+         task="<MORE_DETAILED_CAPTION>",
+         image_size=(image.width, image.height)
+     )
+     prompt_en = parsed_answer["<MORE_DETAILED_CAPTION>"]
+
+     # Apply the cartoon style wrapper (no translator involved)
+     cartoon_prompt = pseudo_translate_to_korean_style(prompt_en)
+     return cartoon_prompt
+
+ def generate_image(prompt, seed=42, randomize_seed=False):
+     """Text prompt -> generated image."""
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+     generator = torch.Generator().manual_seed(seed)
+     image = pipe(
+         prompt=prompt,
+         negative_prompt="왜곡된 손, 흐림, 이상한 얼굴",  # "distorted hands, blur, strange faces"
+         guidance_scale=1.5,
+         num_inference_steps=8,
+         width=768,
+         height=768,
+         generator=generator
+     ).images[0]
+     return image, seed
+
+ # Build the Gradio UI (labels and instructions are shown in Korean)
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🖼 이미지 → 설명 생성 → 카툰 이미지 자동 생성기")  # "Image -> caption -> automatic cartoon image generator"
+
+     gr.Markdown("**📌 사용법 안내 (한국어)**\n"  # Usage guide (Korean):
+                 "- 왼쪽에 이미지를 업로드하세요.\n"  # upload an image on the left;
+                 "- AI가 영어 설명을 만들고, 내부에서 한국어 스타일 프롬프트로 재구성합니다.\n"  # the AI writes an English caption and rewraps it as a Korean-style prompt;
+                 "- 오른쪽에 결과 이미지가 생성됩니다.")  # the resulting image appears on the right.
+
+     with gr.Row():
+         with gr.Column():
+             input_img = gr.Image(label="🎨 원본 이미지 업로드")  # "Upload the source image"
+             run_button = gr.Button("✨ 생성 시작")  # "Start generation"
+
+         with gr.Column():
+             prompt_out = gr.Textbox(label="📝 스타일 적용된 프롬프트", lines=3, show_copy_button=True)  # "Styled prompt"
+             output_img = gr.Image(label="🎉 생성된 이미지")  # "Generated image"
+
+     def full_process(img):
+         prompt = generate_prompt(img)
+         image, seed = generate_image(prompt, randomize_seed=True)
+         return prompt, image
+
+     run_button.click(fn=full_process, inputs=[input_img], outputs=[prompt_out, output_img])

  demo.launch()
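
For reviewers who want to sanity-check the new pipeline without launching the Gradio UI, the sketch below mirrors the two steps introduced in this commit (a Florence-2 caption followed by Stable Diffusion 3.5 TurboX generation) as a standalone script. It is not part of the commit; the local file names test.jpg and cartoon.png, the fixed seed, and the smaller 512x512 output are illustrative assumptions, and the same large model downloads still apply.

# Standalone smoke test mirroring app.py's flow (assumptions: test.jpg exists locally,
# output is saved to cartoon.png, 512x512 output, fixed seed 42).
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler

device = "cuda" if torch.cuda.is_available() else "cpu"

# Step 1: caption the input image with Florence-2, as in generate_prompt().
processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True).to(device).eval()

image = Image.open("test.jpg").convert("RGB")  # assumed local test image
inputs = processor(text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
ids = model.generate(input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"],
                     max_new_tokens=512, num_beams=3)
text = processor.batch_decode(ids, skip_special_tokens=False)[0]
caption = processor.post_process_generation(
    text, task="<MORE_DETAILED_CAPTION>", image_size=(image.width, image.height)
)["<MORE_DETAILED_CAPTION>"]
print("Caption:", caption)

# Step 2: wrap the caption in the Korean cartoon-style instruction and generate,
# using the same scheduler and sampling settings as app.py.
repo = "tensorart/stable-diffusion-3.5-large-TurboX"
pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16 if device == "cuda" else torch.float32)
pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(repo, subfolder="scheduler", shift=5)
pipe = pipe.to(device)

prompt = f"이 장면은 {caption} 장면입니다. 밝고 귀여운 카툰 스타일로 그려주세요."
result = pipe(prompt=prompt, guidance_scale=1.5, num_inference_steps=8,
              width=512, height=512, generator=torch.Generator().manual_seed(42)).images[0]
result.save("cartoon.png")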