Hantr committed on
Commit
c4854be
·
1 Parent(s): c421198
Files changed (1) hide show
  1. app.py +13 -4
app.py CHANGED
@@ -5,7 +5,7 @@ import matplotlib.pyplot as plt
5
  import numpy as np
6
  from PIL import Image
7
  import tensorflow as tf
8
- from transformers import SegformerFeatureExtractor, TFSegformerForSemanticSegmentation, BigBirdForImageCaptioning
9
 
10
  feature_extractor = SegformerFeatureExtractor.from_pretrained(
11
  "nvidia/segformer-b2-finetuned-cityscapes-1024-1024"
@@ -14,7 +14,7 @@ seg_model = TFSegformerForSemanticSegmentation.from_pretrained(
14
  "nvidia/segformer-b2-finetuned-cityscapes-1024-1024"
15
  )
16
 
17
- caption_model = BigBirdForImageCaptioning.from_pretrained("bigbird/image-captioning-base")
18
 
19
  def ade_palette():
20
  """ADE20K palette that maps each class to RGB values."""
@@ -108,20 +108,29 @@ def sepia(input_img):
108
 
109
  def segment_and_caption(input_img):
110
  input_img = Image.fromarray(input_img)
 
 
111
  inputs = feature_extractor(images=input_img, return_tensors="tf")
112
  outputs = seg_model(**inputs)
113
  logits = outputs.logits
 
114
  logits = tf.transpose(logits, [0, 2, 3, 1])
115
  logits = tf.image.resize(
116
  logits, input_img.size[::-1]
117
  )
118
  seg = tf.math.argmax(logits, axis=-1)[0]
 
 
119
  seg_text = ""
120
  for label, label_name in enumerate(labels_list):
121
  count = np.sum(seg.numpy() == label)
122
  seg_text += f"{label_name}: {count} pixels\n"
123
- caption = caption_model.generate(input_img, max_length=20, num_return_sequences=1, return_dict_in_generate=True)
124
- caption_text = caption[0]['text']
 
 
 
 
125
  return input_img, seg_text, caption_text
126
 
127
 
 
5
  import numpy as np
6
  from PIL import Image
7
  import tensorflow as tf
8
+ from transformers import SegformerFeatureExtractor, TFSegformerForSemanticSegmentation, AutoFeatureExtractor, AutoModelForImageCaptioning
9
 
10
  feature_extractor = SegformerFeatureExtractor.from_pretrained(
11
  "nvidia/segformer-b2-finetuned-cityscapes-1024-1024"
 
14
  "nvidia/segformer-b2-finetuned-cityscapes-1024-1024"
15
  )
16
 
17
+ caption_model = AutoModelForImageCaptioning.from_pretrained("facebook/deit-base-cc-turbo")
18
 
19
  def ade_palette():
20
  """ADE20K palette that maps each class to RGB values."""
 
108
 
109
  def segment_and_caption(input_img):
110
  input_img = Image.fromarray(input_img)
111
+
112
+ # 세그멘테이션 수행
113
  inputs = feature_extractor(images=input_img, return_tensors="tf")
114
  outputs = seg_model(**inputs)
115
  logits = outputs.logits
116
+
117
  logits = tf.transpose(logits, [0, 2, 3, 1])
118
  logits = tf.image.resize(
119
  logits, input_img.size[::-1]
120
  )
121
  seg = tf.math.argmax(logits, axis=-1)[0]
122
+
123
+ # 세그멘테이션 결과를 텍스트로 변환
124
  seg_text = ""
125
  for label, label_name in enumerate(labels_list):
126
  count = np.sum(seg.numpy() == label)
127
  seg_text += f"{label_name}: {count} pixels\n"
128
+
129
+ # 이미지 캡션 생성
130
+ caption_input = caption_model.generate(input_img, max_length=20, num_return_sequences=1)
131
+ caption_text = caption_input[0]['text']
132
+
133
+ # 세그멘테이션 결과와 캡션을 반환
134
  return input_img, seg_text, caption_text
135
 
136