not-lain committed
Commit 5919468 · Parent: ffa1375

remove sam2
Files changed (2):
  1. app.py (+101 -54)
  2. requirements.txt (+1 -1)
app.py CHANGED
@@ -3,11 +3,11 @@ import spaces
 import torch
 from loadimg import load_img
 from torchvision import transforms
-from transformers import AutoModelForImageSegmentation
+from transformers import AutoModelForImageSegmentation, pipeline
 from diffusers import FluxFillPipeline
 from PIL import Image, ImageOps
 
-from sam2.sam2_image_predictor import SAM2ImagePredictor
+# from sam2.sam2_image_predictor import SAM2ImagePredictor
 import numpy as np
 from simple_lama_inpainting import SimpleLama
 from contextlib import contextmanager
@@ -134,36 +134,36 @@ def rmbg(image=None, url=None):
     return image
 
 
-def mask_generation(image=None, d=None):
-    # use bfloat16 for the entire notebook
-    # torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
-    # # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
-    # if torch.cuda.get_device_properties(0).major >= 8:
-    #     torch.backends.cuda.matmul.allow_tf32 = True
-    #     torch.backends.cudnn.allow_tf32 = True
-    d = eval(d)  # convert this to dictionary
-    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
-        predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2.1-hiera-large")
-        predictor.set_image(image)
-        input_point = np.array(d["input_points"])
-        input_label = np.array(d["input_labels"])
-        masks, scores, logits = predictor.predict(
-            point_coords=input_point,
-            point_labels=input_label,
-            multimask_output=True,
-        )
-        sorted_ind = np.argsort(scores)[::-1]
-        masks = masks[sorted_ind]
-        scores = scores[sorted_ind]
-        logits = logits[sorted_ind]
-
-        out = []
-        for i in range(len(masks)):
-            m = Image.fromarray(masks[i] * 255).convert("L")
-            comp = Image.composite(image, m, m)
-            out.append((comp, f"image {i}"))
-
-        return out
+# def mask_generation(image=None, d=None):
+#     # use bfloat16 for the entire notebook
+#     # torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
+#     # # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
+#     # if torch.cuda.get_device_properties(0).major >= 8:
+#     #     torch.backends.cuda.matmul.allow_tf32 = True
+#     #     torch.backends.cudnn.allow_tf32 = True
+#     d = eval(d)  # convert this to dictionary
+#     with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+#         predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2.1-hiera-large")
+#         predictor.set_image(image)
+#         input_point = np.array(d["input_points"])
+#         input_label = np.array(d["input_labels"])
+#         masks, scores, logits = predictor.predict(
+#             point_coords=input_point,
+#             point_labels=input_label,
+#             multimask_output=True,
+#         )
+#         sorted_ind = np.argsort(scores)[::-1]
+#         masks = masks[sorted_ind]
+#         scores = scores[sorted_ind]
+#         logits = logits[sorted_ind]
+
+#         out = []
+#         for i in range(len(masks)):
+#             m = Image.fromarray(masks[i] * 255).convert("L")
+#             comp = Image.composite(image, m, m)
+#             out.append((comp, f"image {i}"))
+
+#         return out
 
 
 def erase(image=None, mask=None):
@@ -173,6 +173,25 @@ def erase(image=None, mask=None):
     return simple_lama(image, mask)
 
 
+# Initialize Whisper model
+whisper = pipeline(
+    task="automatic-speech-recognition",
+    model="openai/whisper-large-v3",
+    chunk_length_s=30,
+    device="cuda" if torch.cuda.is_available() else "cpu",
+)
+
+
+def transcribe(audio, task="transcribe"):
+    if audio is None:
+        raise gr.Error("No audio file submitted!")
+
+    text = whisper(
+        audio, batch_size=8, generate_kwargs={"task": task}, return_timestamps=True
+    )["text"]
+    return text
+
+
 @spaces.GPU(duration=120)
 def main(*args):
     api_num = args[0]
@@ -183,10 +202,12 @@ def main(*args):
         return outpaint(*args)
     elif api_num == 3:
         return inpaint(*args)
-    elif api_num == 4:
-        return mask_generation(*args)
+    # elif api_num == 4:
+    #     return mask_generation(*args)
     elif api_num == 5:
        return erase(*args)
+    elif api_num == 6:
+        return transcribe(*args)
 
 
 rmbg_tab = gr.Interface(
@@ -241,24 +262,24 @@ inpaint_tab = gr.Interface(
 )
 
 
-sam2_tab = gr.Interface(
-    main,
-    inputs=[
-        gr.Number(4, interactive=False),
-        gr.Image(type="pil"),
-        gr.Text(),
-    ],
-    outputs=gr.Gallery(),
-    examples=[
-        [
-            4,
-            "./assets/truck.jpg",
-            '{"input_points": [[500, 375], [1125, 625]], "input_labels": [1, 0]}',
-        ]
-    ],
-    api_name="sam2",
-    cache_examples=False,
-)
+# sam2_tab = gr.Interface(
+#     main,
+#     inputs=[
+#         gr.Number(4, interactive=False),
+#         gr.Image(type="pil"),
+#         gr.Text(),
+#     ],
+#     outputs=gr.Gallery(),
+#     examples=[
+#         [
+#             4,
+#             "./assets/truck.jpg",
+#             '{"input_points": [[500, 375], [1125, 625]], "input_labels": [1, 0]}',
+#         ]
+#     ],
+#     api_name="sam2",
+#     cache_examples=False,
+# )
 
 erase_tab = gr.Interface(
     main,
@@ -279,9 +300,35 @@ erase_tab = gr.Interface(
     cache_examples=False,
 )
 
+transcribe_tab = gr.Interface(
+    fn=main,
+    inputs=[
+        gr.Number(6, interactive=False),
+        gr.Audio(source="upload", type="filepath"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+    ],
+    outputs="text",
+    api_name="transcribe",
+    description="Upload an audio file to extract text using Whisper Large V3",
+)
+
 demo = gr.TabbedInterface(
-    [rmbg_tab, outpaint_tab, inpaint_tab, sam2_tab, erase_tab],
-    ["remove background", "outpainting", "inpainting", "sam2", "erase"],
+    [
+        rmbg_tab,
+        outpaint_tab,
+        inpaint_tab,
+        # sam2_tab,
+        erase_tab,
+        transcribe_tab,
+    ],
+    [
+        "remove background",
+        "outpainting",
+        "inpainting",
+        # "sam2",
+        "erase",
+        "transcribe",
+    ],
     title="Utilities that require GPU",
 )
 
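With this commit the Space exposes a new /transcribe route: api_num 6 is dispatched through main() to transcribe(). A minimal client-side sketch, assuming the gradio_client package and a placeholder Space id (the real id is not part of this diff):

# Hedged sketch, not part of the commit: call the new endpoint remotely.
from gradio_client import Client

client = Client("user/space-name")  # placeholder Space id, substitute the real one
result = client.predict(
    6,               # api_num that main() routes to transcribe()
    "sample.mp3",    # path to a local audio file
    "transcribe",    # task: "transcribe" or "translate"
    api_name="/transcribe",
)
print(result)

Depending on the gradio_client version, the audio argument may need to be wrapped with gradio_client.handle_file().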
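Aside: the commented-out mask_generation still parses its point prompt with eval(). Since the old sam2 example payload is plain JSON, json.loads would be the safer drop-in if the tab is ever restored; a quick check against the example string from the removed interface:

import json

# The example prompt shipped with the old sam2 tab is valid JSON.
d = json.loads('{"input_points": [[500, 375], [1125, 625]], "input_labels": [1, 0]}')
assert d["input_points"][0] == [500, 375] and d["input_labels"] == [1, 0]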
requirements.txt CHANGED
@@ -19,6 +19,6 @@ kornia
 huggingface_hub
 sentencepiece
 einops
-git+https://github.com/facebookresearch/sam2.git
+# git+https://github.com/facebookresearch/sam2.git
 matplotlib
 simple-lama-inpainting
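
The Whisper addition brings no new requirement here: it rides on transformers, which app.py already imports, so only the sam2 VCS dependency is dropped. A hedged smoke test that the trimmed environment still covers the new code path (downloads the model weights on first run):

# Sketch only: confirm the ASR pipeline constructs without the sam2 package installed.
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
print(asr.model.config.model_type)  # expected: "whisper"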