vishnun committed on
Commit
3f7bb9f
·
1 Parent(s): 52bf2af

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -55
app.py CHANGED
@@ -1,76 +1,73 @@
1
- import streamlit as st
2
  import numpy as np
3
  from PIL import Image
4
  from transformers import CLIPProcessor, CLIPModel, YolosImageProcessor, YolosForObjectDetection
5
  import torch
6
 
7
- st.title("CLIP & CROP")
8
- # st.markdown("**Extract sections of images from your image by using OpenAI's CLIP and Facebooks Detr implemented on HuggingFace Transformers, if the similarity score is not so much, then please consider the prediction to be void.**")
 
 
 
9
 
10
- # IMAGE_INPUT = st.file_uploader(type=["jpg", "png"], label="Input image")
11
- # TEXT_INPUT = st.text_input(label="Description for section to extracted")
12
- # NUMBER_INPUT = st.number_input(value=0.96, label="Threshold percentage score")
13
 
 
 
14
 
15
- # with st.spinner("Models are loading"):
16
- # feature_extractor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
17
- # dmodel = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')
18
 
19
- # model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
20
- # processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
21
-
22
- # SUBMIT_BUTTON = st.button("SUBMIT")
23
-
24
- # def extract_image(image, text, prob, num=1):
25
 
26
- # inputs = feature_extractor(images=image, return_tensors="pt")
27
- # outputs = dmodel(**inputs)
28
 
29
- # # model predicts bounding boxes and corresponding COCO classes
30
- # logits = outputs.logits
31
- # bboxes = outputs.pred_boxes
32
- # probas = outputs.logits.softmax(-1)[0, :, :-1] #removing no class as detr maps
33
 
34
- # keep = probas.max(-1).values > prob
35
- # outs = feature_extractor.post_process(outputs, torch.tensor(image.size[::-1]).unsqueeze(0))
36
- # bboxes_scaled = outs[0]['boxes'][keep].detach().numpy()
37
- # labels = outs[0]['labels'][keep].detach().numpy()
38
- # scores = outs[0]['scores'][keep].detach().numpy()
39
 
40
- # images_list = []
41
- # for i,j in enumerate(bboxes_scaled):
42
 
43
- # xmin = int(j[0])
44
- # ymin = int(j[1])
45
- # xmax = int(j[2])
46
- # ymax = int(j[3])
47
 
48
- # im_arr = np.array(image)
49
- # roi = im_arr[ymin:ymax, xmin:xmax]
50
- # roi_im = Image.fromarray(roi)
51
 
52
- # images_list.append(roi_im)
53
 
54
- # inpu = processor(text = [text], images=images_list , return_tensors="pt", padding=True)
55
- # output = model(**inpu)
56
- # logits_per_image = output.logits_per_text
57
- # probs = logits_per_image.softmax(-1)
58
- # l_idx = np.argsort(probs[-1].detach().numpy())[::-1][0:num]
59
 
60
- # final_ims = []
61
- # for i,j in enumerate(images_list):
62
- # json_dict = {}
63
- # if i in l_idx:
64
- # json_dict['image'] = images_list[i]
65
- # json_dict['score'] = probs[-1].detach().numpy()[i]
66
 
67
- # final_ims.append(json_dict)
68
 
69
- # fi = sorted(final_ims, key=lambda item: item.get("score"), reverse=True)
70
- # return fi[0]['image'], fi[0]['score']
71
 
72
- # if SUBMIT_BUTTON:
73
- # imageOutput, scoreOutput = extract(IMAGE_INPUT, TEXT_INPUT, NUMBER_INPUT)
74
- # st.image(imageOutput, caption="Cropped Image")
75
- # st.markdown("*Confidence Score:*")
76
- # st.success(scoreOutput)
 
 
1
+ import gradio as gr
2
  import numpy as np
3
  from PIL import Image
4
  from transformers import CLIPProcessor, CLIPModel, YolosImageProcessor, YolosForObjectDetection
5
  import torch
6
 
7
# Checkpoints used by the app; each processor/model pair shares one name.
_YOLOS_CHECKPOINT = "hustvl/yolos-tiny"
_CLIP_CHECKPOINT = "openai/clip-vit-base-patch16"

# Gradio input widgets: the picture, the text query, and the detection
# score threshold (pre-filled with 0.96).
i1 = gr.Image(label="Input image", type="pil")
i2 = gr.Textbox(label="Description for section to extracted")
i3 = gr.Number(label="Threshold percentage score", value=0.96)

# Gradio output widgets: the selected crop and its similarity score.
o1 = gr.Image(label="Extracted Crop part", type="pil")
o2 = gr.Textbox(label="Similarity score")

# YOLOS object detector and its image pre/post-processor, loaded once at
# import time.
feature_extractor = YolosImageProcessor.from_pretrained(_YOLOS_CHECKPOINT)
dmodel = YolosForObjectDetection.from_pretrained(_YOLOS_CHECKPOINT)

# CLIP model and its paired text/image processor.
model = CLIPModel.from_pretrained(_CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)
 
19
 
20
def extract_image(image, text, prob, num=1):
    """Crop the detected region of ``image`` that best matches ``text``.

    The YOLOS detector proposes bounding boxes; every box whose best class
    score exceeds ``prob`` is cropped out of the image, and CLIP then scores
    each crop against ``text``. The crop with the highest score is returned.

    Args:
        image: PIL image to search (the Gradio input is configured with
            ``type="pil"``).
        text: Free-text description of the region to extract.
        prob: Minimum detector class score a box needs in order to be kept.
        num: Retained for backward compatibility. Only the single best crop
            is ever returned, so this value does not affect the result.

    Returns:
        A ``(crop, score)`` tuple: the best-matching crop as a PIL image and
        its softmax-normalised CLIP score (a numpy scalar). The softmax runs
        across the candidate crops, so a lone candidate always scores 1.0.

    Raises:
        ValueError: If no detection passes ``prob`` (or every surviving box
            is degenerate), so there is nothing to compare against ``text``.
    """
    # Inference only: avoid building an autograd graph.
    with torch.no_grad():
        inputs = feature_extractor(images=image, return_tensors="pt")
        outputs = dmodel(**inputs)

    # PIL reports (width, height); the post-processor expects (height, width).
    target_sizes = torch.tensor(image.size[::-1]).unsqueeze(0)
    # Keeps only boxes whose best non-background class score is > prob and
    # rescales them to pixel coordinates of the input image.
    detections = feature_extractor.post_process_object_detection(
        outputs, threshold=prob, target_sizes=target_sizes
    )[0]
    boxes = detections["boxes"].detach().numpy()

    width, height = image.size
    pixels = np.array(image)  # converted once, reused for every crop

    crops = []
    for xmin, ymin, xmax, ymax in boxes:
        # Clamp to the image: a negative start would make numpy slice from
        # the opposite edge and silently return the wrong region.
        left = max(int(xmin), 0)
        top = max(int(ymin), 0)
        right = min(int(xmax), width)
        bottom = min(int(ymax), height)
        if right <= left or bottom <= top:
            continue  # zero-area box after truncation/clamping
        crops.append(Image.fromarray(pixels[top:bottom, left:right]))

    if not crops:
        raise ValueError(
            "No object was detected above the given threshold; "
            "try a lower threshold value."
        )

    with torch.no_grad():
        clip_inputs = processor(
            text=[text], images=crops, return_tensors="pt", padding=True
        )
        # One row per text prompt (a single prompt here), one column per crop.
        text_to_crop_logits = model(**clip_inputs).logits_per_text

    crop_probs = text_to_crop_logits.softmax(-1)[-1].detach().numpy()
    best = int(np.argmax(crop_probs))
    return crops[best], crop_probs[best]
67
 
68
# Static text shown around the demo.
title = "ClipnCrop"
description = "<p style= 'color:white'>Extract sections of images from your image by using OpenAI's CLIP and Facebooks Detr implemented on HuggingFace Transformers, if the similarity score is not so much, then please consider the prediction to be void.</p>"
article = "<p style= 'color:white; text-align:center;'><a href='https://github.com/Vishnunkumar/clipcrop' target='_blank'>clipcrop</a></p>"

# Each example row is [image file, text query, threshold], matching the
# order of the three input widgets.
examples = [
    ['ex3.jpg', 'black bag', 0.96],
    ['ex2.jpg', 'man in red dress', 0.85],
]

# Wire the widgets to extract_image and start the app.
gr_app = gr.Interface(
    fn=extract_image,
    inputs=[i1, i2, i3],
    outputs=[o1, o2],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
gr_app.launch()