artificialguybr committed on
Commit
3f49fe4
·
1 Parent(s): 03aebb5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -16
app.py CHANGED
@@ -2,20 +2,21 @@ import gradio as gr
2
  from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
4
  from PIL import Image
 
5
  import requests
6
  from io import BytesIO
7
 
8
- # Load the Qwen-VL model and tokenizer
9
  tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)
10
  model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL", device_map="cuda", trust_remote_code=True).eval()
11
 
12
  def generate_predictions(image_input, text_input):
13
- # Save the image locally to match the original example
14
  user_image_path = "/tmp/user_input_test_image.jpg"
15
- image_input.save(user_image_path)
16
- image_input = Image.open(user_image_path)
17
-
18
- # Prepare the inputs
19
  query = tokenizer.from_list_format([
20
  {'image': user_image_path},
21
  {'text': text_input},
@@ -23,20 +24,45 @@ def generate_predictions(image_input, text_input):
23
  inputs = tokenizer(query, return_tensors='pt')
24
  inputs = inputs.to(model.device)
25
 
26
- # Generate the caption
27
  pred = model.generate(**inputs)
28
- response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
 
 
 
29
 
30
- # Draw bounding boxes if any
31
- image_with_boxes = tokenizer.draw_bbox_on_latest_picture(response)
32
 
33
- return image_with_boxes, response
 
 
 
 
 
 
34
 
35
- # Create Gradio Interface
 
36
  iface = gr.Interface(
37
  fn=generate_predictions,
38
- inputs=["image", "text"],
39
- outputs=["image", "text"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  )
41
-
42
- iface.launch()
 
2
  from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
4
  from PIL import Image
5
+ import re # Importando o módulo de expressões regulares
6
  import requests
7
  from io import BytesIO
8
 
# Load the Qwen-VL tokenizer and model once, at import time, so that every
# request served by the app reuses the same objects.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen-VL",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-VL",
    device_map="cuda",
    trust_remote_code=True,
)
# Switch to inference mode; eval() returns the module itself.
model = model.eval()
12
 
def _as_rgb_image(image_input):
    """Return *image_input* as an RGB PIL image without altering its colours.

    Accepts a PIL image or an array-like of pixels. uint8 arrays are used
    as-is; floating-point arrays whose values lie in [0, 1] are scaled to
    [0, 255]; any other numeric array is rounded and clipped to [0, 255].
    """
    # Local import: the file's import block does not bring numpy into scope.
    import numpy as np

    if isinstance(image_input, Image.Image):
        return image_input.convert("RGB")

    pixels = np.asarray(image_input)
    if pixels.dtype != np.uint8:
        is_unit_float = (
            np.issubdtype(pixels.dtype, np.floating)
            and pixels.size > 0
            and pixels.max() <= 1.0
        )
        if is_unit_float:
            pixels = pixels * 255.0
        pixels = np.clip(np.rint(pixels), 0, 255).astype(np.uint8)
    # JPEG cannot store an alpha channel, so normalise the mode to RGB.
    return Image.fromarray(pixels).convert("RGB")


def generate_predictions(image_input, text_input):
    """Run Qwen-VL on one image/prompt pair.

    Args:
        image_input: The uploaded picture, as a pixel array or a PIL image.
        text_input: The prompt sent to the model together with the picture.

    Returns:
        A ``(image_with_boxes, frontend_response)`` tuple. The first item is
        a PIL image when ``tokenizer.draw_bbox_on_latest_picture`` returns a
        truthy result, otherwise whatever falsy value it returned. The second
        item is the decoded model output with the ``Picture N:`` prefix,
        ``<...>`` tags, the temporary image path and the prompt removed.

    Raises:
        ValueError: If no image was supplied.
    """
    import os
    import tempfile

    if image_input is None:
        raise ValueError("generate_predictions requires an input image")
    text_input = text_input or ""

    # A unique file per call keeps concurrent requests from overwriting each
    # other's picture (the previous fixed /tmp path was shared by everyone).
    handle, user_image_path = tempfile.mkstemp(suffix=".jpg")
    os.close(handle)
    boxed_path = None
    try:
        # Save the picture unchanged. The earlier "255 - x * 255" arithmetic
        # wrapped around in uint8 and turned pure black pixels white.
        _as_rgb_image(image_input).save(user_image_path)

        # Prepare the inputs.
        query = tokenizer.from_list_format([
            {'image': user_image_path},
            {'text': text_input},
        ])
        inputs = tokenizer(query, return_tensors='pt')
        inputs = inputs.to(model.device)

        # Generate the caption.
        pred = model.generate(**inputs)
        full_response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)

        # Strip the image path, the "Picture N:" prefix, the markup tags and
        # the prompt from the text shown to the user. The path is removed as
        # an exact string so no generated text can be swallowed with it.
        frontend_response = full_response.replace(user_image_path, '')
        frontend_response = re.sub(r'Picture \d+:|<.*?>', '', frontend_response)
        frontend_response = frontend_response.replace(text_input, '').strip()

        # Draw bounding boxes, if any. The input file must still exist here
        # (presumably the tokenizer re-reads it via the path embedded in
        # full_response -- confirm against the Qwen-VL tokenizer code).
        image_with_boxes = tokenizer.draw_bbox_on_latest_picture(full_response)

        # Round-trip through a file so that the value handed to Gradio is a
        # PIL image, then detach it from the file before the file is removed.
        if image_with_boxes:
            handle, boxed_path = tempfile.mkstemp(suffix=".jpg")
            os.close(handle)
            image_with_boxes.save(boxed_path)
            with Image.open(boxed_path) as reloaded:
                image_with_boxes = reloaded.copy()
    finally:
        for leftover in (user_image_path, boxed_path):
            if leftover is not None and os.path.exists(leftover):
                os.remove(leftover)

    return image_with_boxes, frontend_response
44
 
# Create the Gradio interface. Components come from the top-level gradio
# namespace: gr.inputs / gr.outputs and the `default=` keyword are deprecated
# in Gradio 3 and removed in Gradio 4.
iface = gr.Interface(
    fn=generate_predictions,
    inputs=[
        # type="numpy" keeps handing generate_predictions a pixel array.
        gr.Image(type="numpy", label="Image Input"),
        gr.Textbox(value="Generate a caption for that image with grounding:", label="Prompt"),
    ],
    outputs=[
        gr.Image(type='pil', label="Image"),  # generate_predictions returns a PIL image
        gr.Textbox(label="Generated"),
    ],
    title="Qwen-VL Demonstration",
    description="""
## Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud
**Space by [@Artificialguybr](https://twitter.com/artificialguybr)**

### Key Features:
- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.
- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.
- **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.
""",
)
iface.launch(share=True)