seawolf2357 committed on
Commit 1e442f4 · verified · 1 Parent(s): dedab71

Update app.py

Files changed (1)
  1. app.py +15 -27
app.py CHANGED
@@ -4,37 +4,25 @@ from gradio_client import Client  # assumption: the gradio_client library is used
 import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
 import torch
 from PIL import Image
+import requests
 
-model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+# Load the model and tokenizer
+model_id = "nlpconnect/vit-gpt2-image-captioning"
+model = VisionEncoderDecoderModel.from_pretrained(model_id)
+feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 
-max_length = 16
-num_beams = 4
-gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
-def predict_step(image_paths):
-    images = []
-    for image_path in image_paths:
-        i_image = Image.open(image_path)
-        if i_image.mode != "RGB":
-            i_image = i_image.convert(mode="RGB")
+def predict_caption(image):
+    image = Image.open(image)
+    inputs = feature_extractor(images=image, return_tensors="pt")
+    pixel_values = inputs["pixel_values"].to(device)
 
-        images.append(i_image)
-
-    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
-    pixel_values = pixel_values.to(device)
-
-    output_ids = model.generate(pixel_values, **gen_kwargs)
-
-    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-    preds = [pred.strip() for pred in preds]
-    return preds
-
-
-predict_step(pipeline)
+    output_ids = model.generate(pixel_values)
+    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    return caption
 
 
 # Load the image-recognition pipeline
@@ -81,13 +69,13 @@ def classify_and_generate_voice(uploaded_image):
 
     # Pass the returned voice and music results to the Gradio interface
     # e.g. voice_result['url'] or voice_result['audio_data'], etc.
-    return top_prediction, voice_result, music_result
+    return caption, top_prediction, voice_result, music_result
 
 # Create the Gradio interface
 iface = gr.Interface(
     fn=classify_and_generate_voice,
     inputs=gr.Image(type="pil"),
-    outputs=[gr.Label(), gr.Audio(), gr.Audio()],
+    outputs=[gr.Textbox(label="Caption"), gr.Label(), gr.Audio(), gr.Audio()],
     title="msVision_3",
     description="When you upload an image, it recognizes the object and generates matching voice and music.",
     examples=["dog.jpg", "cat.png", "cafe.jpg"]