Fiqa committed · verified
Commit 50b4bab · 1 Parent(s): 2e8a68a

Update app.py

Files changed (1)
  1. app.py +43 -4
app.py CHANGED
@@ -8,8 +8,9 @@ from diffusers import DiffusionPipeline
 import torch
 import spaces # Hugging Face Spaces module
 
-import easyocr
 import requests
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
 
 
 
@@ -25,12 +26,20 @@ if not hf_token:
     raise ValueError("Hugging Face token is not set in the environment variables.")
 login(token=hf_token)
 
+
+
 # Load the processor and model
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
 processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
 model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")
 pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")
+model3 = model = Qwen2VLForConditionalGeneration.from_pretrained(
+    "prithivMLmods/Qwen2-VL-OCR-2B-Instruct", torch_dtype="auto", device_map="auto"
+)
+processor2 = AutoProcessor.from_pretrained("prithivMLmods/Qwen2-VL-OCR-2B-Instruct")
+
+
 
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -46,10 +55,40 @@ def generate_caption_and_image(image):
     # reader = easyocr.Reader(['en'])
     # result = reader.readtext(img)
     import random
-    reader = easyocr.Reader(['ur', 'eng'], gpu =False)
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                },
+                {"type": "text", "text": img},
+            ],
+        }
+    ]
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = inputs.to("cuda")
+    generated_ids = model.generate(**inputs, max_new_tokens=128)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    result = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+
+
 
-    # Read text from an image
-    result = reader.readtext(img)
 
 
     # Define lists for the three variables
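
Note on the new Qwen2-VL OCR block: as committed, model3 = model = ... rebinds model (previously the BLIP captioning model), the chat template, tokenization, and decoding go through the BLIP processor rather than processor2, and the message hard-codes the Qwen demo image URL while passing the local image img as the text prompt. Below is a minimal sketch of what the OCR step presumably intends, routing everything through model3/processor2 and the user-supplied image; the helper name, prompt string, and device handling are illustrative assumptions, not part of the commit.

from qwen_vl_utils import process_vision_info

def ocr_with_qwen(img, model3, processor2, max_new_tokens=128):
    """Hypothetical helper: run Qwen2-VL-OCR-2B-Instruct on a PIL image and return the decoded text."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},  # the user-supplied image, not the demo URL
                {"type": "text", "text": "Read all the text in this image."},  # assumed prompt
            ],
        }
    ]
    # Build the chat prompt and vision inputs with the Qwen processor (processor2), not the BLIP one.
    text = processor2.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor2(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model3.device)  # follow the model's device instead of hard-coding "cuda"
    # Generate, strip the prompt tokens, and decode only the newly generated text.
    generated_ids = model3.generate(**inputs, max_new_tokens=max_new_tokens)
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
    return processor2.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]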