xelpmocAI committed on
Commit
f52050f
·
verified ·
1 Parent(s): 9749bd2

qwen update, per image, two output

Browse files
Files changed (1) hide show
  1. app.py +91 -33
app.py CHANGED
@@ -1,46 +1,104 @@
1
  import re
2
  import gradio as gr
3
 
 
 
4
  import torch
5
- from transformers import DonutProcessor, VisionEncoderDecoderModel
6
 
7
- processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
8
- model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
 
 
9
 
 
 
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
  model.to(device)
12
 
13
- def process_document(image):
14
 
15
- print(image)
16
- print(f"Type of Image {image}")
17
- # prepare encoder inputs
18
- pixel_values = processor(image, return_tensors="pt").pixel_values
19
-
20
- # prepare decoder inputs
21
- task_prompt = "<s_cord-v2>"
22
- decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
23
-
24
- # generate answer
25
- outputs = model.generate(
26
- pixel_values.to(device),
27
- decoder_input_ids=decoder_input_ids.to(device),
28
- max_length=model.decoder.config.max_position_embeddings,
29
- early_stopping=True,
30
- pad_token_id=processor.tokenizer.pad_token_id,
31
- eos_token_id=processor.tokenizer.eos_token_id,
32
- use_cache=True,
33
- num_beams=1,
34
- bad_words_ids=[[processor.tokenizer.unk_token_id]],
35
- return_dict_in_generate=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  )
37
-
38
- # postprocess
39
- sequence = processor.batch_decode(outputs.sequences)[0]
40
- sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
41
- sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token
42
-
43
- return processor.token2json(sequence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  # article = "<p style='text-align: center'><a href='https://www.xelpmoc.in/' target='_blank'>Made by Xelpmoc</a></p>"
46
 
@@ -51,7 +109,7 @@ demo = gr.Interface(
51
  title="Template-Free OCR model",
52
  # article=article,
53
  enable_queue=True,
54
- examples=[["example.png"], ["example_2.png"], ["example_3.png"]],
55
  cache_examples=False)
56
 
57
  demo.launch()
 
1
  import re
2
  import gradio as gr
3
 
4
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
5
+ from qwen_vl_utils import process_vision_info
6
  import torch
7
+ from ast import literal_eval
8
 
9
# Load the Qwen2-VL model. device_map="auto" asks from_pretrained to place
# the weights on the available device(s), and torch_dtype="auto" keeps the
# dtype stored in the checkpoint.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)

# Default processor (chat template, tokenizer and image preprocessing).
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Kept for any code that reads `device`. The model itself is intentionally
# NOT moved with model.to(device): it was already dispatched by
# device_map="auto", and moving a dispatched model is redundant on a single
# device and warns or fails when the weights are sharded/offloaded.
device = "cuda" if torch.cuda.is_available() else "cpu"
18
 
 
19
 
20
+
21
# Prompt asking the model for the "other benefits" section of the document.
# The template after the first line must stay a well-formed Python literal:
# the model is asked to answer "in the given format" and the reply is parsed
# with ast.literal_eval, so an unbalanced quote or a missing comma here
# invites replies that cannot be parsed.
# NOTE: the identifier keeps its historical spelling ("benifits") because
# other code refers to it by this name.
other_benifits = '''Extract the following information in the given format:
{'other_benefits_and_information': {
'401k eru': {'This Period':'', 'Year-to-Date':''}},
'quota summary':
{
'sick:': '',
'vacation:': '',
},
'payment method': '',
'Amount': ''
}
'''

# Prompt asking the model for the tax-deduction section of the document.
tax_deductions = '''Extract the following information in the given format:
{
'tax_deductions': {
'federal:': {
'withholding tax:': {'Amount':'', 'Year-To_Date':""},
'ee social security tax:': {'Amount':'', 'Year-To_Date':""},
'ee medicare tax:': {'Amount':'', 'Year-To_Date':""}},
'california:': {
'withholding tax:': {'Amount':'', 'Year-To_Date':""},
'ee disability tax:': {'Amount':'', 'Year-To_Date':""}}},
}
'''
46
+
47
def _parse_model_reply(reply):
    """Turn the model's text reply into a Python object when possible.

    The reply is expected to contain a Python/JSON-like literal, optionally
    wrapped in a Markdown code fence. A plain fence is tried first, then a
    ```json fence. If neither candidate parses, the raw reply text is
    returned unchanged so the caller still has something to display.
    """
    for fence_opener in ("```\n", "```json\n"):
        # Keep what follows the last opener and precedes the closing fence.
        candidate = reply.split(fence_opener)[-1].split("\n```")[0]
        try:
            return literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed literal for this fence style; try the next one.
            continue
    return reply


def _extract_fields(image_name, prompt):
    """Run one image + text prompt through the model and parse the reply.

    Args:
        image_name: The image handed to the chat message as its "image"
            entry (NOTE(review): exact accepted types depend on
            process_vision_info and on how the UI passes the image).
        prompt: Instruction text sent alongside the image.

    Returns:
        The parsed literal from the model's reply, or the raw reply string
        when it cannot be parsed.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_name},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device instead of hard-coding "cuda", so the
    # app also runs where no GPU is available.
    inputs = inputs.to(model.device)

    # Inference: generate, then drop the echoed prompt tokens from each row.
    generated_ids = model.generate(**inputs, max_new_tokens=1500)
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return _parse_model_reply(output_text[0])


# Backward-compatible alias for the helper's original name. The module later
# rebinds `demo` to the gr.Interface object, so code in this file must call
# _extract_fields directly rather than `demo`.
demo = _extract_fields


def process_document(image):
    """Extract both information sections from a single document image.

    Runs the model twice on the same image, once per prompt, and returns
    the two results as a pair: (other benefits, tax deductions).
    """
    one = _extract_fields(image, other_benifits)
    two = _extract_fields(image, tax_deductions)

    return one, two
102
 
103
  # article = "<p style='text-align: center'><a href='https://www.xelpmoc.in/' target='_blank'>Made by Xelpmoc</a></p>"
104
 
 
109
  title="Template-Free OCR model",
110
  # article=article,
111
  enable_queue=True,
112
+ examples=[["example.png"], ["example_2.png"]],
113
  cache_examples=False)
114
 
115
  demo.launch()