rishiraj committed on
Commit
c04f965
·
1 Parent(s): 590aec3

add receipt extractor

Browse files
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ from extractors.idefics2json.receipt_extractor import ReceiptExtractor
4
+ from indexify_extractor_sdk import Content
5
+
6
+ receipt_extractor = ReceiptExtractor()
7
+
8
@spaces.GPU
def img2json(image_filepath):
    """Run the receipt extractor on one image file and return its text output.

    Args:
        image_filepath: Path of the uploaded/captured image, or ``None`` when
            the user did not provide one.

    Returns:
        The UTF-8 decoded data of the first item returned by the extractor.

    Raises:
        gr.Error: If no image was provided, or if the extractor returned no
            output at all.
    """
    # Imported locally so this function is self-contained.
    import mimetypes

    if image_filepath is None:
        raise gr.Error("Please provide some input image: either upload an image file or use the camera")

    with open(image_filepath, "rb") as f:
        image_data = f.read()

    # Label the bytes with a real MIME type. "image/jpg" is not a registered
    # type; fall back to "image/jpeg" when the extension is unknown.
    mime_type, _ = mimetypes.guess_type(image_filepath)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"
    content = Content(content_type=mime_type, data=image_data)

    results = receipt_extractor.extract(content)

    # Only the first result is shown. Use a default so that an empty result
    # surfaces as a readable UI error instead of a bare StopIteration.
    first_result = next(iter(results), None)
    if first_result is None:
        raise gr.Error("The extractor did not return any output for this image")

    return first_result.data.decode("utf-8")
22
+
23
# Static markup used by the page, collected up front so the layout below
# reads as structure only.
_PAGE_TITLE = "Finetuned Idefics2 for Image to JSON with Indexify"
_HEADER_HTML = "<h1 style='text-align: center'>Finetuned Idefics2 for Image to JSON with Indexify</h1>"
_TAGLINE_HTML = "<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>"
_STAR_HTML = "<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>"
_STEP_ONE_HTML = (
    "<p><b>Step 1:</b> Upload an image file or capture with your camera.</p>"
    "<p style='color: #A0A0A0;'>Use this demo for single image file only. "
    "You can extract from image files continuously and try various other extractors locally with "
    "<a href='https://getindexify.io/'>Indexify</a>.</p>"
)
_STEP_TWO_HTML = "<p><b>Step 2:</b> Run the extractor.</p>"
_FOOTER_HTML = (
    "<p style='text-align: center'>"
    "Developed with 🫶 by <a href='https://getindexify.io/' target='_blank'>Indexify</a> | "
    "a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product"
    "</p>"
)

with gr.Blocks(title=_PAGE_TITLE) as receipt_demo:
    # Page header.
    gr.HTML(_HEADER_HTML)
    gr.HTML(_TAGLINE_HTML)
    gr.HTML(_STAR_HTML)

    with gr.Row():
        # Left column: image input.
        with gr.Column():
            gr.HTML(_STEP_ONE_HTML)
            image_file = gr.Image(sources=["webcam", "upload"], type="filepath")

        # Right column: trigger button and extractor output.
        with gr.Column():
            gr.HTML(_STEP_TWO_HTML)
            # "primary" makes the button stand out (default is "secondary").
            go_button = gr.Button(value="Run extractor", variant="primary")
            model_output_text_box = gr.Textbox(
                label="Extractor Output",
                elem_id="model_output_text_box",
            )

    # Footer.
    with gr.Row():
        gr.HTML(_FOOTER_HTML)

    # Wire the button to the extraction function.
    go_button.click(fn=img2json, inputs=[image_file], outputs=[model_output_text_box])

# Wrap the single page in a tabbed interface and start serving.
demo = gr.TabbedInterface([receipt_demo], ["Receipt Extraction"], theme=gr.themes.Soft())

demo.queue()
demo.launch()
extractors/__init__.py ADDED
File without changes
extractors/idefics2json/__init__.py ADDED
File without changes
extractors/idefics2json/parse_utils.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ # let's turn that into JSON
4
def token2json(tokens, added_vocab, is_inner_value=False):
    """
    Parse a generated ``<s_key>...</s_key>`` tag sequence into nested Python data.

    Args:
        tokens: The generated text containing ``<s_key>`` / ``</s_key>`` tags,
            with ``<sep/>`` separating sibling values.
        added_vocab: Container of added tokens; a leaf that is found in it and
            looks like ``<name/>`` is reduced to ``name``.
        is_inner_value: True for recursive calls; the result is then always a
            list.

    Returns:
        A dict for a top-level call (``{"text_sequence": tokens}`` when no tag
        pair could be parsed), or a list of dicts for inner calls and whenever
        a ``<sep/>`` directly follows a closed tag.
    """

    def _clean_leaf(raw):
        # Categorical special tokens such as "<yes/>" are stored without the
        # surrounding "<" and "/>".
        text = raw.strip()
        if text in added_vocab and text[0] == "<" and text[-2:] == "/>":
            return text[1:-2]
        return text

    parsed = {}

    while tokens:
        opening = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
        if opening is None:
            break
        key = opening.group(1)
        open_tag = opening.group()

        closing = re.search(rf"</s_{re.escape(key)}>", tokens, re.IGNORECASE)
        if closing is None:
            # Unbalanced opening tag: drop it and keep scanning.
            tokens = tokens.replace(open_tag, "")
            continue

        close_tag = closing.group()
        body = re.search(
            f"{re.escape(open_tag)}(.*?){re.escape(close_tag)}",
            tokens,
            re.IGNORECASE | re.DOTALL,
        )
        if body is not None:
            inner = body.group(1).strip()
            if r"<s_" in inner and r"</s_" in inner:
                # Non-leaf node: parse the nested tags recursively.
                nested = token2json(inner, is_inner_value=True, added_vocab=added_vocab)
                if nested:
                    parsed[key] = nested[0] if len(nested) == 1 else nested
            else:
                # Leaf node: one value, or several separated by <sep/>.
                leaves = [_clean_leaf(piece) for piece in inner.split(r"<sep/>")]
                parsed[key] = leaves[0] if len(leaves) == 1 else leaves

        # Continue after the closing tag.
        tokens = tokens[tokens.find(close_tag) + len(close_tag):].strip()
        if tokens[:6] == r"<sep/>":
            # A separator here starts a sibling object.
            return [parsed] + token2json(tokens[6:], is_inner_value=True, added_vocab=added_vocab)

    if parsed:
        return [parsed] if is_inner_value else parsed
    return [] if is_inner_value else {"text_sequence": tokens}
extractors/idefics2json/receipt_extractor.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import torch
3
+ import os
4
+ from PIL import Image
5
+ from io import BytesIO
6
+
7
+ from transformers import AutoProcessor, Idefics2ForConditionalGeneration
8
+ from huggingface_hub import hf_hub_download
9
+ from indexify_extractor_sdk import Content, Extractor, Feature
10
+ from .parse_utils import token2json
11
+
12
+ from pydantic import BaseModel
13
+ from pydantic_settings import BaseSettings
14
+ from typing import Optional, Literal, List, Union
15
+
16
+ logger = logging.getLogger(__name__)
17
+ token = os.getenv('HF_TOKEN')
18
+
19
class ModelSettings(BaseSettings):
    """Settings that identify the fine-tuned model to load."""

    # Hub repository id passed to the processor and model loaders.
    peft_model_id: str = "nielsr/idefics2-cord-demo"

    # Defaults to the module-level `token`, which is read from the HF_TOKEN
    # environment variable.
    hf_token: Optional[str] = token
22
+
23
+ model_settings = ModelSettings()
24
+
25
class ReceiptExtractor(Extractor):
    """Extract structured data from a receipt image with a fine-tuned Idefics2 model.

    The processor, model and resized embeddings are loaded once in
    ``__init__``. ``extract`` prompts the model with "Extract JSON." and turns
    the generated tag sequence into JSON text via ``token2json``.
    """

    name = "tensorlake/idefics2json"
    description = "Finetuned Idefics2 for Image to JSON."
    system_dependencies = []
    input_mime_types = ["image/jpeg", "image/png"]

    def __init__(self):
        """Load the processor and model and move the model to the chosen device."""
        super(ReceiptExtractor, self).__init__()

        # Keep the device on the instance so the model and the inputs built in
        # `extract` always live on the same device (CUDA when available,
        # otherwise CPU) instead of assuming "cuda".
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        logger.info(f"Using device: {self.device.type}")
        torch_dtype = torch.float32 if self.device.type == "cpu" else torch.float16

        self.processor = AutoProcessor.from_pretrained(model_settings.peft_model_id)
        # Load the base model with adapters on top
        self.model = Idefics2ForConditionalGeneration.from_pretrained(
            model_settings.peft_model_id,
            torch_dtype=torch_dtype,
        )
        # get the resized input embeddings
        filepath = hf_hub_download(repo_id="nielsr/idefics2-embeddings", filename="input_embeddings.pt", repo_type="dataset")
        input_embeddings = torch.load(filepath, map_location="cpu")
        input_embeddings_module = torch.nn.Embedding(input_embeddings.shape[0], input_embeddings.shape[1], _weight=input_embeddings)
        # set the resized output embeddings
        filepath = hf_hub_download(repo_id="nielsr/idefics2-embeddings", filename="output_embeddings.pt", repo_type="dataset")
        output_embeddings = torch.load(filepath, map_location="cpu")
        # NOTE(review): torch.nn.Linear takes (in_features, out_features) and
        # stores its weight as (out_features, in_features); the constructor
        # arguments here look transposed relative to the loaded weight. The
        # weight is replaced on the next line, so only the module's
        # in_features/out_features attributes would be affected - confirm.
        output_embeddings_module = torch.nn.Linear(output_embeddings.shape[0], output_embeddings.shape[1], bias=False)
        output_embeddings_module.weight = output_embeddings

        # set them accordingly
        self.model.resize_token_embeddings(len(self.processor.tokenizer))
        self.model.set_input_embeddings(input_embeddings_module)
        self.model.set_output_embeddings(output_embeddings_module)
        self.model.to(self.device)

    def extract(self, content: Content, params = None) -> List[Union[Feature, Content]]:
        """Generate JSON text describing the image held in ``content``.

        Args:
            content: Content whose ``data`` holds the encoded image bytes.
            params: Unused.

        Returns:
            A single-element list with a text ``Content`` containing the
            JSON-serialized result of ``token2json``.
        """
        # Imported locally so this method is self-contained.
        import json

        image = Image.open(BytesIO(content.data))
        # prepare image and prompt for the model
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract JSON."},
                    {"type": "image"},
                ]
            },
        ]
        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        # Inputs must be on the same device as the model.
        inputs = self.processor(text=prompt, images=[image], return_tensors="pt").to(self.device)

        # Generate token IDs
        generated_ids = self.model.generate(**inputs, max_new_tokens=768)

        # Decode back into text
        generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
        added_vocab = self.processor.tokenizer.get_added_vocab()
        generated_json = token2json(generated_texts[0], added_vocab)
        # Serialize as real JSON; str() on a dict yields Python repr
        # (single quotes), which JSON parsers reject.
        return [Content.from_text(json.dumps(generated_json, ensure_ascii=False))]

    def sample_input(self) -> Content:
        """Return the bundled ``sample.jpg`` as image content."""
        filepath = "sample.jpg"
        with open(filepath, 'rb') as f:
            image_data = f.read()
        # "image/jpeg" matches `input_mime_types`; "image/jpg" does not.
        return Content(content_type="image/jpeg", data=image_data)
89
+
90
if __name__ == "__main__":
    # Manual smoke test: run the extractor once on the local sample image
    # and print the result.
    filepath = "sample.jpg"
    with open(filepath, 'rb') as f:
        image_data = f.read()
    # "image/jpeg" is the registered MIME type and the one the extractor
    # declares in `input_mime_types`.
    data = Content(content_type="image/jpeg", data=image_data)
    extractor = ReceiptExtractor()
    results = extractor.extract(data)
    print(results)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ indexify-extractor-sdk
2
+ accelerate==0.27.2
3
+ transformers==4.40.2
4
+ numpy==1.26.4
5
+ pydantic==2.6.3
6
+ pydantic-settings==2.2.1
7
+ torch==2.2.0
8
+ bitsandbytes
9
+ peft