Spaces:
Runtime error
Runtime error
add receipt extractor
Browse files- __init__.py +0 -0
- app.py +75 -0
- extractors/__init__.py +0 -0
- extractors/idefics2json/__init__.py +0 -0
- extractors/idefics2json/parse_utils.py +53 -0
- extractors/idefics2json/receipt_extractor.py +97 -0
- requirements.txt +9 -0
__init__.py
ADDED
File without changes
|
app.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spaces
|
2 |
+
import gradio as gr
|
3 |
+
from extractors.idefics2json.receipt_extractor import ReceiptExtractor
|
4 |
+
from indexify_extractor_sdk import Content
|
5 |
+
|
6 |
+
# Built once at import time so every img2json call reuses the same extractor instance.
receipt_extractor = ReceiptExtractor()
|
7 |
+
|
8 |
+
@spaces.GPU
def img2json(image_filepath):
    """Run the receipt extractor on one image file and return its text output.

    Args:
        image_filepath: Path of the image to process, or None when the user
            has not provided one.

    Returns:
        The UTF-8 decoded data of the first item returned by the extractor.

    Raises:
        gr.Error: If no image was provided or the extractor returned nothing.
    """
    import mimetypes  # stdlib; imported locally so no file-level import is needed

    if image_filepath is None:
        raise gr.Error("Please provide some input image: either upload an image file or use the camera")

    with open(image_filepath, "rb") as f:
        image_data = f.read()

    # "image/jpg" is not a registered MIME type; derive the type from the file
    # name and fall back to JPEG when it cannot be determined.
    content_type, _ = mimetypes.guess_type(image_filepath)
    if content_type is None or not content_type.startswith("image/"):
        content_type = "image/jpeg"

    content = Content(content_type=content_type, data=image_data)
    results = receipt_extractor.extract(content)

    # Use a default so an empty result surfaces as a UI error instead of a
    # bare StopIteration, and avoid shadowing `content` in the generator.
    text_content = next((item.data.decode("utf-8") for item in results), None)
    if text_content is None:
        raise gr.Error("The extractor returned no output for this image")

    return text_content
|
22 |
+
|
23 |
+
# NOTE(review): the nesting below is reconstructed from a view that dropped
# indentation; confirm row/column membership against the intended layout.
with gr.Blocks(title="Finetuned Idefics2 for Image to JSON with Indexify") as receipt_demo:
    # Page header: title, tagline, and a link to the GitHub repository.
    gr.HTML(value="<h1 style='text-align: center'>Finetuned Idefics2 for Image to JSON with Indexify</h1>")
    gr.HTML(value="<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>")
    gr.HTML(value="<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>")

    with gr.Row():
        # Step 1: instructions and the image input (webcam or upload).
        with gr.Column():
            gr.HTML(
                value=(
                    "<p><b>Step 1:</b> Upload an image file or capture with your camera.</p>"
                    "<p style='color: #A0A0A0;'>Use this demo for single image file only. "
                    "You can extract from image files continuously and try various other extractors locally with "
                    "<a href='https://getindexify.io/'>Indexify</a>.</p>"
                )
            )
            image_file = gr.Image(type="filepath", sources=["webcam", "upload"])

        # Step 2: trigger button and the text box that shows the result.
        with gr.Column():
            gr.HTML(value="<p><b>Step 2:</b> Run the extractor.</p>")
            go_button = gr.Button(
                variant="primary",  # "primary" makes the button stand out (default is "secondary")
                value="Run extractor",
            )
            model_output_text_box = gr.Textbox(
                elem_id="model_output_text_box",
                label="Extractor Output",
            )

    # Footer with attribution links.
    with gr.Row():
        gr.HTML(
            value=(
                "<p style='text-align: center'>"
                "Developed with 🫶 by <a href='https://getindexify.io/' target='_blank'>Indexify</a> | "
                "a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product"
                "</p>"
            )
        )

    # Clicking the button feeds the image path to img2json and shows its return value.
    go_button.click(
        outputs=[model_output_text_box],
        inputs=[image_file],
        fn=img2json,
    )
|
71 |
+
|
72 |
+
# Wrap the single Blocks page in a tabbed shell with the Soft theme.
demo = gr.TabbedInterface(
    interface_list=[receipt_demo],
    tab_names=["Receipt Extraction"],
    theme=gr.themes.Soft(),
)

# Enable the request queue, then start serving.
demo.queue()
demo.launch()
|
extractors/__init__.py
ADDED
File without changes
|
extractors/idefics2json/__init__.py
ADDED
File without changes
|
extractors/idefics2json/parse_utils.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
# let's turn that into JSON
|
4 |
+
def token2json(tokens, added_vocab, is_inner_value=False):
    """
    Parse a tagged token string into nested dicts and lists.

    Fields are written as ``<s_key>...</s_key>``; ``<sep/>`` separates sibling
    values. A leaf that is a member of ``added_vocab`` and has the form
    ``<name/>`` is reduced to ``name``.

    Returns a dict (or a list of dicts when siblings are separated by
    ``<sep/>``). When nothing could be parsed, returns
    ``{"text_sequence": <remaining text>}`` at the top level, or ``[]`` when
    ``is_inner_value`` is true.
    """
    parsed = {}

    while tokens:
        opening = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
        if opening is None:
            break

        key = opening.group(1)
        open_tag = opening.group()
        closing = re.search(rf"</s_{re.escape(key)}>", tokens, re.IGNORECASE)

        if closing is None:
            # No matching end tag: strip this start tag everywhere and rescan.
            tokens = tokens.replace(open_tag, "")
            continue

        close_tag = closing.group()
        body = re.search(
            f"{re.escape(open_tag)}(.*?){re.escape(close_tag)}",
            tokens,
            re.IGNORECASE | re.DOTALL,
        )
        if body is not None:
            inner = body.group(1).strip()
            if "<s_" in inner and "</s_" in inner:
                # Non-leaf node: parse the nested fields recursively.
                nested = token2json(inner, is_inner_value=True, added_vocab=added_vocab)
                if nested:
                    parsed[key] = nested[0] if len(nested) == 1 else nested
            else:
                # Leaf node: one or more values separated by <sep/>.
                leaves = []
                for piece in inner.split("<sep/>"):
                    piece = piece.strip()
                    if piece in added_vocab and piece[0] == "<" and piece[-2:] == "/>":
                        piece = piece[1:-2]  # categorical special token
                    leaves.append(piece)
                parsed[key] = leaves[0] if len(leaves) == 1 else leaves

        # Continue after the first occurrence of the end tag.
        tokens = tokens[tokens.find(close_tag) + len(close_tag):].strip()
        if tokens[:6] == "<sep/>":
            # A separator at this level means the result is a list of siblings.
            return [parsed] + token2json(tokens[6:], is_inner_value=True, added_vocab=added_vocab)

    if parsed:
        return [parsed] if is_inner_value else parsed
    return [] if is_inner_value else {"text_sequence": tokens}
|
extractors/idefics2json/receipt_extractor.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import torch
|
3 |
+
import os
|
4 |
+
from PIL import Image
|
5 |
+
from io import BytesIO
|
6 |
+
|
7 |
+
from transformers import AutoProcessor, Idefics2ForConditionalGeneration
|
8 |
+
from huggingface_hub import hf_hub_download
|
9 |
+
from indexify_extractor_sdk import Content, Extractor, Feature
|
10 |
+
from .parse_utils import token2json
|
11 |
+
|
12 |
+
from pydantic import BaseModel
|
13 |
+
from pydantic_settings import BaseSettings
|
14 |
+
from typing import Optional, Literal, List, Union
|
15 |
+
|
16 |
+
logger = logging.getLogger(__name__)
|
17 |
+
# Value of the HF_TOKEN environment variable; None when it is not set.
token = os.getenv('HF_TOKEN')

class ModelSettings(BaseSettings):
    # Settings for the model checkpoint used by the extractor below.

    # Repo id passed to AutoProcessor / Idefics2ForConditionalGeneration.from_pretrained.
    peft_model_id: str = "nielsr/idefics2-cord-demo"
    # Defaults to the HF_TOKEN value captured above.
    hf_token: Optional[str] = token

# Single settings instance created at import time and read by ReceiptExtractor.
model_settings = ModelSettings()
|
24 |
+
|
25 |
+
class ReceiptExtractor(Extractor):
    """Extractor that converts an image into JSON text with a fine-tuned Idefics2 model.

    On construction it loads the processor and model named by
    ``model_settings.peft_model_id``, replaces the model's input and output
    embeddings with tensors downloaded from the ``nielsr/idefics2-embeddings``
    dataset repo, and moves the model to the selected device. ``extract``
    generates a tagged token sequence for one image and parses it with
    ``token2json``.
    """

    name = "tensorlake/idefics2json"
    description = "Finetuned Idefics2 for Image to JSON."
    system_dependencies = []
    input_mime_types = ["image/jpeg", "image/png"]

    def __init__(self):
        super().__init__()

        # Keep the device on the instance so extract() sends its inputs to the
        # same place as the model instead of assuming CUDA is present.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info("Using device: %s", self.device.type)
        torch_dtype = torch.float32 if self.device.type == "cpu" else torch.float16

        hf_token = model_settings.hf_token
        self.processor = AutoProcessor.from_pretrained(model_settings.peft_model_id, token=hf_token)
        # Load the base model with adapters on top
        self.model = Idefics2ForConditionalGeneration.from_pretrained(
            model_settings.peft_model_id,
            torch_dtype=torch_dtype,
            token=hf_token,
        )

        # Resized input embeddings.
        input_embeddings = self._load_embedding_tensor("input_embeddings.pt")
        input_embeddings_module = torch.nn.Embedding(
            input_embeddings.shape[0], input_embeddings.shape[1], _weight=input_embeddings
        )

        # Resized output embeddings. nn.Linear stores its weight as
        # (out_features, in_features), so a weight of shape (a, b) belongs to
        # Linear(in_features=b, out_features=a).
        output_embeddings = self._load_embedding_tensor("output_embeddings.pt")
        output_embeddings_module = torch.nn.Linear(
            output_embeddings.shape[1], output_embeddings.shape[0], bias=False
        )
        if not isinstance(output_embeddings, torch.nn.Parameter):
            # Module attributes registered as parameters only accept nn.Parameter.
            output_embeddings = torch.nn.Parameter(output_embeddings, requires_grad=False)
        output_embeddings_module.weight = output_embeddings

        # Resize first, then swap in the downloaded embedding modules.
        self.model.resize_token_embeddings(len(self.processor.tokenizer))
        self.model.set_input_embeddings(input_embeddings_module)
        self.model.set_output_embeddings(output_embeddings_module)
        self.model.to(self.device)

    def _load_embedding_tensor(self, filename):
        """Download ``filename`` from the embeddings dataset repo and load it on CPU."""
        filepath = hf_hub_download(
            repo_id="nielsr/idefics2-embeddings",
            filename=filename,
            repo_type="dataset",
            token=model_settings.hf_token,
        )
        # NOTE(review): torch.load unpickles the downloaded file; only point
        # this at repositories you trust.
        return torch.load(filepath, map_location="cpu")

    def extract(self, content: Content, params = None) -> List[Union[Feature, Content]]:
        """Generate JSON text for the image bytes in ``content.data``.

        Args:
            content: Content whose ``data`` holds the encoded image bytes.
            params: Unused; accepted for interface compatibility.

        Returns:
            A one-element list holding a text Content with the JSON document.
        """
        import json  # stdlib; imported locally so no file-level import is needed

        image = Image.open(BytesIO(content.data))
        # prepare image and prompt for the model
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract JSON."},
                    {"type": "image"},
                ]
            },
        ]
        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(text=prompt, images=[image], return_tensors="pt").to(self.device)

        # Generate token IDs
        generated_ids = self.model.generate(**inputs, max_new_tokens=768)

        # Decode back into text
        generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
        added_vocab = self.processor.tokenizer.get_added_vocab()
        generated_json = token2json(generated_texts[0], added_vocab)

        # Serialize as real JSON; str() of a dict yields Python repr (single
        # quotes), which JSON parsers reject.
        return [Content.from_text(json.dumps(generated_json, ensure_ascii=False))]

    def sample_input(self) -> Content:
        """Return the bytes of ``sample.jpg`` wrapped as JPEG Content."""
        filepath = "sample.jpg"
        with open(filepath, 'rb') as f:
            image_data = f.read()
        # "image/jpeg" is the registered type and matches input_mime_types.
        return Content(content_type="image/jpeg", data=image_data)
|
89 |
+
|
90 |
+
if __name__ == "__main__":
    # Manual smoke test: run the extractor on sample.jpg and print the result.
    filepath = "sample.jpg"
    with open(filepath, 'rb') as f:
        image_data = f.read()
    # "image/jpeg" is the registered MIME type and matches
    # ReceiptExtractor.input_mime_types ("image/jpg" does not).
    data = Content(content_type="image/jpeg", data=image_data)
    extractor = ReceiptExtractor()
    results = extractor.extract(data)
    print(results)
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
indexify-extractor-sdk
|
2 |
+
accelerate==0.27.2
|
3 |
+
transformers==4.40.2
|
4 |
+
numpy==1.26.4
|
5 |
+
pydantic==2.6.3
|
6 |
+
pydantic-settings==2.2.1
|
7 |
+
torch==2.2.0
|
8 |
+
bitsandbytes
|
9 |
+
peft
|