Spaces:

tensorlake
/

image-extractors

Runtime error

App Files Files Community

rishiraj committed on May 13, 2024

Commit

c04f965

1 Parent(s): 590aec3

add receipt extractor

Browse files

Files changed (7) hide show

__init__.py +0 -0
app.py +75 -0
extractors/__init__.py +0 -0
extractors/idefics2json/__init__.py +0 -0
extractors/idefics2json/parse_utils.py +53 -0
extractors/idefics2json/receipt_extractor.py +97 -0
requirements.txt +9 -0

__init__.py ADDED Viewed

File without changes

app.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import spaces
+import gradio as gr
+from extractors.idefics2json.receipt_extractor import ReceiptExtractor
+from indexify_extractor_sdk import Content
# Build the extractor once at import time so every request reuses it.
receipt_extractor = ReceiptExtractor()


@spaces.GPU
def img2json(image_filepath):
	"""Run the receipt extractor on an image file and return its text output.

	Args:
		image_filepath: Path of the uploaded or captured image, or None when
			the user has not provided one.

	Returns:
		The UTF-8 decoded data of the first item returned by the extractor.

	Raises:
		gr.Error: If no image was provided or the extractor returned nothing.
	"""
	import mimetypes

	if image_filepath is None:
		raise gr.Error("Please provide some input image: either upload an image file or use the camera")
	with open(image_filepath, "rb") as f:
		image_data = f.read()
	# Derive the MIME type from the file name instead of labelling everything
	# with the non-standard "image/jpg"; fall back to JPEG when unknown.
	mime_type = mimetypes.guess_type(image_filepath)[0] or "image/jpeg"
	content = Content(content_type=mime_type, data=image_data)
	results = receipt_extractor.extract(content)
	# Surface an empty result as a UI error rather than a bare StopIteration.
	first_result = next(iter(results), None)
	if first_result is None:
		raise gr.Error("The extractor returned no output for this image")
	return first_result.data.decode("utf-8")
# Page copy, hoisted out of the layout so the structure below reads cleanly.
_PAGE_TITLE = "Finetuned Idefics2 for Image to JSON with Indexify"
_BANNER_HTML = (
	"<h1 style='text-align: center'>Finetuned Idefics2 for Image to JSON with Indexify</h1>",
	"<p style='text-align: center'>Indexify is a scalable realtime and continuous indexing and structured extraction engine for unstructured data to build generative AI applications</p>",
	"<h3 style='text-align: center'>If you like this demo, please ⭐ Star us on <a href='https://github.com/tensorlakeai/indexify' target='_blank'>GitHub</a>!</h3>",
)
_STEP_ONE_HTML = (
	"<p><b>Step 1:</b> Upload an image file or capture with your camera.</p>"
	"<p style='color: #A0A0A0;'>Use this demo for single image file only. "
	"You can extract from image files continuously and try various other extractors locally with "
	"<a href='https://getindexify.io/'>Indexify</a>.</p>"
)
_STEP_TWO_HTML = "<p><b>Step 2:</b> Run the extractor.</p>"
_FOOTER_HTML = (
	"<p style='text-align: center'>"
	"Developed with 🫶 by <a href='https://getindexify.io/' target='_blank'>Indexify</a> | "
	"a <a href='https://www.tensorlake.ai/' target='_blank'>Tensorlake</a> product"
	"</p>"
)

with gr.Blocks(title=_PAGE_TITLE) as receipt_demo:
	# Heading, product blurb and GitHub call-to-action, in that order.
	for banner in _BANNER_HTML:
		gr.HTML(banner)

	with gr.Row():
		# Left column: image input (upload or webcam), passed on as a file path.
		with gr.Column():
			gr.HTML(_STEP_ONE_HTML)
			image_file = gr.Image(sources=["webcam", "upload"], type="filepath")
		# Right column: trigger button and the extractor's text output.
		with gr.Column():
			gr.HTML(_STEP_TWO_HTML)
			# "primary" makes the button stand out (the default is "secondary").
			go_button = gr.Button(value="Run extractor", variant="primary")
			model_output_text_box = gr.Textbox(
				label="Extractor Output",
				elem_id="model_output_text_box",
			)

	with gr.Row():
		gr.HTML(_FOOTER_HTML)

	# Clicking the button sends the image path to img2json and shows its result.
	go_button.click(fn=img2json, inputs=[image_file], outputs=[model_output_text_box])

# Wrap the single page in a tabbed interface, enable queuing, then serve it.
demo = gr.TabbedInterface(
	[receipt_demo],
	["Receipt Extraction"],
	theme=gr.themes.Soft(),
)
demo.queue()
demo.launch()

extractors/__init__.py ADDED Viewed

File without changes

extractors/idefics2json/__init__.py ADDED Viewed

File without changes

extractors/idefics2json/parse_utils.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import re
+# let's turn that into JSON
def token2json(tokens, added_vocab=None, is_inner_value=False):
    """Convert a (generated) token sequence into an ordered JSON-like structure.

    Fields are written as ``<s_key>...</s_key>``; fields may nest, and
    ``<sep/>`` separates sibling values or sibling groups.

    Args:
        tokens: The generated text to parse.
        added_vocab: Container of special tokens. A leaf of the form
            ``<name/>`` that is present here is reduced to ``name``.
            ``None`` is treated as an empty vocabulary.
        is_inner_value: True when parsing the content of an enclosing field;
            the result is then always a list.

    Returns:
        At the top level, a dict of parsed fields, or
        ``{"text_sequence": <remaining text>}`` when no field was parsed
        (a list is returned if a ``<sep/>`` directly follows a closed field).
        For inner values, a list of dicts (possibly empty).
    """
    # A missing vocabulary simply means "no categorical special tokens".
    vocab = added_vocab if added_vocab is not None else ()
    output = {}
    while tokens:
        start_match = re.search(r"<s_(.*?)>", tokens, re.IGNORECASE)
        if start_match is None:
            break
        key = start_match.group(1)
        start_token = start_match.group()
        end_match = re.search(rf"</s_{re.escape(key)}>", tokens, re.IGNORECASE)
        if end_match is None:
            # Unclosed field: drop its opening tag(s) and keep scanning.
            tokens = tokens.replace(start_token, "")
            continue
        end_token = end_match.group()
        body = re.search(
            f"{re.escape(start_token)}(.*?){re.escape(end_token)}",
            tokens,
            re.IGNORECASE | re.DOTALL,
        )
        if body is not None:
            content = body.group(1).strip()
            if "<s_" in content and "</s_" in content:  # non-leaf node
                value = token2json(content, added_vocab=added_vocab, is_inner_value=True)
                if value:
                    # A single nested group is unwrapped from its list.
                    output[key] = value[0] if len(value) == 1 else value
            else:  # leaf nodes
                leaves = []
                for leaf in content.split("<sep/>"):
                    leaf = leaf.strip()
                    if leaf in vocab and leaf.startswith("<") and leaf.endswith("/>"):
                        leaf = leaf[1:-2]  # for categorical special tokens
                    leaves.append(leaf)
                output[key] = leaves[0] if len(leaves) == 1 else leaves
        # Continue right after the first closing tag of this field.
        tokens = tokens[end_match.end():].strip()
        if tokens[:6] == "<sep/>":  # another sibling group follows
            return [output] + token2json(tokens[6:], added_vocab=added_vocab, is_inner_value=True)
    if output:
        return [output] if is_inner_value else output
    return [] if is_inner_value else {"text_sequence": tokens}

extractors/idefics2json/receipt_extractor.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import logging
+import torch
+import os
+from PIL import Image
+from io import BytesIO
+from transformers import AutoProcessor, Idefics2ForConditionalGeneration
+from huggingface_hub import hf_hub_download
+from indexify_extractor_sdk import Content, Extractor, Feature
+from .parse_utils import token2json
+from pydantic import BaseModel
+from pydantic_settings import BaseSettings
+from typing import Optional, Literal, List, Union
# Logger named after this module.
logger = logging.getLogger(__name__)

# Value of the HF_TOKEN environment variable, or None when it is not set.
token = os.environ.get("HF_TOKEN")


class ModelSettings(BaseSettings):
    """Settings for the receipt extractor's model."""

    # Hub repository id of the fine-tuned model to load.
    peft_model_id: str = "nielsr/idefics2-cord-demo"
    # Defaults to the HF_TOKEN value captured above.
    hf_token: Optional[str] = token


# Shared settings instance used by the extractor below.
model_settings = ModelSettings()
class ReceiptExtractor(Extractor):
    """Extractor that converts an image into structured fields with a finetuned Idefics2 model.

    The processor and model are loaded once in ``__init__``; ``extract`` runs
    generation on a single image and parses the generated text with
    ``token2json``.
    """

    name = "tensorlake/idefics2json"
    description = "Finetuned Idefics2 for Image to JSON."
    system_dependencies = []
    input_mime_types = ["image/jpeg", "image/png"]

    def __init__(self):
        """Load the processor and model, install the resized embeddings, and move the model to the device."""
        super().__init__()
        # Keep the chosen device so the model and its inputs always end up on
        # the same one (previously "cuda" was hard-coded despite the fallback).
        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info("Using device: %s", self._device.type)
        torch_dtype = torch.float32 if self._device.type == "cpu" else torch.float16

        self.processor = AutoProcessor.from_pretrained(model_settings.peft_model_id)
        # Load the base model with adapters on top
        self.model = Idefics2ForConditionalGeneration.from_pretrained(
            model_settings.peft_model_id,
            torch_dtype=torch_dtype,
        )

        # NOTE(review): torch.load unpickles the downloaded files; consider
        # weights_only=True if these files hold plain tensors.
        # get the resized input embeddings
        filepath = hf_hub_download(
            repo_id="nielsr/idefics2-embeddings",
            filename="input_embeddings.pt",
            repo_type="dataset",
        )
        input_embeddings = torch.load(filepath, map_location="cpu")
        input_embeddings_module = torch.nn.Embedding(
            input_embeddings.shape[0],
            input_embeddings.shape[1],
            _weight=input_embeddings,
        )

        # get the resized output embeddings
        filepath = hf_hub_download(
            repo_id="nielsr/idefics2-embeddings",
            filename="output_embeddings.pt",
            repo_type="dataset",
        )
        output_embeddings = torch.load(filepath, map_location="cpu")
        # nn.Linear stores its weight as (out_features, in_features), so build
        # the layer with dimensions that match the weight assigned below.
        out_features, in_features = output_embeddings.shape
        output_embeddings_module = torch.nn.Linear(in_features, out_features, bias=False)
        if not isinstance(output_embeddings, torch.nn.Parameter):
            # A module's weight must be a Parameter; a plain tensor is rejected.
            output_embeddings = torch.nn.Parameter(output_embeddings, requires_grad=False)
        output_embeddings_module.weight = output_embeddings

        # set them accordingly
        self.model.resize_token_embeddings(len(self.processor.tokenizer))
        self.model.set_input_embeddings(input_embeddings_module)
        self.model.set_output_embeddings(output_embeddings_module)
        self.model.to(self._device)

    def extract(self, content: Content, params = None) -> List[Union[Feature, Content]]:
        """Generate structured fields for the image held in ``content``.

        Args:
            content: Content whose ``data`` is the encoded image bytes.
            params: Unused.

        Returns:
            A single-element list with a text Content holding ``str()`` of the
            parsed structure.
        """
        image = Image.open(BytesIO(content.data))
        # prepare image and prompt for the model
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract JSON."},
                    {"type": "image"},
                ]
            },
        ]
        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(text=prompt, images=[image], return_tensors="pt").to(self._device)
        # Generate token IDs
        generated_ids = self.model.generate(**inputs, max_new_tokens=768)
        # Decode back into text
        generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
        added_vocab = self.processor.tokenizer.get_added_vocab()
        generated_json = token2json(generated_texts[0], added_vocab)
        # NOTE(review): str() yields the Python repr of the parsed structure,
        # not strict JSON text - confirm what consumers expect.
        return [Content.from_text(str(generated_json))]

    def sample_input(self) -> Content:
        """Return the local ``sample.jpg`` file wrapped as image Content."""
        filepath = "sample.jpg"
        with open(filepath, 'rb') as f:
            image_data = f.read()
        # Use the registered MIME type, matching input_mime_types above.
        return Content(content_type="image/jpeg", data=image_data)
if __name__ == "__main__":
    # Smoke test: run the extractor once on the local sample image and print
    # the result. The file is read before the model is loaded, so a missing
    # sample fails fast.
    filepath = "sample.jpg"
    with open(filepath, 'rb') as f:
        image_data = f.read()
    # "image/jpeg" is the registered MIME type and matches input_mime_types.
    data = Content(content_type="image/jpeg", data=image_data)
    extractor = ReceiptExtractor()
    results = extractor.extract(data)
    print(results)

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+indexify-extractor-sdk
+accelerate==0.27.2
+transformers==4.40.2
+numpy==1.26.4
+pydantic==2.6.3
+pydantic-settings==2.2.1
+torch==2.2.0
+bitsandbytes
+peft