naivas-lpos

Running

waceke committed on Sep 8, 2023

Commit

ccc2402

1 Parent(s): c2d58b3

Removed the sparrow key since it wasn't needed. Changed the model and processor to senga model and processor (#1)

- Removed the sparrow key since it wasn't needed. Changed the model and processor to senga model and processor (391b6558c7e79b684f3e21c97063e16e97b5ef18)

Files changed (3) hide show

config.py +4 -6
inference.py +85 -0
training.py +85 -0

config.py CHANGED Viewed

@@ -1,13 +1,11 @@
 from pydantic import BaseSettings
-import os
 class Settings(BaseSettings):
-    huggingface_key: str = os.environ.get("huggingface_key")
-    sparrow_key: str = os.environ.get("sparrow_key")
-    processor: str = "katanaml-org/invoices-donut-model-v1"
-    model: str = "katanaml-org/invoices-donut-model-v1"
-    dataset: str = "katanaml-org/invoices-donut-data-v1"
     base_config: str = "naver-clova-ix/donut-base"
     base_processor: str = "naver-clova-ix/donut-base"
     base_model: str = "naver-clova-ix/donut-base"

 from pydantic import BaseSettings
 class Settings(BaseSettings):
+    # Hugging Face access token. Never commit a real token to source control:
+    # pydantic's BaseSettings fills this field from the `huggingface_key`
+    # environment variable (name matched case-insensitively by default), which
+    # takes priority over the empty default below.
+    huggingface_key: str = ""
+    processor: str = "senga-ml/donut-training-v4"
+    model: str = "senga-ml/donut-training-v4"
+    dataset: str = "senga-ml/dnotes-data-v1"
     base_config: str = "naver-clova-ix/donut-base"
     base_processor: str = "naver-clova-ix/donut-base"
     base_model: str = "naver-clova-ix/donut-base"

inference.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from fastapi import APIRouter, File, UploadFile, Form
+from typing import Optional
+from PIL import Image
+import urllib.request
+from io import BytesIO
+from config import settings
+import utils
+import os
+import json
+from routers.donut_inference import process_document_donut
+router = APIRouter()
+def count_values(obj):
+    """Count the leaf values in a nested structure of dicts and lists.
+    Dicts contribute the leaves of their values (keys are not counted) and
+    lists the leaves of their items; any other object counts as one leaf.
+    Empty dicts and lists therefore count as zero.
+    """
+    if isinstance(obj, dict):
+        children = obj.values()
+    elif isinstance(obj, list):
+        children = obj
+    else:
+        # Scalars, None and every other type are a single leaf.
+        return 1
+    return sum(map(count_values, children))
+def _run_model_and_log(image, source_name, model_in_use):
+    """Run the selected model on *image* and append one inference-stats row.
+    Returns the model result, or an empty list when *model_in_use* is not
+    'donut' (a stats row is still written in that case, with a time of 0).
+    """
+    result, processing_time = [], 0
+    if model_in_use == 'donut':
+        result, processing_time = process_document_donut(image)
+    utils.log_stats(settings.inference_stats_file,
+                    [processing_time, count_values(result), source_name, settings.model])
+    print(f"Processing time inference: {processing_time:.2f} seconds")
+    return result
+@router.post("/inference")
+async def run_inference(file: Optional[UploadFile] = File(None), image_url: Optional[str] = Form(None),
+                        model_in_use: str = Form('donut')):
+    """Run inference on a JPG image given as an upload or as a URL.
+    Args:
+        file: Uploaded image; takes precedence over *image_url* when both are sent.
+        image_url: http(s) URL of the image to download.
+        model_in_use: Model selector; only 'donut' triggers inference.
+    Returns:
+        The model result, ``{"error": ...}`` when the input is rejected, or
+        ``{"info": "No input provided"}`` when neither input is given.
+    """
+    from urllib.parse import urlsplit
+    jpeg_types = ("image/jpeg", "image/jpg")
+    if file:
+        # Ensure the uploaded file is a JPG image
+        if file.content_type not in jpeg_types:
+            return {"error": "Invalid file type. Only JPG images are allowed."}
+        image = Image.open(BytesIO(await file.read()))
+        return _run_model_and_log(image, file.filename, model_in_use)
+    if image_url:
+        url_parts = urlsplit(image_url)
+        # urlopen also understands file:// and ftp:// URLs; a client-supplied
+        # URL must not be able to read local files, so only web schemes pass.
+        if url_parts.scheme not in ("http", "https"):
+            return {"error": "Invalid image URL. Only http and https URLs are allowed."}
+        # Bound the download so an unresponsive host cannot hang the request.
+        with urllib.request.urlopen(image_url, timeout=30) as response:
+            if response.info().get_content_type() not in jpeg_types:
+                return {"error": "Invalid file type. Only JPG images are allowed."}
+            image = Image.open(BytesIO(response.read()))
+        # Take the name from the URL path only, so a query string or fragment
+        # never ends up in the logged file name.
+        file_name = os.path.basename(url_parts.path)
+        return _run_model_and_log(image, file_name, model_in_use)
+    return {"info": "No input provided"}
+@router.get("/statistics")
+async def get_statistics():
+    """Return the parsed contents of the inference stats file.
+    An empty list is returned when the file does not exist or does not
+    contain valid JSON.
+    """
+    stats_path = settings.inference_stats_file
+    if not os.path.exists(stats_path):
+        return []
+    with open(stats_path, 'r') as stats_file:
+        try:
+            return json.load(stats_file)
+        except json.JSONDecodeError:
+            # An unparsable stats file is reported the same way as a missing one.
+            return []

training.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from fastapi import APIRouter, Form, BackgroundTasks
+from config import settings
+import os
+import json
+from routers.donut_evaluate import run_evaluate_donut
+from routers.donut_training import run_training_donut
+import utils
+router = APIRouter()
+def invoke_training(max_epochs, val_check_interval, warmup_steps, model_in_use):
+    """Call run_training_donut and log the processing time it reports.
+    Does nothing unless *model_in_use* is 'donut'. Always returns None.
+    """
+    if model_in_use != 'donut':
+        return
+    elapsed = run_training_donut(max_epochs, val_check_interval, warmup_steps)
+    stats_row = [elapsed, settings.model]
+    utils.log_stats(settings.training_stats_file, stats_row)
+    print(f"Processing time training: {elapsed:.2f} seconds")
+@router.post("/training")
+async def run_training(background_tasks: BackgroundTasks,
+                       max_epochs: int = Form(30),
+                       val_check_interval: float = Form(0.4),
+                       warmup_steps: int = Form(81),
+                       model_in_use: str = Form('donut')):
+    """Queue invoke_training as a background task and reply immediately."""
+    training_args = (max_epochs, val_check_interval, warmup_steps, model_in_use)
+    background_tasks.add_task(invoke_training, *training_args)
+    response = {"message": "Dnote Donut ML training started in the background"}
+    return response
+def invoke_evaluate(model_in_use):
+    """Call run_evaluate_donut and log the scores, accuracy and time it reports.
+    Does nothing unless *model_in_use* is 'donut'. Always returns None.
+    """
+    if model_in_use != 'donut':
+        return
+    scores, accuracy, elapsed = run_evaluate_donut()
+    stats_row = [elapsed, scores, accuracy, settings.model]
+    utils.log_stats(settings.evaluate_stats_file, stats_row)
+    print(f"Processing time evaluate: {elapsed:.2f} seconds")
+@router.post("/evaluate")
+async def run_evaluate(background_tasks: BackgroundTasks,
+                       model_in_use: str = Form('donut')):
+    """Queue invoke_evaluate as a background task and reply immediately."""
+    background_tasks.add_task(invoke_evaluate, model_in_use)
+    response = {"message": "Dnote Donut ML model evaluation started in the background"}
+    return response
+@router.get("/statistics/training")
+async def get_statistics_training():
+    """Return the parsed contents of the training stats file.
+    An empty list is returned when the file does not exist or does not
+    contain valid JSON.
+    """
+    stats_path = settings.training_stats_file
+    if not os.path.exists(stats_path):
+        return []
+    with open(stats_path, 'r') as stats_file:
+        try:
+            return json.load(stats_file)
+        except json.JSONDecodeError:
+            # An unparsable stats file is reported the same way as a missing one.
+            return []
+@router.get("/statistics/evaluate")
+async def get_statistics_evaluate():
+    """Return the parsed contents of the evaluation stats file.
+    An empty list is returned when the file does not exist or does not
+    contain valid JSON.
+    """
+    stats_path = settings.evaluate_stats_file
+    if not os.path.exists(stats_path):
+        return []
+    with open(stats_path, 'r') as stats_file:
+        try:
+            return json.load(stats_file)
+        except json.JSONDecodeError:
+            # An unparsable stats file is reported the same way as a missing one.
+            return []