AI-SV
/

document-ocr-ai

Model card Files Files and versions Community

File size: 5,215 Bytes

74f6a97

import os
import json
import tempfile
from fastapi import FastAPI, UploadFile, File, HTTPException
from paddleocr import PPStructure
import logging
import paddle

# Configure root logging at INFO level and create this module's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application object; the route decorators in this file register
# their endpoints on it
app = FastAPI()

# Shared OCR engine instance. Stays None until init_ocr_engine() builds it
# on first use, after which the same instance is reused.
ocr_engine = None

# Function to initialize the PaddleOCR engine based on GPU availability
def init_ocr_engine():
    """Return the shared PPStructure engine, building it on the first call.

    The engine is stored in the module-level ``ocr_engine`` variable, so
    later calls return the already-built instance without re-checking the
    GPU or constructing anything.

    Returns:
        The PPStructure instance held in ``ocr_engine``.
    """
    global ocr_engine

    # Fast path: the engine already exists, nothing to do.
    if ocr_engine is not None:
        return ocr_engine

    gpu_enabled = is_gpu_available()
    logger.info(
        "NVIDIA GPU detected, running PaddleOCR on GPU."
        if gpu_enabled
        else "No GPU detected, running PaddleOCR on CPU."
    )

    # Options passed to PPStructure; use_gpu follows the detection above.
    engine_options = {
        'table': True,
        'ocr': True,
        'show_log': True,
        'layout_score_threshold': 0.1,
        'structure_version': 'PP-StructureV2',
        'use_gpu': gpu_enabled,
    }
    ocr_engine = PPStructure(**engine_options)
    return ocr_engine

# Function to check for GPU availability using Paddle
def is_gpu_available():
    """Report whether Paddle can use a CUDA device.

    Returns:
        True only when Paddle was compiled with CUDA and reports at least
        one CUDA device; otherwise a falsy value.
    """
    # Without CUDA support in the build there is no point counting devices.
    if not paddle.is_compiled_with_cuda():
        return False
    cuda_devices = paddle.device.cuda.device_count()
    return cuda_devices > 0

# Function to perform OCR and return the structured result
def perform_ocr_and_save(pdf_path, save_folder='./output'):
    """Run the shared OCR engine on a PDF and return whatever it produces.

    Despite the name, nothing is written to disk here.

    Args:
        pdf_path: Path of the PDF file handed directly to the OCR engine.
        save_folder: Currently unused; kept so existing callers keep working.

    Returns:
        The engine's result for ``pdf_path``. A falsy result is logged as an
        error and still returned unchanged.
    """
    engine = init_ocr_engine()
    # The engine receives the PDF path as-is; no pre-processing happens here.
    pages = engine(pdf_path)

    if pages:
        return pages

    logger.error(f"OCR failed for {pdf_path}")
    return pages

# Function to format results to strings and sort them
def format_to_strings_and_sort(results):
    """Flatten per-page OCR regions into one list of records, sorted top-down.

    Args:
        results: Iterable of pages; each page is an iterable of region dicts
            with the keys ``'type'``, ``'bbox'`` and ``'res'``. For a
            ``'table'`` region ``res`` must provide ``'html'``; for any other
            region ``res`` is iterated and every item must provide ``'text'``
            and ``'confidence'``.

    Returns:
        A list of dicts ordered by page number (1-based) and then by
        ``bbox[1]``. Table regions yield one record carrying ``'html'``;
        other regions yield one record per item of ``res`` carrying
        ``'text'`` and ``'confidence'``.
    """
    logger.info("Formatting and sorting OCR results.")
    records = []

    for page_num, regions in enumerate(results, start=1):
        for region in regions:
            kind = region['type']
            box = region['bbox']
            payload = region['res']

            if kind == 'table':
                # A table contributes a single record holding its HTML.
                records.append({
                    'page_num': page_num,
                    'type': kind,
                    'html': payload['html'],
                    'bbox': box,
                    'y_coordinate': box[1],  # bbox y1 drives the ordering
                })
                continue

            # Every recognised line of a non-table region becomes its own
            # record; all of them share the region's bounding box.
            records.extend(
                {
                    'page_num': page_num,
                    'type': kind,
                    'text': line['text'],
                    'confidence': line['confidence'],
                    'bbox': box,
                    'y_coordinate': box[1],  # bbox y1 drives the ordering
                }
                for line in payload
            )

    # Stable sort: records with equal (page, y) keep their insertion order.
    records.sort(key=lambda rec: (rec['page_num'], rec['y_coordinate']))

    logger.info("Sorting completed.")
    return records

# Function to save results to a JSON file
def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as JSON indented by four spaces.

    Args:
        data: JSON-serialisable object to store.
        filename: Destination path; it is opened in write mode, so existing
            content is replaced.
    """
    logger.info(f"Saving sorted results to {filename}.")
    with open(filename, mode="w") as sink:
        json.dump(data, fp=sink, indent=4)

# FastAPI endpoint to process uploaded PDF
@app.post("/process-ocr/")
async def process_ocr(file: UploadFile = File(...)):
    """Run OCR on an uploaded PDF and return the recognised regions, sorted by page and vertical position."""
    # Validate before entering the try block so a rejected upload reaches the
    # client as a 400 rather than being caught by the generic handler below
    # and reported as a 500.
    if file.content_type != "application/pdf":
        logger.warning(f"Invalid file type uploaded: {file.content_type}")
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF file.")

    # Stays None until the temporary file exists, so the cleanup in `finally`
    # never reads an unassigned name when an early step fails.
    temp_file_path = None

    try:
        # Create a temporary file to store the uploaded PDF. delete=False keeps
        # it on disk after the `with` closes it so the OCR engine can open it
        # by path; it is removed explicitly in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file_path = temp_file.name
            contents = await file.read()
            temp_file.write(contents)
        logger.info(f"Temporary file created at: {temp_file_path}")

        # NOTE: this is a synchronous call made from inside an async handler.
        result = perform_ocr_and_save(temp_file_path)

        # perform_ocr_and_save reports failure by logging and returning the
        # falsy result it received, so test truthiness rather than `is None`.
        if not result:
            raise HTTPException(status_code=500, detail="OCR processing failed. Check the input file.")

        # Sort and format the results
        result_json = format_to_strings_and_sort(result)

        # Optionally, save the result JSON to a file (for debugging)
        save_to_json(result_json, 'result_json.json')

        # Return sorted result as JSON
        return result_json

    except HTTPException:
        # Deliberate HTTP errors raised above keep their status and detail.
        raise

    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception(f"An error occurred during OCR processing: {e}")
        raise HTTPException(status_code=500, detail="An error occurred during OCR processing.") from e

    finally:
        # Clean up the temporary file, if one was created
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
            logger.info(f"Temporary file {temp_file_path} deleted.")

# Endpoint to check if GPU is available
@app.get("/check-gpu/")
def check_gpu():
    # Normalise the detection result to a plain bool so the response always
    # carries a literal True or False.
    has_gpu = bool(is_gpu_available())
    message = (
        "NVIDIA GPU is available and will be used."
        if has_gpu
        else "NVIDIA GPU is not available, using CPU instead."
    )
    return {"gpu_available": has_gpu, "message": message}