File size: 5,215 Bytes
74f6a97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import os
import json
import tempfile
from fastapi import FastAPI, UploadFile, File, HTTPException
from paddleocr import PPStructure
import logging
import paddle

# Configure root logging at INFO level and create this module's logger,
# which every function below uses for progress and error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application instance; the route decorators in this module
# register their endpoints on it.
app = FastAPI()

# Cached OCR engine. Starts as None and is created on first use by
# init_ocr_engine(), which returns the same instance on later calls.
ocr_engine = None

# Lazily build (once) and return the shared PaddleOCR structure engine
def init_ocr_engine():
    """Return the shared PPStructure engine, creating it on the first call.

    The engine is stored in the module-level ``ocr_engine`` global so that
    later calls reuse it. Whether the GPU is used is decided by
    ``is_gpu_available()`` at creation time and logged once.

    Returns:
        The ``PPStructure`` instance held in ``ocr_engine``.
    """
    global ocr_engine

    # Already built by an earlier call: hand back the cached instance.
    if ocr_engine is not None:
        return ocr_engine

    gpu_enabled = is_gpu_available()
    device_message = (
        "NVIDIA GPU detected, running PaddleOCR on GPU."
        if gpu_enabled
        else "No GPU detected, running PaddleOCR on CPU."
    )
    logger.info(device_message)

    # Engine configuration; use_gpu mirrors the detection result above.
    engine_options = {
        "table": True,
        "ocr": True,
        "show_log": True,
        "layout_score_threshold": 0.1,
        "structure_version": 'PP-StructureV2',
        "use_gpu": gpu_enabled,
    }
    ocr_engine = PPStructure(**engine_options)
    return ocr_engine

# Report whether Paddle can run on a CUDA device
def is_gpu_available():
    """Tell whether a CUDA device is usable by the installed Paddle build.

    Returns:
        A truthy value only when ``paddle.is_compiled_with_cuda()`` is
        truthy and ``paddle.device.cuda.device_count()`` is greater than 0.
    """
    built_with_cuda = paddle.is_compiled_with_cuda()
    # Short-circuit: the device count is only queried for CUDA builds.
    return built_with_cuda and paddle.device.cuda.device_count() > 0

# Run the shared OCR engine on a PDF and hand back its raw result
def perform_ocr_and_save(pdf_path, save_folder='./output'):
    """Run the OCR engine on ``pdf_path`` and return whatever it produces.

    Args:
        pdf_path: Path of the PDF; passed to the engine unchanged.
        save_folder: Not used by this function; nothing is written to disk
            here.

    Returns:
        The value returned by the engine call. A falsy result is logged as
        an error but is still returned to the caller.
    """
    engine = init_ocr_engine()
    ocr_output = engine(pdf_path)

    if ocr_output:
        return ocr_output

    # Falsy output: record the failure, then return it as-is.
    logger.error(f"OCR failed for {pdf_path}")
    return ocr_output

# Function to flatten per-page OCR regions into records and sort them
def format_to_strings_and_sort(results):
    """Flatten per-page OCR regions into records sorted top-to-bottom.

    Args:
        results: Iterable of pages. Each page is an iterable of region
            dicts with ``'type'``, ``'bbox'`` and ``'res'`` keys. For a
            ``'table'`` region ``res`` is a mapping with an ``'html'`` key;
            for every other type it is an iterable of mappings with
            ``'text'`` and ``'confidence'`` keys. A page that is ``None``
            or empty contributes no records.

    Returns:
        A new list of dicts ordered by ``page_num`` (1-based page index)
        and then by ``y_coordinate`` (``bbox[1]``). Records that tie on
        both keys keep their original relative order. Table records carry
        ``html``; all other records carry ``text`` and ``confidence``.
    """
    logger.info("Formatting and sorting OCR results.")
    formatted_data = []

    for page_num, elements in enumerate(results, start=1):
        # Treat a missing page (None) the same as a page with no regions.
        for element in elements or ():
            # Named region_type so the builtin `type` is not shadowed.
            region_type = element['type']
            bbox = element['bbox']
            responses = element['res']
            # y1 of the region box is the sort key within a page; it is the
            # same for every response of the region, so read it once.
            y_coordinate = bbox[1]

            if region_type == 'table':
                formatted_data.append({
                    'page_num': page_num,
                    'type': region_type,
                    'html': responses['html'],
                    'bbox': bbox,
                    'y_coordinate': y_coordinate,
                })
                continue

            # Non-table regions yield one record per recognized text line.
            formatted_data.extend(
                {
                    'page_num': page_num,
                    'type': region_type,
                    'text': response['text'],
                    'confidence': response['confidence'],
                    'bbox': bbox,
                    'y_coordinate': y_coordinate,
                }
                for response in responses
            )

    sorted_data = sorted(
        formatted_data,
        key=lambda record: (record['page_num'], record['y_coordinate']),
    )

    logger.info("Sorting completed.")
    return sorted_data

# Dump a JSON-serializable object to a file, indented for readability
def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as JSON with a 4-space indent.

    Args:
        data: Object passed to ``json.dump``.
        filename: Path of the file to create or overwrite.
    """
    logger.info(f"Saving sorted results to {filename}.")
    with open(filename, mode="w") as out_handle:
        json.dump(data, fp=out_handle, indent=4)

# FastAPI endpoint to process uploaded PDF
@app.post("/process-ocr/")
async def process_ocr(file: UploadFile = File(...)):
    """Run OCR on an uploaded PDF and return the flattened, sorted regions.

    The upload is written to a temporary ``.pdf`` file, processed with
    ``perform_ocr_and_save``, reshaped with ``format_to_strings_and_sort``
    and also dumped to ``result_json.json`` for debugging. The temporary
    file is removed whether or not processing succeeds.

    Args:
        file: The uploaded file; its content type must be
            ``application/pdf``.

    Returns:
        The list produced by ``format_to_strings_and_sort``.

    Raises:
        HTTPException: 400 when the upload is not a PDF; 500 when the OCR
            step returns ``None`` or any unexpected error occurs.
    """
    # Bound before the try block so the cleanup in `finally` never touches
    # an unassigned name when validation fails before the file exists.
    temp_file_path = None

    try:
        # Validate file type
        if file.content_type != "application/pdf":
            logger.warning(f"Invalid file type uploaded: {file.content_type}")
            raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF file.")

        # Create a temporary file to store the uploaded PDF
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            # Record the path first: delete=False means the file already
            # exists on disk and must be cleaned up even if the read or
            # write below fails.
            temp_file_path = temp_file.name
            contents = await file.read()
            temp_file.write(contents)
            logger.info(f"Temporary file created at: {temp_file_path}")

        # Perform OCR on the stored PDF
        result = perform_ocr_and_save(temp_file_path)

        if result is None:
            raise HTTPException(status_code=500, detail="OCR processing failed. Check the input file.")

        # Sort and format the results
        result_json = format_to_strings_and_sort(result)

        # Optionally, save the result JSON to a file (for debugging)
        save_to_json(result_json, 'result_json.json')

        # Return sorted result as JSON
        return result_json

    except HTTPException:
        # Deliberate HTTP errors (such as the 400 above) must reach the
        # client unchanged instead of being rewritten into a generic 500.
        raise

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"An error occurred during OCR processing: {e}")
        raise HTTPException(status_code=500, detail="An error occurred during OCR processing.") from e

    finally:
        # Clean up the temporary file, if one was created
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
            logger.info(f"Temporary file {temp_file_path} deleted.")

# Endpoint reporting whether OCR will run on GPU or CPU
@app.get("/check-gpu/")
def check_gpu():
    """Report GPU availability as determined by ``is_gpu_available()``.

    Returns:
        A dict with a boolean ``gpu_available`` flag and a human-readable
        ``message`` describing which device will be used.
    """
    if not is_gpu_available():
        return {"gpu_available": False, "message": "NVIDIA GPU is not available, using CPU instead."}

    return {"gpu_available": True, "message": "NVIDIA GPU is available and will be used."}