# NOTE: removed Hugging Face web-page scraper residue (UI labels, commit hashes,
# and a line-number gutter) that was not part of the original Python source.
# utils.py
import logging
import os
import tempfile

import torch
from PIL import Image, ImageEnhance, ImageFilter
from transformers import AutoModel, AutoTokenizer, BertTokenizer
logger = logging.getLogger(__name__)
class OCRModel:
    """Singleton wrapper around the stepfun-ai/GOT-OCR2_0 model.

    Loads the tokenizer and model once (on first construction) and exposes
    ``process_image`` to run OCR on an in-memory image stream.
    """

    # Cached singleton instance (created lazily by __new__).
    _instance = None

    def __new__(cls):
        # Create and initialize the single shared instance on first use.
        # NOTE(review): not thread-safe — two threads constructing OCRModel
        # concurrently could both run initialize(); confirm single-threaded
        # startup or add a lock if needed.
        if cls._instance is None:
            cls._instance = super(OCRModel, cls).__new__(cls)
            cls._instance.initialize()
        return cls._instance

    def initialize(self):
        """Load tokenizer and model weights.

        Raises:
            Exception: re-raised after logging if loading fails.
        """
        try:
            logger.info("Initializing OCR model...")
            # Try the standard tokenizer first; some environments fail to
            # resolve the remote-code tokenizer, so fall back to BertTokenizer.
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(
                    'stepfun-ai/GOT-OCR2_0',
                    trust_remote_code=True,
                    use_fast=False
                )
            except Exception as e:
                logger.warning(
                    "Standard tokenizer failed, trying BertTokenizer: %s", e
                )
                self.tokenizer = BertTokenizer.from_pretrained(
                    'stepfun-ai/GOT-OCR2_0',
                    trust_remote_code=True
                )
            self.model = AutoModel.from_pretrained(
                'stepfun-ai/GOT-OCR2_0',
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                device_map='auto',
                use_safetensors=True
            )
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            # NOTE(review): device_map='auto' already places weights via
            # accelerate; the extra .to(self.device) may be redundant (or
            # conflict on multi-device setups) — confirm before removing.
            self.model = self.model.eval().to(self.device)
            logger.info("Model initialization completed successfully")
        except Exception as e:
            logger.error("Error initializing model: %s", e, exc_info=True)
            raise

    def preprocess_image(self, image):
        """Image preprocessing to improve text recognition quality.

        Args:
            image: a PIL Image (any mode; converted to RGB).

        Returns:
            A new PIL Image with contrast, sharpness, and brightness
            enhanced and a smoothing filter applied.

        Raises:
            Exception: re-raised after logging if any PIL step fails.
        """
        try:
            if image.mode != 'RGB':
                image = image.convert('RGB')
            # Boost contrast, sharpness, and brightness to make glyphs
            # stand out; factors > 1.0 strengthen the effect.
            image = ImageEnhance.Contrast(image).enhance(1.5)
            image = ImageEnhance.Sharpness(image).enhance(1.5)
            image = ImageEnhance.Brightness(image).enhance(1.2)
            # Mild smoothing to reduce noise introduced by sharpening.
            image = image.filter(ImageFilter.SMOOTH)
            return image
        except Exception as e:
            logger.error("Error in image preprocessing: %s", e, exc_info=True)
            raise

    def process_image(self, image_stream):
        """Run OCR on an image provided as a binary stream (e.g. BytesIO).

        Args:
            image_stream: a seekable binary stream containing image data.

        Returns:
            The extracted text (stripped), or an ``"Error processing
            image: ..."`` string if processing fails.
        """
        try:
            logger.info("Starting image processing")
            # Reset the stream pointer (callers may have read it already).
            image_stream.seek(0)
            image = Image.open(image_stream).convert('RGB')
            processed_image = self.preprocess_image(image)
            # The model API requires a file path, so save to a uniquely
            # named temp file. (The previous hard-coded "temp_image.jpg"
            # was a race condition under concurrent requests.)
            fd, temp_image_path = tempfile.mkstemp(suffix=".jpg")
            os.close(fd)  # we only need the path; PIL reopens the file
            try:
                processed_image.save(temp_image_path)
                result = self.model.chat(
                    self.tokenizer, temp_image_path, ocr_type='format'
                )
                logger.info("Successfully extracted text: %s...", result[:100])
                return result.strip()
            except Exception as e:
                logger.error("Error in OCR processing: %s", e, exc_info=True)
                raise
            finally:
                # Single cleanup path replaces the duplicated os.remove calls.
                if os.path.exists(temp_image_path):
                    os.remove(temp_image_path)
        except Exception as e:
            logger.error("Error in image processing: %s", e, exc_info=True)
            return f"Error processing image: {str(e)}"