Spaces:

Aekanun
/

Thai-HandWriting-to-Text

Running on Zero

File size: 5,357 Bytes

import os
import warnings
import torch
import gc
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import PeftModel
from PIL import Image
import gradio as gr
from huggingface_hub import login

# Runtime configuration: silence every warning and set CUDA_VISIBLE_DEVICES to
# an empty string so that no CUDA device is exposed to this process.
warnings.filterwarnings(action='ignore')
os.environ.update({"CUDA_VISIBLE_DEVICES": ""})

# Shared state; both names stay None until load_model_and_processor() sets them.
model = processor = None

# Authenticate against the Hugging Face Hub when a token is present in the
# environment; otherwise only print a warning and continue.
if os.environ.get('HUGGING_FACE_HUB_TOKEN') is None:
    print("คำเตือน: ไม่พบ HUGGING_FACE_HUB_TOKEN")
else:
    print("กำลังเข้าสู่ระบบ Hugging Face Hub...")
    login(token=os.environ['HUGGING_FACE_HUB_TOKEN'])

def load_model_and_processor():
    """Load the processor, the base vision model and the PEFT adapter on the CPU.

    The processor and the base model are fetched from ``base_model_path`` and
    the adapter from ``adapter_path``. The module-level ``model`` and
    ``processor`` globals are assigned together, and only after every load
    step has succeeded, so a failure never leaves them half-initialised.

    Returns:
        ``True`` when everything loaded; ``False`` when any step raised (the
        error message is printed and the globals are left untouched).
    """
    global model, processor
    print("กำลังโหลดโมเดลและ processor...")
    try:
        # Model paths
        base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
        adapter_path = "Aekanun/thai-handwriting-llm"

        # Load the processor from the base model. ``token=True`` reuses the
        # credentials stored by ``login``; it replaces the deprecated
        # ``use_auth_token`` argument.
        print("กำลังโหลด processor...")
        loaded_processor = AutoProcessor.from_pretrained(
            base_model_path,
            token=True,
        )

        # Load the base model entirely on the CPU in float32.
        # NOTE(review): trust_remote_code=True is probably unnecessary if this
        # architecture ships with transformers itself - confirm and drop it.
        print("กำลังโหลด base model...")
        base_model = AutoModelForVision2Seq.from_pretrained(
            base_model_path,
            device_map={"": "cpu"},
            torch_dtype=torch.float32,
            trust_remote_code=True,
            token=True,
            low_cpu_mem_usage=True,
            offload_folder="offload",
        )

        # Attach the fine-tuned adapter on top of the base model.
        print("กำลังโหลด adapter...")
        loaded_model = PeftModel.from_pretrained(
            base_model,
            adapter_path,
            torch_dtype=torch.float32,
            device_map={"": "cpu"},
            token=True,
            low_cpu_mem_usage=True,
        )

        # Publish both objects only now that every step has succeeded.
        model, processor = loaded_model, loaded_processor

        # Release temporaries created while loading.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print("โหลดโมเดลสำเร็จ!")
        return True
    except Exception as e:
        # Start-up boundary: report the failure and let the caller decide.
        print(f"เกิดข้อผิดพลาดในการโหลดโมเดล: {str(e)}")
        return False

def process_handwriting(image):
    """Transcribe Thai handwriting from an image; callback for the Gradio UI.

    Reads the module-level ``model`` and ``processor`` objects.

    Args:
        image: A ``PIL.Image.Image``, any object accepted by
            ``Image.fromarray``, or ``None`` when nothing was uploaded.

    Returns:
        The stripped text generated by the model. When ``image`` is ``None``
        a Thai message asking for an upload is returned instead, and when any
        step raises, a Thai error message containing the exception text.
    """
    if image is None:
        return "กรุณาอัพโหลดรูปภาพ"

    try:
        # Ensure image is in PIL format
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Create prompt
        prompt = """Transcribe the Thai handwritten text from the provided image.
Only return the transcription in Thai language."""

        # Create model inputs
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image", "image": image}
                ],
            }
        ]

        # add_generation_prompt=True ends the rendered prompt with the
        # assistant header so the model answers instead of continuing the
        # user turn.
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # The rendered template already contains the special tokens, so the
        # tokenizer must not add them a second time.
        inputs = processor(
            text=text,
            images=image,
            add_special_tokens=False,
            return_tensors="pt",
        )
        inputs = {k: v.to('cpu') for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[-1]

        # Greedy decoding without gradient tracking.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=False,
                pad_token_id=processor.tokenizer.pad_token_id,
                use_cache=True  # reuse cached key/values between decoding steps
            )

        # Clear memory after generation
        gc.collect()

        # generate() returns prompt + continuation; decode only the newly
        # generated tokens so the prompt is not echoed back to the user.
        generated_tokens = outputs[0][prompt_length:]
        transcription = processor.decode(generated_tokens, skip_special_tokens=True)
        return transcription.strip()
    except Exception as e:
        # UI boundary: surface the failure as text instead of raising.
        return f"เกิดข้อผิดพลาด: {e}"

# Application start-up: load the model first; the interface is only built
# (and, when run as a script, launched) if loading succeeded.
print("กำลังเริ่มต้นแอปพลิเคชัน...")
if not load_model_and_processor():
    print("ไม่สามารถเริ่มต้นแอปพลิเคชันได้")
else:
    # Single image input, single text output; examples are not cached.
    demo = gr.Interface(
        title="Thai Handwriting Recognition",
        description="อัพโหลดรูปภาพลายมือเขียนภาษาไทยเพื่อแปลงเป็นข้อความ",
        fn=process_handwriting,
        inputs=gr.Image(label="อัพโหลดรูปลายมือเขียนภาษาไทย", type="pil"),
        outputs=gr.Textbox(label="ข้อความที่แปลงได้"),
        examples=[[file_name] for file_name in ("example1.jpg", "example2.jpg")],
        cache_examples=False,
    )

    if __name__ == "__main__":
        # No public share link; show errors in the UI.
        demo.launch(show_error=True, share=False)