syubraj committed
Commit 134de30
1 Parent(s): f7cf73e

Create app.py

Files changed (1)
  1. app.py +73 -0
app.py ADDED
@@ -0,0 +1,73 @@
+ import gradio as gr
+ from transformers import VisionEncoderDecoderModel, TrOCRProcessor, AutoTokenizer, ViTFeatureExtractor
+ from PIL import Image
+ import torch
+
+ def preprocess_image(image):
+     # Resize while maintaining aspect ratio
+     target_size = (224, 224)
+     original_size = image.size
+
+     # Calculate the new size while maintaining aspect ratio
+     aspect_ratio = original_size[0] / original_size[1]
+     if aspect_ratio > 1:  # Width is greater than height
+         new_width = target_size[0]
+         new_height = int(target_size[0] / aspect_ratio)
+     else:  # Height is greater than or equal to width
+         new_height = target_size[1]
+         new_width = int(target_size[1] * aspect_ratio)
+
+     # Resize the image
+     resized_img = image.resize((new_width, new_height))
+
+     # Calculate padding values
+     padding_width = target_size[0] - new_width
+     padding_height = target_size[1] - new_height
+
+     # Apply padding to center the resized image
+     pad_left = padding_width // 2
+     pad_top = padding_height // 2
+     pad_image = Image.new('RGB', target_size, (255, 255, 255))  # White background
+     pad_image.paste(resized_img, (pad_left, pad_top))
+     return pad_image
+
+
+ # Load model directly
+ from transformers import AutoTokenizer, AutoModel, ViTFeatureExtractor, TrOCRProcessor, VisionEncoderDecoderModel
+
+ tokenizer = AutoTokenizer.from_pretrained("syubraj/TrOCR_Nepali")
+ model1 = VisionEncoderDecoderModel.from_pretrained("syubraj/TrOCR_Nepali")
+ feature_extractor1 = ViTFeatureExtractor.from_pretrained("syubraj/TrOCR_Nepali")
+
+ processor1 = TrOCRProcessor(feature_extractor=feature_extractor1, tokenizer=tokenizer)
+
+
+ # tokenizer = AutoTokenizer.from_pretrained("paudelanil/trocr-devanagari")
+ # model = VisionEncoderDecoderModel.from_pretrained("paudelanil/trocr-devanagari")
+ # feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ model1.to(device)
+ def predict(image):
+     # Preprocess the image
+     image = Image.open(image).convert("RGB")
+     image = preprocess_image(image)
+     pixel_values = processor1(image, return_tensors="pt").pixel_values.to(device)
+
+     # Generate text from the image
+     generated_ids = model1.generate(pixel_values)
+     generated_text = processor1.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+     return generated_text
+
+ # Create the Gradio interface
+ interface = gr.Interface(
+     fn=predict,
+     inputs=gr.Image(type="filepath"),
+     outputs="text",
+     title="Devanagari OCR with TrOCR",
+     description="Upload an image with Devanagari script and get the text prediction using a pre-trained Vision-Text model."
+ )
+
+ # Launch the interface
+ interface.launch(share=True)
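
For reference, a minimal sketch of exercising the same syubraj/TrOCR_Nepali checkpoint without the Gradio UI, e.g. as a quick smoke test before launching the app. The image path "sample_devanagari.png" is a hypothetical placeholder and is not part of this commit.

# Standalone smoke test for the syubraj/TrOCR_Nepali checkpoint (no Gradio).
# NOTE: "sample_devanagari.png" is a hypothetical test image, not part of this repo.
import torch
from PIL import Image
from transformers import AutoTokenizer, ViTFeatureExtractor, TrOCRProcessor, VisionEncoderDecoderModel

tokenizer = AutoTokenizer.from_pretrained("syubraj/TrOCR_Nepali")
feature_extractor = ViTFeatureExtractor.from_pretrained("syubraj/TrOCR_Nepali")
processor = TrOCRProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
model = VisionEncoderDecoderModel.from_pretrained("syubraj/TrOCR_Nepali")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

image = Image.open("sample_devanagari.png").convert("RGB")
pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)
generated_ids = model.generate(pixel_values)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])

Running "python app.py" starts the Gradio demo on a local URL; with share=True, Gradio additionally prints a temporary public link.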