Upload textnet models

Browse files

Files changed (4) hide show

README.md +56 -0
config.json +236 -0
model.safetensors +3 -0
preprocessor_config.json +28 -0

README.md ADDED Viewed

	@@ -0,0 +1,56 @@

+---
+library_name: transformers
+---
+## TextNet-T/S/B: Efficient Text Detection Models
+### **Overview**
+TextNet is a lightweight and efficient architecture designed specifically for text detection, offering superior performance compared to traditional models like MobileNetV3. With variants **TextNet-T**, **TextNet-S**, and **TextNet-B** (6.8M, 8.0M, and 8.9M parameters respectively), it achieves an excellent balance between accuracy and inference speed.
+### **Performance**
+TextNet achieves state-of-the-art results in text detection, outperforming hand-crafted models in both accuracy and speed. Its architecture is highly efficient, making it ideal for GPU-based applications.
+### How to use
+#### Transformers
+```bash
+pip install transformers
+```
+```python
+import torch
+import requests
+from PIL import Image
+from transformers import AutoImageProcessor, AutoBackbone
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+processor = AutoImageProcessor.from_pretrained("jadechoghari/textnet-base")
+model = AutoBackbone.from_pretrained("jadechoghari/textnet-base")
+inputs = processor(image, return_tensors="pt")
+with torch.no_grad():
+  outputs = model(**inputs)
+```
+### **Training**
+We compare TextNet with representative hand-crafted backbones,
+such as ResNets and VGG16. For a fair comparison,
+all models are first pre-trained on the IC17-MLT dataset and then
+fine-tuned on Total-Text. The proposed
+TextNet models achieve a better trade-off between accuracy
+and inference speed than previous hand-crafted models by a
+significant margin. Notably, TextNet-T, -S, and -B
+have only 6.8M, 8.0M, and 8.9M parameters respectively,
+making them more parameter-efficient than ResNets and VGG16.
+These results demonstrate that TextNet models are effective for
+text detection on GPU devices.
+### **Applications**
+Perfect for real-world text detection tasks, including:
+- Natural scene text detection
+- Multi-lingual and multi-oriented text detection
+- Document text region analysis
+### **Contribution**
+This model was contributed by [Raghavan](https://huggingface.co/Raghavan),
+[jadechoghari](https://huggingface.co/jadechoghari)
+and [nielsr](https://huggingface.co/nielsr).

config.json ADDED Viewed

	@@ -0,0 +1,236 @@

+{
+  "architectures": [
+    "TextNetBackbone"
+  ],
+  "batch_norm_eps": 1e-05,
+  "conv_layer_kernel_sizes": [
+    [
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        1
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        1
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        1,
+        3
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        3
+      ]
+    ],
+    [
+      [
+        3,
+        3
+      ],
+      [
+        1,
+        3
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        1
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        1
+      ],
+      [
+        3,
+        1
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        3
+      ]
+    ],
+    [
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        1,
+        3
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        1
+      ],
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        1
+      ]
+    ],
+    [
+      [
+        3,
+        3
+      ],
+      [
+        1,
+        3
+      ],
+      [
+        3,
+        1
+      ],
+      [
+        3,
+        1
+      ],
+      [
+        1,
+        3
+      ]
+    ]
+  ],
+  "conv_layer_strides": [
+    [
+      1,
+      2,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1
+    ],
+    [
+      2,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1
+    ],
+    [
+      2,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1,
+      1
+    ],
+    [
+      2,
+      1,
+      1,
+      1,
+      1
+    ]
+  ],
+  "depths": [
+    10,
+    10,
+    8,
+    5
+  ],
+  "hidden_sizes": [
+    64,
+    64,
+    128,
+    256,
+    512
+  ],
+  "image_size": [
+    640,
+    640
+  ],
+  "initializer_range": 0.02,
+  "model_type": "textnet",
+  "out_features": [
+    "stage1",
+    "stage2",
+    "stage3",
+    "stage4"
+  ],
+  "out_indices": [
+    1,
+    2,
+    3,
+    4
+  ],
+  "stage_names": [
+    "stem",
+    "stage1",
+    "stage2",
+    "stage3",
+    "stage4"
+  ],
+  "stem_act_func": "relu",
+  "stem_kernel_size": 3,
+  "stem_num_channels": 3,
+  "stem_out_channels": 64,
+  "stem_stride": 2,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.0.dev0"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d414e7a89a7709dbc14de450ad52dadc9796ff40b9b74540066132a4410fe724
+size 54291592

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": false,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "TextNetImageProcessor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 640
+  },
+  "size_divisor": 32
+}