paveleremin (docparser) committed
Commit d07d3c4 · 0 Parent(s)

Duplicate from docparser/Text_Captcha_breaker

Co-authored-by: docparserai <[email protected]>

Files changed (24)
  1. .gitattributes +35 -0
  2. 000679.png +0 -0
  3. 000HU.png +0 -0
  4. 00Uga.png.jpg +0 -0
  5. 00bAQwhAZU.jpg +0 -0
  6. 00h57kYf.jpg +0 -0
  7. 0EoHdtVb.png +0 -0
  8. 0JS21.png +0 -0
  9. 0p98z.png +0 -0
  10. 10010.png +0 -0
  11. 1014.jpg +0 -0
  12. 1017.png +0 -0
  13. 11JW29.png +0 -0
  14. 2A5Z.png +0 -0
  15. 2a2p4.png +0 -0
  16. 2a8486.jpg +0 -0
  17. 2cxfr.png +0 -0
  18. 2nbcx.png +0 -0
  19. 8000.png +0 -0
  20. ACKIO.png +0 -0
  21. README.md +14 -0
  22. app.py +77 -0
  23. requirements.txt +5 -0
  24. tokenizer_base.py +132 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
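
These rules route binary artifacts (model weights, archives, serialized tensors) through Git LFS rather than plain Git; for instance, the captcha.onnx model that app.py pulls below matches the *.onnx rule. For a quick local check of which rule a filename would hit, a throwaway sketch along these lines works (the patterns list is a hand-picked subset, and fnmatch only approximates .gitattributes glob semantics):

from fnmatch import fnmatch

# Hand-picked subset of the LFS rules above; fnmatch's shell-style globs
# approximate .gitattributes matching well enough for a sanity check.
patterns = ["*.onnx", "*.pt", "*.pth", "*.safetensors", "*.zip"]

for name in ["captcha.onnx", "8000.png", "weights.safetensors"]:
    hits = [p for p in patterns if fnmatch(name, p)]
    print(name, "->", hits or "not LFS-tracked")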
000679.png ADDED
000HU.png ADDED
00Uga.png.jpg ADDED
00bAQwhAZU.jpg ADDED
00h57kYf.jpg ADDED
0EoHdtVb.png ADDED
0JS21.png ADDED
0p98z.png ADDED
10010.png ADDED
1014.jpg ADDED
1017.png ADDED
11JW29.png ADDED
2A5Z.png ADDED
2a2p4.png ADDED
2a8486.jpg ADDED
2cxfr.png ADDED
2nbcx.png ADDED
8000.png ADDED
ACKIO.png ADDED
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Text Captcha Breaker
+ emoji: 🏃
+ colorFrom: indigo
+ colorTo: gray
+ sdk: gradio
+ sdk_version: 3.37.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ duplicated_from: docparser/Text_Captcha_breaker
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,77 @@
+ import torch
+ import onnx
+ import onnxruntime as rt
+ from torchvision import transforms as T
+ from PIL import Image
+ from tokenizer_base import Tokenizer
+ import pathlib
+ import os
+ import gradio as gr
+ from huggingface_hub import Repository
+
+ repo = Repository(
+     local_dir="secret_models",
+     repo_type="model",
+     clone_from="docparser/captcha",
+     token=True
+ )
+ repo.git_pull()
+
+ cwd = pathlib.Path(__file__).parent.resolve()
+ model_file = os.path.join(cwd, "secret_models", "captcha.onnx")
+ img_size = (32, 128)
+ charset = r"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
+ tokenizer_base = Tokenizer(charset)
+
+ def get_transform(img_size):
+     transforms = []
+     transforms.extend([
+         T.Resize(img_size, T.InterpolationMode.BICUBIC),
+         T.ToTensor(),
+         T.Normalize(0.5, 0.5)
+     ])
+     return T.Compose(transforms)
+
+ def to_numpy(tensor):
+     return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+ def initialize_model(model_file):
+     transform = get_transform(img_size)
+     # Onnx model loading
+     onnx_model = onnx.load(model_file)
+     onnx.checker.check_model(onnx_model)
+     ort_session = rt.InferenceSession(model_file)
+     return transform, ort_session
+
+ def get_text(img_org):
+     # img_org = Image.open(image_path)
+     # Preprocess. Model expects a batch of images with shape: (B, C, H, W)
+     x = transform(img_org.convert('RGB')).unsqueeze(0)
+
+     # compute ONNX Runtime output prediction
+     ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)}
+     logits = ort_session.run(None, ort_inputs)[0]
+     probs = torch.tensor(logits).softmax(-1)
+     preds, probs = tokenizer_base.decode(probs)
+     preds = preds[0]
+     print(preds)
+     return preds
+
+ transform, ort_session = initialize_model(model_file=model_file)
+
+ gr.Interface(
+     get_text,
+     inputs=gr.Image(type="pil"),
+     outputs=gr.outputs.Textbox(),
+     title="Text Captcha Reader",
+     examples=["8000.png", "11JW29.png", "2a8486.jpg", "2nbcx.png",
+               "000679.png", "000HU.png", "00Uga.png.jpg", "00bAQwhAZU.jpg",
+               "00h57kYf.jpg", "0EoHdtVb.png", "0JS21.png", "0p98z.png", "10010.png"]
+ ).launch()
+
+ # if __name__ == "__main__":
+ #     image_path = "8000.png"
+ #     preds, probs = get_text(image_path)
+ #     print(preds[0])
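
For reference, the same pipeline can be exercised without the Gradio UI. A minimal sketch, assuming the ONNX model has already been fetched and saved as captcha.onnx (that path is a placeholder; 8000.png is one of the example images bundled in this commit):

# Hypothetical local run: load the ONNX session once, then decode one image.
# Assumes captcha.onnx and the charset match the ones used in app.py above.
import onnxruntime as rt
import torch
from PIL import Image
from torchvision import transforms as T
from tokenizer_base import Tokenizer

charset = r"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
tokenizer = Tokenizer(charset)
transform = T.Compose([
    T.Resize((32, 128), T.InterpolationMode.BICUBIC),
    T.ToTensor(),
    T.Normalize(0.5, 0.5),
])

session = rt.InferenceSession("captcha.onnx")  # placeholder path
x = transform(Image.open("8000.png").convert("RGB")).unsqueeze(0)  # (1, 3, 32, 128)
logits = session.run(None, {session.get_inputs()[0].name: x.numpy()})[0]
preds, probs = tokenizer.decode(torch.tensor(logits).softmax(-1))
print(preds[0])  # decoded captcha text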
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch==1.11.0
+ torchvision==0.12.0
+ onnx==1.14.0
+ onnxruntime==1.15.1
+ Pillow==10.0.0
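
To confirm the pinned stack resolved correctly in a fresh environment, a small import check may help (the expected version strings are simply the pins above):

# Sanity check that the pinned packages import with the expected versions.
import onnx, onnxruntime, torch, torchvision, PIL

for mod, want in [(torch, "1.11.0"), (torchvision, "0.12.0"),
                  (onnx, "1.14.0"), (onnxruntime, "1.15.1"), (PIL, "10.0.0")]:
    print(mod.__name__, mod.__version__, "expected", want)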
tokenizer_base.py ADDED
@@ -0,0 +1,132 @@
+ import re
+ from abc import ABC, abstractmethod
+ from itertools import groupby
+ from typing import List, Optional, Tuple
+
+ import torch
+ from torch import Tensor
+ from torch.nn.utils.rnn import pad_sequence
+
+
+ class CharsetAdapter:
+     """Transforms labels according to the target charset."""
+
+     def __init__(self, target_charset) -> None:
+         super().__init__()
+         self.charset = target_charset
+         self.lowercase_only = target_charset == target_charset.lower()
+         self.uppercase_only = target_charset == target_charset.upper()
+         # self.unsupported = f'[^{re.escape(target_charset)}]'
+
+     def __call__(self, label):
+         if self.lowercase_only:
+             label = label.lower()
+         elif self.uppercase_only:
+             label = label.upper()
+         return label
+
+
+ class BaseTokenizer(ABC):
+
+     def __init__(self, charset: str, specials_first: tuple = (), specials_last: tuple = ()) -> None:
+         self._itos = specials_first + tuple(charset + '[UNK]') + specials_last
+         self._stoi = {s: i for i, s in enumerate(self._itos)}
+
+     def __len__(self):
+         return len(self._itos)
+
+     def _tok2ids(self, tokens: str) -> List[int]:
+         return [self._stoi[s] for s in tokens]
+
+     def _ids2tok(self, token_ids: List[int], join: bool = True) -> str:
+         tokens = [self._itos[i] for i in token_ids]
+         return ''.join(tokens) if join else tokens
+
+     @abstractmethod
+     def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
+         """Encode a batch of labels to a representation suitable for the model.
+
+         Args:
+             labels: List of labels. Each can be of arbitrary length.
+             device: Create tensor on this device.
+
+         Returns:
+             Batched tensor representation padded to the max label length. Shape: N, L
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
+         """Internal method which performs the necessary filtering prior to decoding."""
+         raise NotImplementedError
+
+     def decode(self, token_dists: Tensor, raw: bool = False) -> Tuple[List[str], List[Tensor]]:
+         """Decode a batch of token distributions.
+
+         Args:
+             token_dists: softmax probabilities over the token distribution. Shape: N, L, C
+             raw: return unprocessed labels (will return list of list of strings)
+
+         Returns:
+             list of string labels (arbitrary length) and
+             their corresponding sequence probabilities as a list of Tensors
+         """
+         batch_tokens = []
+         batch_probs = []
+         for dist in token_dists:
+             probs, ids = dist.max(-1)  # greedy selection
+             if not raw:
+                 probs, ids = self._filter(probs, ids)
+             tokens = self._ids2tok(ids, not raw)
+             batch_tokens.append(tokens)
+             batch_probs.append(probs)
+         return batch_tokens, batch_probs
+
+
+ class Tokenizer(BaseTokenizer):
+     BOS = '[B]'
+     EOS = '[E]'
+     PAD = '[P]'
+
+     def __init__(self, charset: str) -> None:
+         specials_first = (self.EOS,)
+         specials_last = (self.BOS, self.PAD)
+         super().__init__(charset, specials_first, specials_last)
+         self.eos_id, self.bos_id, self.pad_id = [self._stoi[s] for s in specials_first + specials_last]
+
+     def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
+         batch = [torch.as_tensor([self.bos_id] + self._tok2ids(y) + [self.eos_id], dtype=torch.long, device=device)
+                  for y in labels]
+         return pad_sequence(batch, batch_first=True, padding_value=self.pad_id)
+
+     def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
+         ids = ids.tolist()
+         try:
+             eos_idx = ids.index(self.eos_id)
+         except ValueError:
+             eos_idx = len(ids)  # Nothing to truncate.
+         # Truncate after EOS
+         ids = ids[:eos_idx]
+         probs = probs[:eos_idx + 1]  # but include prob. for EOS (if it exists)
+         return probs, ids
+
+
+ class CTCTokenizer(BaseTokenizer):
+     BLANK = '[B]'
+
+     def __init__(self, charset: str) -> None:
+         # BLANK uses index == 0 by default
+         super().__init__(charset, specials_first=(self.BLANK,))
+         self.blank_id = self._stoi[self.BLANK]
+
+     def encode(self, labels: List[str], device: Optional[torch.device] = None) -> Tensor:
+         # We use a padded representation since we don't want to use CUDNN's CTC implementation
+         batch = [torch.as_tensor(self._tok2ids(y), dtype=torch.long, device=device) for y in labels]
+         return pad_sequence(batch, batch_first=True, padding_value=self.blank_id)
+
+     def _filter(self, probs: Tensor, ids: Tensor) -> Tuple[Tensor, List[int]]:
+         # Best path decoding:
+         ids = list(zip(*groupby(ids.tolist())))[0]  # Remove duplicate tokens
+         ids = [x for x in ids if x != self.blank_id]  # Remove BLANKs
+         # `probs` is just pass-through since all positions are considered part of the path
+         return probs, ids
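
To make the encode/decode round trip concrete, a toy sketch (the charset and labels here are made up for illustration; the real app uses the full charset defined in app.py):

# Toy round trip through Tokenizer: encode labels, then decode a fake
# "perfect" distribution built from one-hot vectors over the vocabulary.
import torch
from tokenizer_base import Tokenizer

tok = Tokenizer("abc123")            # toy charset, not the app's full one
batch = tok.encode(["a1c", "bb2"])   # (2, L): BOS + char ids + EOS, PAD-filled
print(batch.shape)

# Model outputs carry no BOS, so drop the first column before faking a
# softmax output with one-hot vectors.
one_hot = torch.nn.functional.one_hot(batch[:, 1:], num_classes=len(tok)).float()
labels, probs = tok.decode(one_hot)  # _filter truncates at the first EOS
print(labels)                        # ['a1c', 'bb2']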