BMukhtar commited on
Commit
e3a55fa
·
1 Parent(s): 29ae74e

new changes

Browse files
models/__pycache__/best_norm_ED.cpython-310.pyc CHANGED
Binary files a/models/__pycache__/best_norm_ED.cpython-310.pyc and b/models/__pycache__/best_norm_ED.cpython-310.pyc differ
 
models/best_norm_ED.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0a93dd748a84d3998efccee420e3cabdf6b1693d3411374e871bcdb8c078169
3
- size 15217067
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87583f4f46b6a5af3782992a4343c950e94967f0c2b0abe62c8f06ff1fefecd9
3
+ size 15237611
models/best_norm_ED.yaml CHANGED
@@ -1,29 +1,30 @@
1
- number: '0123456789'
2
- symbol: "!?.,:;'#()<>+-/*=%$»« "
3
  lang_char: 'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯЁабвгдежзийклмнопрстуфхцчшщъыьэюяёӘҒҚҢӨҰҮІҺәғқңөұүіһABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
4
- experiment_name: 'kz_synthtiger_v7'
5
- train_data: '../../synthtiger_kz/results/train_v7'
6
- valid_data: '../../synthtiger_kz/results/test_v7'
 
7
  manualSeed: 1111
8
  workers: 6
9
- batch_size: 96 #32
10
- num_iter: 100000
11
- valInterval: 1000
12
- saved_model: '' #'saved_models/en_filtered/iter_300000.pth'
13
  FT: False
14
  optim: False # default is Adadelta
15
- lr: 1.
16
  beta1: 0.9
17
  rho: 0.95
18
  eps: 0.00000001
19
  grad_clip: 5
20
  #Data processing
21
- select_data: 'images' # this is dataset folder in train_data
22
  batch_ratio: '1'
23
  total_data_usage_ratio: 1.0
24
- batch_max_length: 34
25
- imgH: 64
26
- imgW: 600
27
  rgb: False
28
  sensitive: True
29
  PAD: True
@@ -49,4 +50,4 @@ network_params:
49
  hidden_size: 256
50
  lang_list:
51
  - 'en'
52
- character_list: 0123456789!?.,:;'#()<>+-/*=%$»« АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯЁабвгдежзийклмнопрстуфхцчшщъыьэюяёӘҒҚҢӨҰҮІҺәғқңөұүіһABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
 
1
+ number: 0123456789
2
+ symbol: $"!#%&'()*+,-./:;<=>?@[\]^_`{|}~«»…£€¥№°
3
  lang_char: 'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯЁабвгдежзийклмнопрстуфхцчшщъыьэюяёӘҒҚҢӨҰҮІҺәғқңөұүіһABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
4
+ experiment_name: 'gen_v11_continue'
5
+ train_data: '../../synthtiger_kz/results/train_v12'
6
+ valid_data: '../../synthtiger_kz/results/test_v12/dtgr_v5'
7
+ wb: True
8
  manualSeed: 1111
9
  workers: 6
10
+ batch_size: 128 #32
11
+ num_iter: 200000
12
+ valInterval: 4000
13
+ saved_model: 'saved_models/gen_v11/best_norm_ED.pth'
14
  FT: False
15
  optim: False # default is Adadelta
16
+ lr: 0.5
17
  beta1: 0.9
18
  rho: 0.95
19
  eps: 0.00000001
20
  grad_clip: 5
21
  #Data processing
22
+ select_data: 'dtgr_v5' # this is dataset folder in train_data
23
  batch_ratio: '1'
24
  total_data_usage_ratio: 1.0
25
+ batch_max_length: 40
26
+ imgH: 48
27
+ imgW: 450
28
  rgb: False
29
  sensitive: True
30
  PAD: True
 
50
  hidden_size: 256
51
  lang_list:
52
  - 'en'
53
+ character_list: 0123456789$"!#%&'()*+,-./:;<=>?@[\]^_`{|}~«»…£€¥№° —АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯЁабвгдежзийклмнопрстуфхцчшщъыьэюяёӘҒҚҢӨҰҮІҺәғқңөұүіһABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
test.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PIL import Image
3
+ import os
4
+ import easyocr
5
+ import numpy as np
6
+ import fitz # PyMuPDF
7
+ import io
8
+ from pdf2image import convert_from_bytes
9
+ #from st_btn_group import st_btn_group
10
+ #from streamlit_option_menu import option_menu
11
+ import docx
12
+ from docx.shared import Pt
13
+ from io import BytesIO
14
+ #import streamlit.components.v1 as components
15
+ import base64
16
+
17
+ #def downloadTxt():
18
def generateTxtLink(result):
    """Build an HTML download link serving the OCR result as a plain-text file.

    Args:
        result: EasyOCR-style list of ``[bbox, text]`` entries; only the text
            (index 1) of each entry is used.

    Returns:
        An ``<a>`` tag (HTML string) whose href is a base64 ``text/plain``
        data URI of the concatenated text, one entry per line.
    """
    # BUG FIX: removed leftover debug print(result).
    # Trailing newline kept to match the original per-line concatenation.
    result_txt = "".join(para[1] + "\n" for para in result)
    result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8')
    result_txt_link = "<a class='button' href='data:text/plain;base64," + result_b64 + "' download='document.txt'>TXT</a>"
    return result_txt_link
26
+
27
def generateMultiPageTxtLink(result):
    """Build an HTML download link for multi-page OCR output as one text file.

    Args:
        result: list of page-level strings (already-joined text per page).

    Returns:
        An ``<a>`` tag (HTML string) whose href is a base64 ``text/plain``
        data URI of all pages, one page per line.
    """
    # BUG FIX: removed leftover debug print(result).
    result_txt = "".join(para + "\n" for para in result)
    result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8')
    result_txt_link = "<a class='button' href='data:text/plain;base64," + result_b64 + "' download='document.txt'>TXT</a>"
    return result_txt_link
35
+
36
def generateDocLink(result):
    """Build an HTML download link serving the OCR result as a .docx document.

    Args:
        result: EasyOCR-style list of ``[bbox, text]`` entries; each entry's
            text becomes one paragraph in the document.

    Returns:
        An ``<a>`` tag (HTML string) whose href is a base64 data URI of the
        serialized .docx file.
    """
    doc = docx.Document()
    for para in result:
        doc.add_paragraph(para[1])

    # Serialize the document into an in-memory buffer (doc.save returns None).
    target_stream = BytesIO()
    doc.save(target_stream)
    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')
    # BUG FIX: the data URI previously declared application/pdf for a .docx
    # payload; use the OOXML wordprocessing MIME type. Also dropped the unused
    # locals `stlyeCss` and `result_doc`.
    doc_link = ("<a class='button' href='data:application/vnd.openxmlformats-officedocument.wordprocessingml.document;base64,"
                + base64_doc + "' download='document.docx'>DOCX</a>")
    return doc_link
47
+
48
def generateMultiPageDocLink(pages_result):
    """Build an HTML download link for multi-page OCR output as a .docx file.

    Args:
        pages_result: list of page-level strings; each page is split on
            newlines into paragraphs and followed by a page break.

    Returns:
        An ``<a>`` tag (HTML string) whose href is a base64 data URI of the
        serialized .docx file.
    """
    doc = docx.Document()
    for page in pages_result:
        # One docx paragraph per text line of the page.
        for para in page.split("\n"):
            doc.add_paragraph(para)
        doc.add_page_break()
    # Serialize the document into an in-memory buffer (doc.save returns None).
    target_stream = BytesIO()
    doc.save(target_stream)
    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')
    # BUG FIX: the data URI previously declared application/pdf for a .docx
    # payload; use the OOXML wordprocessing MIME type.
    doc_link = ("<a class='button' href='data:application/vnd.openxmlformats-officedocument.wordprocessingml.document;base64,"
                + base64_doc + "' download='document.docx'>DOCX</a>")
    return doc_link
61
+
62
def generateButtonGroup(result):
    """Return the TXT and DOCX download links for an OCR result, newline-separated."""
    links = [generateTxtLink(result), generateDocLink(result)]
    return "\n".join(links)
66
+
67
def generateButtonGroupForPDF(pages_result):
    """Return the TXT and DOCX download links for multi-page (PDF) OCR output."""
    links = [generateMultiPageTxtLink(pages_result),
             generateMultiPageDocLink(pages_result)]
    return "\n".join(links)
72
+
73
def local_css(file_name):
    """Inject the contents of a local CSS file into the Streamlit page."""
    with open(file_name) as css_file:
        styles = css_file.read()
    st.markdown(f'<style>{styles}</style>', unsafe_allow_html=True)
76
+
77
+
78
# Directories for model weights and OCR output; create any that are missing.
models_dir = "./models"
output_dir = "./output"
dirs = [models_dir, output_dir]
for d in dirs:
    # BUG FIX: the original checked/created output_dir on every iteration,
    # so models_dir was never created when absent.
    if not os.path.exists(d):
        os.makedirs(d)

font_path = models_dir + "/Ubuntu-Regular.ttf"

# Load the custom-trained recognition model once at startup; this is the
# expensive step (weights are read into memory) and must run only once.
reader = easyocr.Reader(
    ['en'],
    gpu=True,
    recog_network='best_norm_ED',
    detect_network="craft",
    user_network_directory=models_dir,
    model_storage_directory=models_dir,
)
94
+
95
+
96
+
97
+
98
# main title
# NOTE: set_page_config must be the first Streamlit command executed on the
# page — keep it ahead of every other st.* call.
st.set_page_config(layout="wide",page_title="Қазақша OCR, суреттегі текстті тану")
# Inject shared styles (styling for the generated download-link buttons).
local_css("app.css")
#st.markdown("<a class='button' href='lenta.ru'>DOCX жүктеп ал</a>",unsafe_allow_html=True)
st.title("Сурет немесе пдф файлдан текст алу")
# subtitle
#st.markdown("## Qazaq OCR")

# Single upload widget; Streamlit enforces the allowed extensions.
uploaded_file = st.file_uploader("Өз файлыңызды осында жүктеңіз ('png', 'jpeg', 'jpg', 'pdf')",help="aaa", type=['png', 'jpeg', 'jpg', 'pdf'])

# Two-column layout: col1 shows previews, col2 shows recognised text/downloads.
col1, col2 = st.columns(2)
109
+
110
+ #def process_page(page):
111
+ # image_matrix = fitz.Matrix(fitz.Identity)
112
+ # pixmap = page.get_pixmap(matrix=image_matrix, dpi=300)
113
+ # image_data = pixmap.samples# This is a bytes object
114
+ # image = Image.from("RGB",(pixmap.width, pixmap.height),image_data)
115
+ # image = Image.from("RGB", (pixmap.width, pixmap.height), image_data)
116
+ # result = reader.readtext(np.array(image),paragraph=True)
117
+ # return image, result
118
+ import time
119
+
120
# Process at most this many pages of an uploaded PDF.
max_page = 5

def recognize_page_image(image):
    """OCR one page image and group the raw detections into paragraphs.

    Args:
        image: PIL image of a single rendered page.

    Returns:
        Tuple ``(result, elapsed)`` where ``result`` is the paragraph-level
        list from ``get_paragraph`` (``[bbox, text]`` pairs) and ``elapsed``
        is the wall-clock OCR time in seconds.
    """
    start = time.time()
    # BUG FIX: removed a leftover placeholder assignment
    # (result = [[0, "Sample 1"], [1, "Sample 2"]]) that was immediately
    # overwritten by the real OCR call.
    result = reader.readtext(np.array(image), paragraph=False)
    result = get_paragraph(result)
    end = time.time()
    return result, (end - start)
128
+
129
+
130
def process_pdf(uploaded_file):
    """OCR up to `max_page` pages of the uploaded PDF and render the results.

    Left column (col1): one preview tab per page. Right column (col2): a
    progress bar, one expander per page with the recognised text, and
    TXT/DOCX download links once all pages are processed.

    NOTE(review): the `uploaded_file` parameter is unused — the PDF is opened
    from the module-level `temp_pdf_file` path the caller writes to disk
    first; confirm before calling this from anywhere else.
    """
    pdf_document = fitz.open(temp_pdf_file)
    total_pages = len(pdf_document)
    progress_bar = col2.progress(0, text="Жүктеліп жатыр")
    button_group = col2.container()
    # clear the container
    button_group.empty()
    # Only the first `max_page` pages get a preview tab and OCR pass.
    pages = range(min(max_page,total_pages))
    tabs = col1.tabs([f"Бет {page+1}" for page in pages])
    pages_result = []  # one joined-text string per processed page
    for count, page_num in enumerate(range(min(total_pages,max_page))):
        page = pdf_document.load_page(page_num)
        # Render the page to a bitmap at 300 dpi for OCR quality.
        image_matrix = fitz.Matrix(fitz.Identity)
        pixmap = page.get_pixmap(matrix=image_matrix, dpi=300)
        image_data = pixmap.samples # This is a bytes object
        image = Image.frombytes("RGB", (pixmap.width, pixmap.height), image_data)
        # 10x downscaled copy for the preview tab; the full image goes to OCR.
        imageSmaller = image.resize((int(pixmap.width/10), int(pixmap.height/10)))
        tabs[count].image(imageSmaller)
        #buffered = BytesIO()
        #imageSmaller.save(buffered,format="JPEG")
        #col1.write(f'<h2>Бет {page_num + 1}/{total_pages}</h2>',unsafe_allow_html=True)
        #col1.write(f'<img src="data:image/png;base64, {base64.b64encode(buffered.getvalue()).decode("utf-8")}"/>',unsafe_allow_html=True)
        #col1.subheader(f'Бет {page_num + 1}/{total_pages}')
        #col1.image(imageSmaller, caption=f'Бет {page_num + 1}')
        result,time_elapsed = recognize_page_image(image)
        # Expander header: first 100 chars of the first paragraph plus timing.
        # NOTE(review): result[0] raises IndexError if OCR finds no text on
        # the page — confirm whether blank pages can occur here.
        expander = col2.expander(f'{result[0][1][:100]} ... **:orange[{time_elapsed:.3f} секундта таңылды]**')
        expander.write(f'{result[0][1]}')
        result_text = "\n\n".join([item[1] for item in result])
        pages_result.append(result_text)
        #col2.markdown(result_text)
        progress_bar.progress((count + 1) / min(total_pages,max_page),text=f'Жүктеліп жатыр {count+1}/{min(total_pages,max_page)}')

    button_group_html = generateButtonGroupForPDF(pages_result)
    button_group.write(button_group_html,unsafe_allow_html=True)
    #col1.write("</div>",unsafe_allow_html=True)
    progress_bar.progress(0.99,text=f'{min(total_pages,max_page)} бет жүктелді')
166
+
167
def get_paragraph(raw_result, x_ths=1, y_ths=0.5, mode = 'ltr'):
    """Cluster raw EasyOCR boxes into paragraphs and merge their text.

    Args:
        raw_result: list of EasyOCR detections, each ``[quad, text, ...]``
            where ``quad`` is four (x, y) corner points.
        x_ths: horizontal merge tolerance, in multiples of the group's mean
            box height.
        y_ths: vertical merge tolerance, same units.
        mode: 'ltr' or 'rtl' — reading direction used when concatenating the
            text inside each paragraph.

    Returns:
        List of ``[bounding_quad, text]`` pairs, one per paragraph, where
        ``bounding_quad`` is the axis-aligned rectangle enclosing the group.
    """
    # create basic attributes
    # Each entry: [text, min_x, max_x, min_y, max_y, height, y_center, group_id]
    # group_id == 0 means "not yet assigned to a paragraph".
    box_group = []
    for box in raw_result:
        all_x = [int(coord[0]) for coord in box[0]]
        all_y = [int(coord[1]) for coord in box[0]]
        min_x = min(all_x)
        max_x = max(all_x)
        min_y = min(all_y)
        max_y = max(all_y)
        height = max_y - min_y
        box_group.append([box[1], min_x, max_x, min_y, max_y, height, 0.5*(min_y+max_y), 0]) # last element indicates group
    # cluster boxes into paragraph
    # Greedy agglomeration: grow the current group's bounding box (padded by
    # the tolerances) until no unassigned box overlaps it, then start a new
    # group. Loop terminates because each pass either assigns a box or
    # advances current_group.
    current_group = 1
    while len([box for box in box_group if box[7]==0]) > 0:
        box_group0 = [box for box in box_group if box[7]==0] # group0 = non-group
        # new group
        if len([box for box in box_group if box[7]==current_group]) == 0:
            box_group0[0][7] = current_group # assign first box to form new group
        # try to add group
        else:
            current_box_group = [box for box in box_group if box[7]==current_group]
            mean_height = np.mean([box[5] for box in current_box_group])
            # Padded bounding box of the current group.
            min_gx = min([box[1] for box in current_box_group]) - x_ths*mean_height
            max_gx = max([box[2] for box in current_box_group]) + x_ths*mean_height
            min_gy = min([box[3] for box in current_box_group]) - y_ths*mean_height
            max_gy = max([box[4] for box in current_box_group]) + y_ths*mean_height
            add_box = False
            for box in box_group0:
                same_horizontal_level = (min_gx<=box[1]<=max_gx) or (min_gx<=box[2]<=max_gx)
                same_vertical_level = (min_gy<=box[3]<=max_gy) or (min_gy<=box[4]<=max_gy)
                if same_horizontal_level and same_vertical_level:
                    box[7] = current_group
                    add_box = True
                    break
            # cannot add more box, go to next group
            if add_box==False:
                current_group += 1
    # arrange order in paragraph
    result = []
    for i in set(box[7] for box in box_group):
        current_box_group = [box for box in box_group if box[7]==i]
        mean_height = np.mean([box[5] for box in current_box_group])
        # Unpadded bounding box of the finished group.
        min_gx = min([box[1] for box in current_box_group])
        max_gx = max([box[2] for box in current_box_group])
        min_gy = min([box[3] for box in current_box_group])
        max_gy = max([box[4] for box in current_box_group])

        text = ''
        # Repeatedly take the visually topmost line (boxes whose y-centre is
        # within 0.4*mean_height of the highest remaining centre) and, within
        # it, the next box in reading order.
        while len(current_box_group) > 0:
            highest = min([box[6] for box in current_box_group])
            candidates = [box for box in current_box_group if box[6]<highest+0.4*mean_height]
            # get the far left
            if mode == 'ltr':
                most_left = min([box[1] for box in candidates])
                for box in candidates:
                    if box[1] == most_left: best_box = box
            elif mode == 'rtl':
                most_right = max([box[2] for box in candidates])
                for box in candidates:
                    if box[2] == most_right: best_box = box
            text += ' '+best_box[0]
            current_box_group.remove(best_box)

        # text[1:] drops the leading space added by the concatenation above.
        result.append([ [[min_gx,min_gy],[max_gx,min_gy],[max_gx,max_gy],[min_gx,max_gy]], text[1:]])

    return result
234
+
235
# Entry point: route the upload to the PDF pipeline or the single-image path.
if uploaded_file is not None:
    placeholder = col2.empty()
    if uploaded_file.type == "application/pdf":
        with placeholder, st.spinner('PDF өңделуде ...'):
            # Persist the upload to disk; process_pdf reads this module-level path.
            temp_pdf_file = "./temp_pdf_file.pdf"
            with open(temp_pdf_file, "wb") as pdf_out:
                pdf_out.write(uploaded_file.read())
            process_pdf(uploaded_file)
    else:
        with placeholder, st.spinner('Сурет өңделуде ...'):
            image = Image.open(uploaded_file)
            col1.image(image)
            # OCR the whole image with EasyOCR's built-in paragraph grouping.
            ocr_result = reader.readtext(np.array(image), paragraph=True)
            joined_text = "\n\n".join(item[1] for item in ocr_result)
            # Download links first, then the recognised text below them.
            col2.write(generateButtonGroup(ocr_result), unsafe_allow_html=True)
            col2.markdown(joined_text)