Spaces:
Sleeping
Sleeping
File size: 11,465 Bytes
8c12665 e5df2fe 8c12665 aebb77d 8c12665 29ae74e 8c12665 2d38cb5 8c12665 2d38cb5 29ae74e 8c12665 fb5ed1a 8c12665 e5df2fe 8c12665 2d38cb5 29ae74e e5df2fe 29ae74e fb5ed1a 29ae74e 2d38cb5 29ae74e 8c12665 2d38cb5 8c12665 e5df2fe 29ae74e e5df2fe 8c12665 2d38cb5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 |
import streamlit as st
from PIL import Image
import os
import easyocr
import numpy as np
import fitz # PyMuPDF
import io
from pdf2image import convert_from_bytes
#from st_btn_group import st_btn_group
#from streamlit_option_menu import option_menu
import docx
from io import BytesIO
#import streamlit.components.v1 as components
import base64
# Separator inserted between paragraph texts when they are joined into a single string.
line_separator = "\n\n"
def generateTxtLink(result):
    """Build an HTML download link that serves the recognised text as a .txt file.

    Args:
        result: Iterable of paragraph entries; the text of each entry is read
            from index ``1`` (index ``0`` is ignored here).

    Returns:
        str: An ``<a>`` tag whose ``href`` is a base64 ``data:`` URI holding
        every paragraph followed by a newline, encoded as UTF-8.
    """
    # One join instead of repeated ``+=``; each paragraph keeps its trailing newline.
    result_txt = "".join(para[1] + "\n" for para in result)
    result_b64 = base64.b64encode(result_txt.encode("utf-8")).decode('utf-8')
    return "<a class='button' href='data:text/plain;base64," + result_b64 + "' download='document.txt'>TXT</a>"
def generateMultiPageTxtLink(result):
    """Build an HTML download link that serves multi-page text as a .txt file.

    Args:
        result: Iterable of strings, one per page.

    Returns:
        str: An ``<a>`` tag whose ``href`` is a base64 ``data:`` URI holding
        every page string followed by a newline, encoded as UTF-8.
    """
    # One join instead of repeated ``+=``; each page keeps its trailing newline.
    result_txt = "".join(para + "\n" for para in result)
    result_b64 = base64.b64encode(result_txt.encode("utf-8")).decode('utf-8')
    return "<a class='button' href='data:text/plain;base64," + result_b64 + "' download='document.txt'>TXT</a>"
def generateDocLink(result):
    """Build an HTML download link that serves the recognised text as a .docx file.

    Args:
        result: Iterable of paragraph entries; the text of each entry is read
            from index ``1`` and becomes one paragraph of the document.

    Returns:
        str: An ``<a>`` tag whose ``href`` is a base64 ``data:`` URI holding
        the generated document.
    """
    doc = docx.Document()
    for para in result:
        doc.add_paragraph(para[1])
    # Serialise in memory; the document is never written to disk.
    target_stream = BytesIO()
    doc.save(target_stream)
    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')
    # The payload is a Word document, so declare the .docx media type
    # (the link previously claimed application/pdf).
    return (
        "<a class='button' href='data:application/vnd.openxmlformats-officedocument.wordprocessingml.document;base64,"
        + base64_doc
        + "' download='document.docx'>DOCX</a>"
    )
def generateMultiPageDocLink(pages_result):
    """Build an HTML download link that serves multi-page text as a .docx file.

    Args:
        pages_result: Iterable of strings, one per page. Each string is split
            on ``"\\n"`` and every piece becomes its own paragraph; a page
            break is inserted after each page.

    Returns:
        str: An ``<a>`` tag whose ``href`` is a base64 ``data:`` URI holding
        the generated document.
    """
    doc = docx.Document()
    for page in pages_result:
        for para in page.split("\n"):
            doc.add_paragraph(para)
        doc.add_page_break()
    # Serialise in memory; the document is never written to disk.
    target_stream = BytesIO()
    doc.save(target_stream)
    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')
    # The payload is a Word document, so declare the .docx media type
    # (the link previously claimed application/pdf).
    return (
        "<a class='button' href='data:application/vnd.openxmlformats-officedocument.wordprocessingml.document;base64,"
        + base64_doc
        + "' download='document.docx'>DOCX</a>"
    )
def generateButtonGroup(result):
    """Return the TXT and DOCX download links for one image, one per line.

    Args:
        result: Paragraph entries passed unchanged to ``generateTxtLink`` and
            ``generateDocLink``.

    Returns:
        str: The TXT link, a newline, then the DOCX link.
    """
    links = (generateTxtLink(result), generateDocLink(result))
    return "\n".join(links)
def generateButtonGroupForPDF(pages_result):
    """Return the TXT and DOCX download links for a multi-page result, one per line.

    Args:
        pages_result: Per-page strings passed unchanged to
            ``generateMultiPageTxtLink`` and ``generateMultiPageDocLink``.

    Returns:
        str: The TXT link, a newline, then the DOCX link.
    """
    links = (
        generateMultiPageTxtLink(pages_result),
        generateMultiPageDocLink(pages_result),
    )
    return "\n".join(links)
def local_css(file_name):
    """Inject the contents of a CSS file into the Streamlit page.

    Args:
        file_name: Path of the stylesheet to read.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit UTF-8 so the stylesheet is decoded the same way on every
    # platform instead of with the locale's default encoding.
    with open(file_name, encoding="utf-8") as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
# Working directories used by the app.
models_dir = "./models"
output_dir = "./output"
dirs = [models_dir, output_dir]
for d in dirs:
    # Create every directory in the list (the loop previously tested and
    # created only output_dir). exist_ok also removes the check-then-create race.
    os.makedirs(d, exist_ok=True)
font_path = models_dir + "/Ubuntu-Regular.ttf"

# Page configuration is issued before any other Streamlit call.
st.set_page_config(layout="wide",page_title="Қазақша OCR, суреттегі текстті тану")


@st.cache_resource(show_spinner=False)
def _load_reader():
    """Create the EasyOCR reader once and reuse it across Streamlit reruns."""
    # NOTE(review): the cached reader is shared by all sessions — confirm that
    # concurrent readtext() calls are acceptable for this deployment.
    return easyocr.Reader(
        ['en'],
        gpu=True,
        recog_network='best_norm_ED',
        detect_network="craft",
        user_network_directory=models_dir,
        model_storage_directory=models_dir,
    )


# Streamlit re-executes the script on every interaction; caching keeps the
# model load to a single run, as originally intended.
reader = _load_reader()

local_css("app.css")
# main title
st.title("Сурет немесе пдф файлдан текст алу")
# NOTE(review): help="aaa" looks like placeholder text — confirm the intended tooltip.
uploaded_file = st.file_uploader("Өз файлыңызды осында жүктеңіз ('png', 'jpeg', 'jpg', 'pdf')",help="aaa", type=['png', 'jpeg', 'jpg', 'pdf'])
col1, col2 = st.columns(2)
import time

# Upper bound on the number of PDF pages that are processed.
max_page = 5
def recognize_page_image(image):
    """Run OCR on one image and group the detections into paragraphs.

    Args:
        image: Image object accepted by ``np.array`` (callers pass PIL images).

    Returns:
        tuple: ``(result, elapsed)`` where ``result`` is the value returned by
        ``get_paragraph`` and ``elapsed`` is the processing time in seconds.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    detections = reader.readtext(np.array(image), batch_size=64, paragraph=False, y_ths=0, width_ths=0)
    result = get_paragraph(detections)
    elapsed = time.perf_counter() - start
    return result, elapsed
def process_pdf(uploaded_file):
    """OCR the first pages of an uploaded PDF and render the results.

    For each of the first ``max_page`` pages this renders the page at 300 dpi,
    shows a reduced thumbnail in a tab in ``col1``, runs
    ``recognize_page_image`` on the full-size render and adds an expander with
    the first recognised paragraph to ``col2``. When all pages are done the
    TXT/DOCX download links are written above the expanders.

    Args:
        uploaded_file: The uploaded PDF; its bytes are read with
            ``getvalue()`` (Streamlit's uploaded-file object is a ``BytesIO``).
    """
    # Open directly from the uploaded bytes instead of a fixed on-disk path
    # shared by every session, so concurrent uploads cannot overwrite each other.
    pdf_document = fitz.open(stream=uploaded_file.getvalue(), filetype="pdf")
    try:
        total_pages = len(pdf_document)
        page_count = min(total_pages, max_page)
        progress_bar = col2.progress(0, text="Жүктеліп жатыр")
        button_group = col2.container()
        # clear the container
        button_group.empty()
        tabs = col1.tabs([f"Бет {page+1}" for page in range(page_count)])
        pages_result = []
        for page_num in range(page_count):
            page = pdf_document.load_page(page_num)
            image_matrix = fitz.Matrix(fitz.Identity)
            pixmap = page.get_pixmap(matrix=image_matrix, dpi=300)
            image = Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
            # One-tenth-size preview; clamp to 1 px so a tiny page cannot
            # produce a zero-sized image.
            thumbnail_size = (max(1, pixmap.width // 10), max(1, pixmap.height // 10))
            tabs[page_num].image(image.resize(thumbnail_size))
            result, time_elapsed = recognize_page_image(image)
            # A page without any recognised text yields an empty result;
            # fall back to an empty string instead of raising IndexError.
            first_paragraph = result[0][1] if result else ""
            # NOTE(review): only the first paragraph is shown in the expander;
            # the full page text is available through the download links.
            expander = col2.expander(f'{first_paragraph[:100]} ... **:orange[{time_elapsed:.3f} секундта таңылды]**')
            expander.write(f'{first_paragraph}')
            result_text = line_separator.join([item[1] for item in result])
            pages_result.append(result_text)
            progress_bar.progress((page_num + 1) / page_count, text=f'Жүктеліп жатыр {page_num+1}/{page_count}')
        button_group_html = generateButtonGroupForPDF(pages_result)
        button_group.write(button_group_html, unsafe_allow_html=True)
        progress_bar.progress(0.99, text=f'{page_count} бет жүктелді')
    finally:
        # Release the document even when OCR or rendering fails.
        pdf_document.close()
class TextBox:
    """One OCR detection reduced to its axis-aligned bounding box.

    Attributes are derived from the integer-truncated corner coordinates:
    ``min_x``/``max_x``/``min_y``/``max_y`` are the box extents, ``height`` is
    ``max_y - min_y``, ``center_y`` is the vertical midpoint and ``group_id``
    starts at 0, meaning the box belongs to no group yet.
    """

    def __init__(self, text, coordinates):
        """Store ``text`` and compute the extents of ``coordinates``.

        Args:
            text: Recognised text of the detection.
            coordinates: Sequence of points; index 0 of each point is x and
                index 1 is y. Only minima and maxima are used, so the order
                of the points does not matter.
        """
        x_values = list(map(int, (point[0] for point in coordinates)))
        y_values = list(map(int, (point[1] for point in coordinates)))
        self.text = text
        self.min_x, self.max_x = min(x_values), max(x_values)
        self.min_y, self.max_y = min(y_values), max(y_values)
        self.height = self.max_y - self.min_y
        self.center_y = (self.min_y + self.max_y) * 0.5
        # 0 marks a box that has not been assigned to a group.
        self.group_id = 0

    def __repr__(self):
        return "TextBox(text={}, group_id={})".format(self.text, self.group_id)
def get_paragraph(ocr_results, horizontal_threshold=1, vertical_threshold=0.0, reading_mode='ltr'):
    """Merge raw OCR detections into paragraphs and order the text inside each.

    Args:
        ocr_results: Iterable of detections; index 0 of each detection is the
            list of corner points and index 1 is the recognised text.
        horizontal_threshold: Horizontal tolerance for joining a box to a
            group, as a multiple of the group's average box height.
        vertical_threshold: Vertical tolerance, as a multiple of the group's
            average box height.
        reading_mode: ``'ltr'`` picks the left-most box of each line first,
            ``'rtl'`` the right-most.

    Returns:
        list: One ``[bounding_box, text]`` entry per group, in group-id order.
        ``bounding_box`` holds four ``[x, y]`` corners of the group's extents
        and ``text`` is the space-joined, stripped text of the group.

    Raises:
        ValueError: If ``reading_mode`` is neither ``'ltr'`` nor ``'rtl'``.
    """
    # Reject unsupported modes up front; previously no box was selected for
    # them and the function failed later with an unbound local.
    if reading_mode not in ('ltr', 'rtl'):
        raise ValueError(f"reading_mode must be 'ltr' or 'rtl', got {reading_mode!r}")

    # Convert raw OCR results into TextBox objects
    text_boxes = [TextBox(box[1], box[0]) for box in ocr_results]
    _assign_group_ids(text_boxes, horizontal_threshold, vertical_threshold)

    # Bucket the boxes by group in a single pass instead of rescanning every
    # box once per group.
    groups = {}
    for box in text_boxes:
        groups.setdefault(box.group_id, []).append(box)

    paragraphs = []
    # Sorted ids make the paragraph order explicit rather than dependent on
    # set iteration order.
    for group_id in sorted(groups):
        boxes_in_group = groups[group_id]
        average_height = np.mean([box.height for box in boxes_in_group])
        min_group_x = min(box.min_x for box in boxes_in_group)
        max_group_x = max(box.max_x for box in boxes_in_group)
        min_group_y = min(box.min_y for box in boxes_in_group)
        max_group_y = max(box.max_y for box in boxes_in_group)
        paragraph_text = _join_in_reading_order(boxes_in_group, average_height, reading_mode)
        # Append the bounding box and text for the paragraph
        paragraphs.append([
            [[min_group_x, min_group_y], [max_group_x, min_group_y], [max_group_x, max_group_y], [min_group_x, max_group_y]],
            paragraph_text,
        ])
    return paragraphs


def _assign_group_ids(text_boxes, horizontal_threshold, vertical_threshold):
    """Set ``group_id`` on every box, growing one group at a time."""
    current_group_id = 1
    while True:
        ungrouped_boxes = [box for box in text_boxes if box.group_id == 0]
        if not ungrouped_boxes:
            return
        current_group_boxes = [box for box in text_boxes if box.group_id == current_group_id]
        if not current_group_boxes:
            # Start a new group with the first box that is still ungrouped.
            ungrouped_boxes[0].group_id = current_group_id
            continue
        average_height = np.mean([box.height for box in current_group_boxes])
        added_to_group = False
        for group_box in current_group_boxes:
            min_group_x = group_box.min_x - horizontal_threshold * average_height
            max_group_x = group_box.max_x + horizontal_threshold * average_height
            min_group_y = group_box.min_y - vertical_threshold * average_height
            max_group_y = group_box.max_y + vertical_threshold * average_height
            # ungrouped_boxes is the snapshot taken at the top of this pass;
            # a box added for an earlier group_box may be matched again,
            # which just re-assigns the same id.
            for ungrouped_box in ungrouped_boxes:
                horizontally_aligned = (
                    min_group_x <= ungrouped_box.min_x <= max_group_x
                    or min_group_x <= ungrouped_box.max_x <= max_group_x
                )
                vertically_aligned = min_group_y <= ungrouped_box.center_y <= max_group_y
                if horizontally_aligned and vertically_aligned:
                    ungrouped_box.group_id = current_group_id
                    added_to_group = True
                    break
        # If no box was added to the current group, move to the next group
        if not added_to_group:
            current_group_id += 1


def _join_in_reading_order(boxes, average_height, reading_mode):
    """Return the texts of ``boxes`` joined line by line in reading order."""
    remaining = list(boxes)
    ordered_texts = []
    while remaining:
        highest_y = min(box.center_y for box in remaining)
        line_candidates = [box for box in remaining if box.center_y < highest_y + 0.4 * average_height]
        if not line_candidates:
            # With an average height of zero the strict comparison excludes
            # every box; fall back to the boxes on the top-most line.
            line_candidates = [box for box in remaining if box.center_y == highest_y]
        if reading_mode == 'ltr':
            left_most_x = min(box.min_x for box in line_candidates)
            matches = [box for box in line_candidates if box.min_x == left_most_x]
        else:
            right_most_x = max(box.max_x for box in line_candidates)
            matches = [box for box in line_candidates if box.max_x == right_most_x]
        # On ties the last matching candidate wins.
        selected_box = matches[-1]
        ordered_texts.append(selected_box.text)
        remaining.remove(selected_box)
    return ' '.join(ordered_texts).strip()
import tempfile  # standard library; only this entry-point section uses it

# Entry point: dispatch the uploaded file to the PDF or the image pipeline.
if uploaded_file is not None:
    if uploaded_file.type == "application/pdf":
        placeholder = col2.empty()
        with placeholder, st.spinner('PDF өңделуде ...'):
            # Write the upload to a unique temporary file rather than a fixed
            # path, so simultaneous sessions cannot overwrite each other.
            # temp_pdf_file stays a module-level name because process_pdf may
            # read it.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
                f.write(uploaded_file.getvalue())
                temp_pdf_file = f.name
            try:
                process_pdf(uploaded_file)
            finally:
                try:
                    os.remove(temp_pdf_file)
                except OSError:
                    # Best-effort cleanup: a leftover temp file must not break the page.
                    pass
    else:
        placeholder = col2.empty()
        with placeholder, st.spinner('Сурет өңделуде ...'):
            image = Image.open(uploaded_file)
            col1.image(image)
            # Convert to RGB so palette, CMYK or alpha images reach the OCR as
            # plain three-channel data, and reuse the shared OCR helper
            # instead of repeating the readtext/get_paragraph pipeline.
            result, _ = recognize_page_image(image.convert("RGB"))
        result_text = line_separator.join([item[1] for item in result])
        button_group_html = generateButtonGroup(result)
        col2.write(button_group_html, unsafe_allow_html=True)
        col2.markdown(result_text)