Upload 3 files
- modules/latex2bbox_color.py +61 -25
- modules/latex_processor.py +71 -26
- modules/visual_matcher.py +44 -18
modules/latex2bbox_color.py
CHANGED
@@ -1,5 +1,6 @@
 import os
 import re
+import cv2
 import json
 import shutil
 import logging
@@ -69,6 +70,33 @@ formular_template = r"""
 \end{document}
 """
 
+formular_template_zh = r"""
+\documentclass[12pt]{article}
+\usepackage[landscape]{geometry}
+\usepackage{geometry}
+\geometry{a<PaperSize>paper,scale=0.98}
+\pagestyle{empty}
+\usepackage{booktabs}
+\usepackage{amsmath}
+\usepackage{upgreek}
+\usepackage{CJK}
+\usepackage{amssymb}
+\usepackage{xcolor}
+\begin{document}
+\makeatletter
+\renewcommand*{\@textcolor}[3]{%%
+\protect\leavevmode
+\begingroup
+\color#1{#2}#3%%
+\endgroup
+}
+\makeatother
+\begin{CJK}{UTF8}{gkai}
+%s
+\end{CJK}
+\end{document}
+"""
+
 
 def run_cmd(cmd, timeout_sec=30):
     proc = subprocess.Popen(cmd, shell=True)
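The new formular_template_zh mirrors formular_template but wraps the body in a CJK environment (gkai font) so Chinese text renders. Both templates are consumed the same way: <PaperSize> is patched in with str.replace and the colorized token stream is injected through the %s slot (the %% in the \@textcolor redefinition survives %-formatting as a literal %). A minimal sketch with a trimmed template; rgb_latex here is a hypothetical stand-in for the real colorized body:

# Sketch only: trimmed-down template; the real code uses formular_template_zh.
template = ("\\documentclass[12pt]{article}\n"
            "\\geometry{a<PaperSize>paper,scale=0.98}\n"
            "\\begin{document}\n%s\n\\end{document}\n")
paper_size = 4                                # chosen from the formula length
rgb_latex = r"\textcolor[RGB]{255,0,0}{ x }"  # hypothetical colorized body
final_latex = template.replace("<PaperSize>", str(paper_size)) % rgb_latex
print(final_latex)  # \geometry{a4paper,scale=0.98} with the body filled in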
@@ -101,37 +129,41 @@ def crop_image(image_path, pad=8):
 
     img = Image.open(image_path).convert("RGB").crop((x_min-pad, y_min-pad, x_max+pad, y_max+pad))
     img.save(image_path)
 
 def extrac_bbox_from_color_image(image_path, color_list):
-    img = …
-    W, H = img.size
-    pixels = list(img.getdata())
-    …
+    img = cv2.imread(image_path)
     bbox_list = []
     for target_color in color_list:
-        …
-        except:
+        r, g, b = target_color
+        target_rgb = np.array([b, g, r], dtype=np.uint8)
+        mask = np.all(img == target_rgb, axis=2)
+        coords = np.argwhere(mask)
+        if coords.size > 0:
+            x_min, y_min = coords[:, 1].min(), coords[:, 0].min()
+            x_max, y_max = coords[:, 1].max(), coords[:, 0].max()
+            bbox_list.append([int(x_min-1), int(y_min-1), int(x_max+1), int(y_max+1)])
+        else:
             bbox_list.append([])
-    …
-    img = img.convert("L")
+
+    img = Image.open(image_path).convert("RGB").convert("L")
     img_bw = img.point(lambda x: 255 if x == 255 else 0, '1')
     img_bw.convert("RGB").save(image_path)
     return bbox_list
 
+def contains_chinese(text):
+    # regex range for matching Chinese characters
+    return re.search(r'[\u4e00-\u9fff]', text) is not None
 
 def latex2bbox_color(input_arg):
     latex, basename, output_path, temp_dir, total_color_list = input_arg
-    …
+    if "tabular" in latex:
+        template = tabular_template
+    else:
+        if contains_chinese(latex):
+            template = formular_template_zh
+            latex = latex.replace(",", ", ").replace(":", ": ").replace(";", "; ")
+        else:
+            template = formular_template
     output_bbox_path = os.path.join(output_path, 'bbox', basename+'.jsonl')
     output_vis_path = os.path.join(output_path, 'vis', basename+'.png')
     output_base_path = os.path.join(output_path, 'vis', basename+'_base.png')
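The rewritten extrac_bbox_from_color_image swaps the old per-pixel PIL scan for a vectorized NumPy mask over the whole image. Two details worth noting: cv2.imread returns pixels in BGR order, hence the [b, g, r] reversal before comparing, and the function relies on numpy already being imported as np elsewhere in the module (this diff only adds the cv2 import). A self-contained sketch of the masking idea on a synthetic image:

import numpy as np

# Synthetic 20x20 BGR image: white background with one pure-red block.
img = np.full((20, 20, 3), 255, dtype=np.uint8)
img[5:9, 3:12] = (0, 0, 255)          # BGR for red

r, g, b = 255, 0, 0                   # target color given in RGB, as in color_list
mask = np.all(img == np.array([b, g, r], dtype=np.uint8), axis=2)
coords = np.argwhere(mask)            # (row, col) pairs, i.e. (y, x)

if coords.size > 0:
    x_min, y_min = coords[:, 1].min(), coords[:, 0].min()
    x_max, y_max = coords[:, 1].max(), coords[:, 0].max()
    print([int(x_min), int(y_min), int(x_max), int(y_max)])  # [3, 5, 11, 8]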
@@ -140,6 +172,7 @@ def latex2bbox_color(input_arg):
         return
 
     try:
+        latex = latex.replace("\n", " ")
         ret, new_latex = tokenize_latex(latex, middle_file=os.path.join(temp_dir, basename+'.txt'))
         if not(ret and new_latex):
            log = f"ERROR, Tokenize latex failed: {basename}."
@@ -164,7 +197,7 @@
             paper_size = 4
         else:
             paper_size = 5
-        final_latex = …
+        final_latex = template.replace("<PaperSize>", str(paper_size)) % rgb_latex
 
     except Exception as e:
         log = f"ERROR, Preprocess latex failed: {basename}; {e}."
@@ -198,18 +231,21 @@
     vis = Image.open(output_base_path)
     draw = ImageDraw.Draw(vis)
 
-    with open(output_bbox_path, 'w') as f:
+    with open(output_bbox_path, 'w', encoding='utf-8') as f:
         for token, box in zip(token_list, bbox_list):
             item = {
                 "bbox": box,
                 "token": token
             }
-            f.write(json.dumps(item)+'\n')
+            f.write(json.dumps(item, ensure_ascii=False)+'\n')
 
             if not box:
                 continue
             x_min, y_min, x_max, y_max = box
             draw.rectangle([x_min, y_min, x_max, y_max], fill=None, outline=(0,250,0), width=1)
-            …
+            try:
+                draw.text((x_min, y_min), token, (250,0,0))
+            except:
+                pass
 
     vis.save(output_vis_path)
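The encoding='utf-8' / ensure_ascii=False pair is what lets CJK tokens survive the round trip into the .jsonl files; with the defaults, non-ASCII tokens would be written as \uXXXX escapes. For example:

import json

item = {"bbox": [1, 2, 3, 4], "token": "中"}
print(json.dumps(item))                      # {"bbox": [1, 2, 3, 4], "token": "\u4e2d"}
print(json.dumps(item, ensure_ascii=False))  # {"bbox": [1, 2, 3, 4], "token": "中"}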
modules/latex_processor.py
CHANGED
@@ -8,14 +8,14 @@ from PIL import Image
 
 
 SKIP_PATTERNS = [r'\{', r'\}', r'[\[\]]', r'\\begin\{.*?\}', r'\\end\{.*?\}', r'\^', r'\_', r'\\.*rule.*', r'\\.*line.*', r'\[[\-.0-9]+[epm][xtm]\]']
-SKIP_Tokens = ['\\', '\\\\', '\\index', '\\a', '&', '$', '\\multirow', '\\def', '\\raggedright', '\\url', '\\cr', '\\ensuremath', '\\left', '\\right',
-               '\\mathchoice', '\\scriptstyle', '\\displaystyle', '\\qquad', '\\quad', '\\,', '\\!', '~', '\\boldmath']
-PHANTOM_Tokens = ['\\fontfamily', '\\vphantom', '\\phantom', '\\rowcolor', '\\ref']
+SKIP_Tokens = ['\\', '\\\\', '\\index', '\\a', '&', '$', '\\multirow', '\\def', '\\edef', '\\raggedright', '\\url', '\\cr', '\\ensuremath', '\\left', '\\right',
+               '\\mathchoice', '\\scriptstyle', '\\displaystyle', '\\qquad', '\\quad', '\\,', '\\!', '~', '\\boldmath', '\\gdef', '\\today', '\\the']
+PHANTOM_Tokens = ['\\fontfamily', '\\vphantom', '\\phantom', '\\rowcolor', '\\ref', '\\thesubequation', '\\global', '\\theboldgroup']
 TWO_Tail_Tokens = ['\\frac', '\\binom']
 AB_Tail_Tokens = ['\\xrightarrow', '\\xleftarrow', '\\sqrt'] # special token \xxx [] {}
 TWO_Tail_Invisb_Tokens = ['\\overset', '\\underset', '\\stackrel']
 ONE_Tail_Tokens = ['\\widetilde', '\\overline', '\\hat', '\\widehat', '\\tilde', '\\Tilde', '\\dot', '\\bar', '\\vec', '\\underline', '\\underbrace', '\\check',
-                   '\\breve', '\\Bar', '\\Vec', '\\mathring', '\\ddot']
+                   '\\breve', '\\Bar', '\\Vec', '\\mathring', '\\ddot', '\\Ddot', '\\dddot', '\\ddddot']
 ONE_Tail_Invisb_Tokens = ['\\boldsymbol', '\\pmb', '\\textbf', '\\mathrm', '\\mathbf', '\\mathbb', '\\mathcal', '\\textmd', '\\texttt', '\\textnormal',
                           '\\text', '\\textit', '\\textup', '\\mathop', '\\mathbin', '\\smash', '\\operatorname', '\\textrm', '\\mathfrak', '\\emph',
                           '\\textsf', '\\textsc']
@@ -150,29 +150,74 @@ def normalize_latex(l, rm_trail=False):
     for bef, aft in zip(old_token, new_token):
         l = l.replace(bef, aft)
 
-    # TODO token such \not= should be one token
-    pattern = r'\\not [<>+=\-]'
-    old_token = re.findall(pattern, l, re.DOTALL)
-    new_token = [item.replace(" ", "") for item in old_token]
-    for bef, aft in zip(old_token, new_token):
-        …
+    # # TODO token such \not= should be one token
+    # pattern = r'\\not [<>+=\-]'
+    # old_token = re.findall(pattern, l, re.DOTALL)
+    # new_token = [item.replace(" ", "") for item in old_token]
+    # for bef, aft in zip(old_token, new_token):
+    #     l = l.replace(bef, aft)
+
+    # # TODO \not xx should be combined as one token
+    # pattern = r'\\not [\\=\<\>][^ ]+ '
+    # old_token = re.findall(pattern, l, re.DOTALL)
+    # new_token = [item.replace(" ", "") for item in old_token]
+    # for bef, aft in zip(old_token, new_token):
+    #     l = l.replace(bef, aft+" ")
 
     # TODO tokens such as \dots \exp \sinh, split them into parts, so the bbox match will be easier.
 
     l = " "+l+" "
-    l = …
-    l = …
-    l = …
-    l = …
-    l = …
-    l = …
-    l = …
-    l = …
-    l = …
-    l = …
-    l = …
-    l = …
-    …
+    l = re.sub(r'(?<=\s)--(?=\s)', r'- -', l)
+    l = re.sub(r'(?<=\s)---(?=\s)', r'- - -', l)
+    l = re.sub(r'(?<=\s)…(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\ldots(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\hdots(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\cdots(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dddot(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dots(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dotsc(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dotsi(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dotsm(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dotso(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dotsb(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\mathellipsis(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\ex(?=\s)', r'\\mathrm { e x }', l)
+    l = re.sub(r'(?<=\s)\\ln(?=\s)', r'\\mathrm { l n }', l)
+    l = re.sub(r'(?<=\s)\\lg(?=\s)', r'\\mathrm { l g }', l)
+    l = re.sub(r'(?<=\s)\\cot(?=\s)', r'\\mathrm { c o t }', l)
+    l = re.sub(r'(?<=\s)\\mod(?=\s)', r'\\mathrm { m o d }', l)
+    l = re.sub(r'(?<=\s)\\bmod(?=\s)', r'\\mathrm { m o d }', l)
+    l = re.sub(r'(?<=\s)\\pmod(?=\s)', r'\\mathrm { m o d }', l)  # \pmod is not actually the same as \mod, but it is hard to handle, so fall back to \mod for now
+    l = re.sub(r'(?<=\s)\\min(?=\s)', r'\\mathrm { m i n }', l)
+    l = re.sub(r'(?<=\s)\\max(?=\s)', r'\\mathrm { m a x }', l)
+    l = re.sub(r'(?<=\s)\\ker(?=\s)', r'\\mathrm { k e r }', l)
+    l = re.sub(r'(?<=\s)\\hom(?=\s)', r'\\mathrm { h o m }', l)
+    l = re.sub(r'(?<=\s)\\sec(?=\s)', r'\\mathrm { s e c }', l)
+    l = re.sub(r'(?<=\s)\\scs(?=\s)', r'\\mathrm { s c s }', l)
+    l = re.sub(r'(?<=\s)\\csc(?=\s)', r'\\mathrm { c s c }', l)
+    l = re.sub(r'(?<=\s)\\deg(?=\s)', r'\\mathrm { d e g }', l)
+    l = re.sub(r'(?<=\s)\\arg(?=\s)', r'\\mathrm { a r g }', l)
+    l = re.sub(r'(?<=\s)\\log(?=\s)', r'\\mathrm { l o g }', l)
+    l = re.sub(r'(?<=\s)\\dim(?=\s)', r'\\mathrm { d i m }', l)
+    l = re.sub(r'(?<=\s)\\exp(?=\s)', r'\\mathrm { e x p }', l)
+    l = re.sub(r'(?<=\s)\\sin(?=\s)', r'\\mathrm { s i n }', l)
+    l = re.sub(r'(?<=\s)\\cos(?=\s)', r'\\mathrm { c o s }', l)
+    l = re.sub(r'(?<=\s)\\tan(?=\s)', r'\\mathrm { t a n }', l)
+    l = re.sub(r'(?<=\s)\\tanh(?=\s)', r'\\mathrm { t a n h }', l)
+    l = re.sub(r'(?<=\s)\\cosh(?=\s)', r'\\mathrm { c o s h }', l)
+    l = re.sub(r'(?<=\s)\\sinh(?=\s)', r'\\mathrm { s i n h }', l)
+    l = re.sub(r'(?<=\s)\\coth(?=\s)', r'\\mathrm { c o t h }', l)
+    l = re.sub(r'(?<=\s)\\arcsin(?=\s)', r'\\mathrm { a r c s i n }', l)
+    l = re.sub(r'(?<=\s)\\arccos(?=\s)', r'\\mathrm { a r c c o s }', l)
+    l = re.sub(r'(?<=\s)\\arctan(?=\s)', r'\\mathrm { a r c t a n }', l)
+
+    # ** token such as \string xxx should be one token
+    pattern = r'\\string [^ ]+ '
+    old_token = re.findall(pattern, l, re.DOTALL)
+    new_token = [item.replace(" ", "") for item in old_token]
+    for bef, aft in zip(old_token, new_token):
+        l = l.replace(bef, aft+" ")
+
     # ** token such as \big( should be one token
     pattern = r'\\[Bb]ig[g]?[glrm]? [(){}|\[\]] '
     old_token = re.findall(pattern, l, re.DOTALL)
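Each of the new substitutions is anchored with a whitespace lookbehind/lookahead, so a rule only fires on a whole space-delimited token, and multi-letter operators are expanded into per-character \mathrm groups so each glyph can later be matched to its own bbox. The lookahead is also what keeps shorter names from clobbering longer ones, regardless of rule order:

import re

l = " \\sin x + \\sinh y "
# \sin only matches when followed by whitespace, so \sinh is left intact...
l = re.sub(r'(?<=\s)\\sin(?=\s)', r'\\mathrm { s i n }', l)
# ...and is then rewritten by its own rule.
l = re.sub(r'(?<=\s)\\sinh(?=\s)', r'\\mathrm { s i n h }', l)
print(l)  # " \mathrm { s i n } x + \mathrm { s i n h } y "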
@@ -235,12 +280,12 @@
     for bef, aft in zip(old_token, new_token):
         l = l.replace(bef, "{ "+aft[1:-1]+" }")
 
-    # ** \…
-    pattern = r'\\…
+    # ** \rule{1pt}{2pt} lines, should be combined as one token and not rendered
+    pattern = r'\\rule {[ .0-9a-z]+} {[ .0-9a-z]+}'
     old_token = re.findall(pattern, l, re.DOTALL)
     new_token = [item.replace(" ", "") for item in old_token]
     for bef, aft in zip(old_token, new_token):
-        l = l.replace(bef, aft…
+        l = l.replace(bef, aft)
 
     # ** \specialrule{1pt}{2pt}{2pt}, special lines, should be combined as one token
     pattern = r'\\specialrule {[ .0-9a-z]+} {[ .0-9a-z]+} {[ .0-9a-z]+}'
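Both rule handlers follow the findall-then-replace idiom used throughout normalize_latex: find the spaced form in the tokenized string, squeeze the spaces out, and substitute the compact form back so the whole command becomes a single token. Roughly:

import re

l = "x \\rule {1pt} {2pt} y"
pattern = r'\\rule {[ .0-9a-z]+} {[ .0-9a-z]+}'
for bef in re.findall(pattern, l):
    l = l.replace(bef, bef.replace(" ", ""))
print(l)  # x \rule{1pt}{2pt} y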
modules/visual_matcher.py
CHANGED
@@ -42,41 +42,67 @@ def norm_coords(x, left, right):
 
 def norm_same_token(token):
     special_map = {
+        "\\dot": ".",
+        "\\Dot": ".",
         "\\cdot": ".",
+        "\\cdotp": ".",
+        "\\ldotp": ".",
         "\\mid": "|",
-        "\\…
+        "\\rightarrow": "\\to",
         "\\top": "T",
         "\\Tilde": "\\tilde",
-        "\\cdots": "\\dots",
         "\\prime": "'",
         "\\ast": "*",
         "\\left<": "\\langle",
-        "\\right>": "\\rangle"
+        "\\right>": "\\rangle",
+        "\\lbrace": "\{",
+        "\\rbrace": "\}",
+        "\\lbrack": "[",
+        "\\rbrack": "]",
+        "\\blackslash": "/",
+        "\\slash": "/",
+        "\\leq": "\\le",
+        "\\geq": "\\ge",
+        "\\neq": "\\ne",
+        "\\Vert": "\\|",
+        "\\lVert": "\\|",
+        "\\rVert": "\\|",
+        "\\vert": "|",
+        "\\lvert": "|",
+        "\\rvert": "|",
+        "\\colon": ":",
+        "\\Ddot": "\\ddot",
+        "\\Bar": "\\bar",
+        "\\Vec": "\\vec",
+        "\\parallel": "\\|",
+        "\\dag": "\\dagger",
+        "\\ddag": "\\ddagger",
+        "\\textlangle": "<",
+        "\\textrangle": ">",
+        "\\textgreater": ">",
+        "\\textless": "<",
+        "\\textbackslash": "n",
+        "\\textunderscore": "_",
+        "\\=": "_",
+        "\\neg": "\\lnot",
+        "\\neq": "\\not=",
     }
-    if token in special_map.keys():
-        token = special_map[token]
     if token.startswith('\\left') or token.startswith('\\right'):
-        token…
+        if "arrow" not in token and "<" not in token and ">" not in token and "harpoon" not in token:
+            token = token.replace("\\left", "").replace("\\right", "")
     if token.startswith('\\big') or token.startswith('\\Big'):
         if "\\" in token[4:]:
             token = "\\"+token[4:].split("\\")[-1]
         else:
             token = token[-1]
-    …
-        return token[0:-1]
-    if token in ['\\lVert', '\\rVert', '\\Vert']:
-        return '\\|'
-    if token in ['\\lvert', '\\rvert', '\\vert']:
-        return '|'
-    if token.endswith("rightarrow"):
-        return "\\rightarrow"
-    if token.endswith("leftarrow"):
-        return "\\leftarrow"
+    if token in special_map.keys():
+        token = special_map[token]
     if token.startswith('\\wide'):
         return token.replace("wide", "")
     if token.startswith('\\var'):
-        return token.replace("…
+        return token.replace("var", "")
+    if token.startswith('\\string'):
+        return token.replace("\\string", "")
     return token
 
 
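The reworked norm_same_token folds the old one-off if chains (\lVert/\rVert, ...rightarrow, and so on) into special_map, and the map lookup now runs after the \left/\right and \big/\Big stripping, so delimiters exposed by those steps (e.g. \Big\Vert reduced to \Vert) are still normalized, here to \|. The new guard on the \left/\right branch keeps arrow- and angle-bracket tokens such as \leftarrow and \left< out of the plain-delimiter stripping so the map can handle them instead. Two things a reviewer might flag: "\\blackslash" looks like a typo for "\\backslash", and "\\neq" appears twice as a key, so the earlier "\\ne" mapping is silently discarded; in a Python dict literal the last duplicate wins:

special_map = {
    "\\neq": "\\ne",    # shadowed: the duplicate key below overwrites this entry
    "\\neq": "\\not=",
}
print(special_map)  # {'\\neq': '\\not='}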