wufan committed (verified)
Commit 1924325 · Parent(s): df6ee98

Upload 3 files

modules/latex2bbox_color.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import re
+import cv2
 import json
 import shutil
 import logging
@@ -69,6 +70,33 @@ formular_template = r"""
 \end{document}
 """
 
+formular_template_zh = r"""
+\documentclass[12pt]{article}
+\usepackage[landscape]{geometry}
+\usepackage{geometry}
+\geometry{a<PaperSize>paper,scale=0.98}
+\pagestyle{empty}
+\usepackage{booktabs}
+\usepackage{amsmath}
+\usepackage{upgreek}
+\usepackage{CJK}
+\usepackage{amssymb}
+\usepackage{xcolor}
+\begin{document}
+\makeatletter
+\renewcommand*{\@textcolor}[3]{%%
+  \protect\leavevmode
+  \begingroup
+    \color#1{#2}#3%%
+  \endgroup
+}
+\makeatother
+\begin{CJK}{UTF8}{gkai}
+%s
+\end{CJK}
+\end{document}
+"""
+
 
 def run_cmd(cmd, timeout_sec=30):
     proc = subprocess.Popen(cmd, shell=True)
@@ -101,37 +129,41 @@ def crop_image(image_path, pad=8):
 
     img = Image.open(image_path).convert("RGB").crop((x_min-pad, y_min-pad, x_max+pad, y_max+pad))
     img.save(image_path)
 
 def extrac_bbox_from_color_image(image_path, color_list):
-    img = Image.open(image_path).convert("RGB")
-    W, H = img.size
-    pixels = list(img.getdata())
-
+    img = cv2.imread(image_path)
     bbox_list = []
     for target_color in color_list:
-        target_pixels = [ i for i, pixel in enumerate(pixels) if pixel == target_color ]
-        x_list = []
-        y_list = []
-        for idx in target_pixels:
-            x_list.append(idx % W)
-            y_list.append(idx // W)
-        try:
-            y_min, y_max, x_min, x_max = min(y_list), max(y_list), min(x_list), max(x_list)
-            bbox_list.append([x_min-1, y_min-1, x_max+1, y_max+1])
-        except:
+        r, g, b = target_color
+        target_rgb = np.array([b, g, r], dtype=np.uint8)
+        mask = np.all(img == target_rgb, axis=2)
+        coords = np.argwhere(mask)
+        if coords.size > 0:
+            x_min, y_min = coords[:, 1].min(), coords[:, 0].min()
+            x_max, y_max = coords[:, 1].max(), coords[:, 0].max()
+            bbox_list.append([int(x_min-1), int(y_min-1), int(x_max+1), int(y_max+1)])
+        else:
             bbox_list.append([])
-            continue
-
-    img = img.convert("L")
+
+    img = Image.open(image_path).convert("RGB").convert("L")
     img_bw = img.point(lambda x: 255 if x == 255 else 0, '1')
     img_bw.convert("RGB").save(image_path)
     return bbox_list
 
+def contains_chinese(text):
+    # regex range that matches Chinese characters
+    return re.search(r'[\u4e00-\u9fff]', text) is not None
 
 def latex2bbox_color(input_arg):
     latex, basename, output_path, temp_dir, total_color_list = input_arg
-    template = tabular_template if "tabular" in latex else formular_template
+    if "tabular" in latex:
+        template = tabular_template
+    else:
+        if contains_chinese(latex):
+            template = formular_template_zh
+            latex = latex.replace(",", ", ").replace(":", ": ").replace(";", "; ")
+        else:
+            template = formular_template
     output_bbox_path = os.path.join(output_path, 'bbox', basename+'.jsonl')
     output_vis_path = os.path.join(output_path, 'vis', basename+'.png')
     output_base_path = os.path.join(output_path, 'vis', basename+'_base.png')
@@ -140,6 +172,7 @@ def latex2bbox_color(input_arg):
         return
 
     try:
+        latex = latex.replace("\n", " ")
        ret, new_latex = tokenize_latex(latex, middle_file=os.path.join(temp_dir, basename+'.txt'))
         if not(ret and new_latex):
             log = f"ERROR, Tokenize latex failed: {basename}."
@@ -164,7 +197,7 @@ def latex2bbox_color(input_arg):
             paper_size = 4
         else:
             paper_size = 5
-        final_latex = formular_template.replace("<PaperSize>", str(paper_size)) % rgb_latex
+        final_latex = template.replace("<PaperSize>", str(paper_size)) % rgb_latex
 
     except Exception as e:
         log = f"ERROR, Preprocess latex failed: {basename}; {e}."
@@ -198,18 +231,21 @@ def latex2bbox_color(input_arg):
     vis = Image.open(output_base_path)
     draw = ImageDraw.Draw(vis)
 
-    with open(output_bbox_path, 'w') as f:
+    with open(output_bbox_path, 'w', encoding='utf-8') as f:
         for token, box in zip(token_list, bbox_list):
             item = {
                 "bbox": box,
                 "token": token
            }
-            f.write(json.dumps(item)+'\n')
+            f.write(json.dumps(item, ensure_ascii=False)+'\n')
 
             if not box:
                 continue
             x_min, y_min, x_max, y_max = box
             draw.rectangle([x_min, y_min, x_max, y_max], fill=None, outline=(0,250,0), width=1)
-            draw.text((x_min, y_min), token, (250,0,0))
+            try:
+                draw.text((x_min, y_min), token, (250,0,0))
+            except:
+                pass
 
     vis.save(output_vis_path)
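Note: the new extrac_bbox_from_color_image reads the render with OpenCV, whose arrays use BGR channel order, which is why the target RGB color is reversed before matching; the hunk only adds import cv2, so the np.* calls presumably rely on a numpy import already present elsewhere in the module. A minimal standalone sketch of the exact-color lookup (the synthetic image and the bbox_for_color name are illustrative, not from the repo):

import numpy as np

def bbox_for_color(img_bgr, rgb):
    r, g, b = rgb
    # exact-match mask over all three channels (array is BGR)
    mask = np.all(img_bgr == np.array([b, g, r], dtype=np.uint8), axis=2)
    coords = np.argwhere(mask)  # (row, col) pairs of matching pixels
    if coords.size == 0:
        return []
    y_min, x_min = coords.min(axis=0)
    y_max, x_max = coords.max(axis=0)
    # pad by one pixel on each side, as the commit does
    return [int(x_min) - 1, int(y_min) - 1, int(x_max) + 1, int(y_max) + 1]

img = np.zeros((8, 8, 3), dtype=np.uint8)
img[2:5, 3:6] = (20, 10, 30)              # BGR block, i.e. RGB color (30, 10, 20)
print(bbox_for_color(img, (30, 10, 20)))  # [2, 1, 6, 5]

Compared with the old PIL getdata() scan, the vectorized mask avoids a Python-level loop over every pixel, which matters when one render is probed for many token colors.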
modules/latex_processor.py CHANGED
@@ -8,14 +8,14 @@ from PIL import Image
 
 
 SKIP_PATTERNS = [r'\{', r'\}', r'[\[\]]', r'\\begin\{.*?\}', r'\\end\{.*?\}', r'\^', r'\_', r'\\.*rule.*', r'\\.*line.*', r'\[[\-.0-9]+[epm][xtm]\]']
-SKIP_Tokens = ['\\', '\\\\', '\\index', '\\a', '&', '$', '\\multirow', '\\def', '\\raggedright', '\\url', '\\cr', '\\ensuremath', '\\left', '\\right',
-               '\\mathchoice', '\\scriptstyle', '\\displaystyle', '\\qquad', '\\quad', '\\,', '\\!', '~', '\\boldmath']
-PHANTOM_Tokens = ['\\fontfamily', '\\vphantom', '\\phantom', '\\rowcolor', '\\ref']
+SKIP_Tokens = ['\\', '\\\\', '\\index', '\\a', '&', '$', '\\multirow', '\\def', '\\edef', '\\raggedright', '\\url', '\\cr', '\\ensuremath', '\\left', '\\right',
+               '\\mathchoice', '\\scriptstyle', '\\displaystyle', '\\qquad', '\\quad', '\\,', '\\!', '~', '\\boldmath', '\\gdef', '\\today', '\\the']
+PHANTOM_Tokens = ['\\fontfamily', '\\vphantom', '\\phantom', '\\rowcolor', '\\ref', '\\thesubequation', '\\global', '\\theboldgroup']
 TWO_Tail_Tokens = ['\\frac', '\\binom']
 AB_Tail_Tokens = ['\\xrightarrow', '\\xleftarrow', '\\sqrt'] # special token \xxx [] {}
 TWO_Tail_Invisb_Tokens = ['\\overset', '\\underset', '\\stackrel']
 ONE_Tail_Tokens = ['\\widetilde', '\\overline', '\\hat', '\\widehat', '\\tilde', '\\Tilde', '\\dot', '\\bar', '\\vec', '\\underline', '\\underbrace', '\\check',
-                   '\\breve', '\\Bar', '\\Vec', '\\mathring', '\\ddot']
+                   '\\breve', '\\Bar', '\\Vec', '\\mathring', '\\ddot', '\\Ddot', '\\dddot', '\\ddddot']
 ONE_Tail_Invisb_Tokens = ['\\boldsymbol', '\\pmb', '\\textbf', '\\mathrm', '\\mathbf', '\\mathbb', '\\mathcal', '\\textmd', '\\texttt', '\\textnormal',
                           '\\text', '\\textit', '\\textup', '\\mathop', '\\mathbin', '\\smash', '\\operatorname', '\\textrm', '\\mathfrak', '\\emph',
                           '\\textsf', '\\textsc']
@@ -150,29 +150,74 @@ def normalize_latex(l, rm_trail=False):
     for bef, aft in zip(old_token, new_token):
         l = l.replace(bef, aft)
 
-    # TODO token such as \not= should be one token
-    pattern = r'\\not [<>+=\-]'
-    old_token = re.findall(pattern, l, re.DOTALL)
-    new_token = [item.replace(" ", "") for item in old_token]
-    for bef, aft in zip(old_token, new_token):
-        l = l.replace(bef, aft)
+    # # TODO token such as \not= should be one token
+    # pattern = r'\\not [<>+=\-]'
+    # old_token = re.findall(pattern, l, re.DOTALL)
+    # new_token = [item.replace(" ", "") for item in old_token]
+    # for bef, aft in zip(old_token, new_token):
+    #     l = l.replace(bef, aft)
+
+    # # TODO \not xx should be combined as one token
+    # pattern = r'\\not [\\=\<\>][^ ]+ '
+    # old_token = re.findall(pattern, l, re.DOTALL)
+    # new_token = [item.replace(" ", "") for item in old_token]
+    # for bef, aft in zip(old_token, new_token):
+    #     l = l.replace(bef, aft+" ")
 
     # TODO tokens such as \dots \exp \sinh, split them to parts, so the bbox match will be easier.
 
     l = " "+l+" "
-    l = l.replace(" \\ldots ", " . . . ")
-    l = l.replace(" \\cdots ", " . . . ")
-    l = l.replace(" \\dots ", " . . . ")
-    l = l.replace(" \\dotsb ", " . . . ")
-    l = l.replace(" \\log ", " \\mathrm { l o g } ")
-    l = l.replace(" \\exp ", " \\mathrm { e x p } ")
-    l = l.replace(" \\sin ", " \\mathrm { s i n } ")
-    l = l.replace(" \\cos ", " \\mathrm { c o s } ")
-    l = l.replace(" \\tan ", " \\mathrm { t a n } ")
-    l = l.replace(" \\tanh ", " \\mathrm { t a n h } ")
-    l = l.replace(" \\cosh ", " \\mathrm { c o s h } ")
-    l = l.replace(" \\sinh ", " \\mathrm { s i n h } ")
-
+    l = re.sub(r'(?<=\s)--(?=\s)', r'- -', l)
+    l = re.sub(r'(?<=\s)---(?=\s)', r'- - -', l)
+    l = re.sub(r'(?<=\s)…(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\ldots(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\hdots(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\cdots(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dddot(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dots(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dotsc(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dotsi(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dotsm(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dotso(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\dotsb(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\mathellipsis(?=\s)', r'. . .', l)
+    l = re.sub(r'(?<=\s)\\ex(?=\s)', r'\\mathrm { e x }', l)
+    l = re.sub(r'(?<=\s)\\ln(?=\s)', r'\\mathrm { l n }', l)
+    l = re.sub(r'(?<=\s)\\lg(?=\s)', r'\\mathrm { l g }', l)
+    l = re.sub(r'(?<=\s)\\cot(?=\s)', r'\\mathrm { c o t }', l)
+    l = re.sub(r'(?<=\s)\\mod(?=\s)', r'\\mathrm { m o d }', l)
+    l = re.sub(r'(?<=\s)\\bmod(?=\s)', r'\\mathrm { m o d }', l)
+    l = re.sub(r'(?<=\s)\\pmod(?=\s)', r'\\mathrm { m o d }', l) # \pmod is not actually the same as \mod, but it is hard to handle, so replace it with \mod for now
+    l = re.sub(r'(?<=\s)\\min(?=\s)', r'\\mathrm { m i n }', l)
+    l = re.sub(r'(?<=\s)\\max(?=\s)', r'\\mathrm { m a x }', l)
+    l = re.sub(r'(?<=\s)\\ker(?=\s)', r'\\mathrm { k e r }', l)
+    l = re.sub(r'(?<=\s)\\hom(?=\s)', r'\\mathrm { h o m }', l)
+    l = re.sub(r'(?<=\s)\\sec(?=\s)', r'\\mathrm { s e c }', l)
+    l = re.sub(r'(?<=\s)\\scs(?=\s)', r'\\mathrm { s c s }', l)
+    l = re.sub(r'(?<=\s)\\csc(?=\s)', r'\\mathrm { c s c }', l)
+    l = re.sub(r'(?<=\s)\\deg(?=\s)', r'\\mathrm { d e g }', l)
+    l = re.sub(r'(?<=\s)\\arg(?=\s)', r'\\mathrm { a r g }', l)
+    l = re.sub(r'(?<=\s)\\log(?=\s)', r'\\mathrm { l o g }', l)
+    l = re.sub(r'(?<=\s)\\dim(?=\s)', r'\\mathrm { d i m }', l)
+    l = re.sub(r'(?<=\s)\\exp(?=\s)', r'\\mathrm { e x p }', l)
+    l = re.sub(r'(?<=\s)\\sin(?=\s)', r'\\mathrm { s i n }', l)
+    l = re.sub(r'(?<=\s)\\cos(?=\s)', r'\\mathrm { c o s }', l)
+    l = re.sub(r'(?<=\s)\\tan(?=\s)', r'\\mathrm { t a n }', l)
+    l = re.sub(r'(?<=\s)\\tanh(?=\s)', r'\\mathrm { t a n h }', l)
+    l = re.sub(r'(?<=\s)\\cosh(?=\s)', r'\\mathrm { c o s h }', l)
+    l = re.sub(r'(?<=\s)\\sinh(?=\s)', r'\\mathrm { s i n h }', l)
+    l = re.sub(r'(?<=\s)\\coth(?=\s)', r'\\mathrm { c o t h }', l)
+    l = re.sub(r'(?<=\s)\\arcsin(?=\s)', r'\\mathrm { a r c s i n }', l)
+    l = re.sub(r'(?<=\s)\\arccos(?=\s)', r'\\mathrm { a r c c o s }', l)
+    l = re.sub(r'(?<=\s)\\arctan(?=\s)', r'\\mathrm { a r c t a n }', l)
+
+    # ** token such as \string xxx should be one token
+    pattern = r'\\string [^ ]+ '
+    old_token = re.findall(pattern, l, re.DOTALL)
+    new_token = [item.replace(" ", "") for item in old_token]
+    for bef, aft in zip(old_token, new_token):
+        l = l.replace(bef, aft+" ")
+
     # ** token such as \big( should be one token
     pattern = r'\\[Bb]ig[g]?[glrm]? [(){}|\[\]] '
     old_token = re.findall(pattern, l, re.DOTALL)
@@ -235,12 +280,12 @@ def normalize_latex(l, rm_trail=False):
     for bef, aft in zip(old_token, new_token):
         l = l.replace(bef, "{ "+aft[1:-1]+" }")
 
-    # ** \not xx should be combined as one token
-    pattern = r'\\not [\\=\<\>][^ ]+ '
+    # ** \rule{1pt}{2pt} lines, should be combined as one token and not rendered
+    pattern = r'\\rule {[ .0-9a-z]+} {[ .0-9a-z]+}'
     old_token = re.findall(pattern, l, re.DOTALL)
     new_token = [item.replace(" ", "") for item in old_token]
     for bef, aft in zip(old_token, new_token):
-        l = l.replace(bef, aft+" ")
+        l = l.replace(bef, aft)
 
     # ** \specialrule{1pt}{2pt}{2pt}, special lines, should be combined as one token
     pattern = r'\\specialrule {[ .0-9a-z]+} {[ .0-9a-z]+} {[ .0-9a-z]+}'
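Note: switching the token rewrites from space-delimited str.replace to lookaround-based re.sub also fixes a subtle miss: str.replace consumes the shared delimiter space, so back-to-back tokens are skipped, while (?<=\s)...(?=\s) matches the surrounding whitespace without consuming it. A quick illustration on a synthetic string:

import re

s = " x \\ldots \\ldots y "

# str.replace consumes the space between the two tokens, so only the
# first occurrence is rewritten:
print(s.replace(" \\ldots ", " . . . "))             # ' x . . . \ldots y '

# the committed lookaround form rewrites both, because the whitespace
# is asserted but left in place for the next match:
print(re.sub(r'(?<=\s)\\ldots(?=\s)', r'. . .', s))  # ' x . . . . . . y '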
modules/visual_matcher.py CHANGED
@@ -42,41 +42,67 @@ def norm_coords(x, left, right):
 
 def norm_same_token(token):
     special_map = {
+        "\\dot": ".",
+        "\\Dot": ".",
         "\\cdot": ".",
+        "\\cdotp": ".",
+        "\\ldotp": ".",
         "\\mid": "|",
-        "\\to": "\\rightarrow",
+        "\\rightarrow": "\\to",
         "\\top": "T",
         "\\Tilde": "\\tilde",
-        "\\cdots": "\\dots",
         "\\prime": "'",
         "\\ast": "*",
         "\\left<": "\\langle",
-        "\\right>": "\\rangle"
+        "\\right>": "\\rangle",
+        "\\lbrace": "\{",
+        "\\rbrace": "\}",
+        "\\lbrack": "[",
+        "\\rbrack": "]",
+        "\\blackslash": "/",
+        "\\slash": "/",
+        "\\leq": "\\le",
+        "\\geq": "\\ge",
+        "\\neq": "\\ne",
+        "\\Vert": "\\|",
+        "\\lVert": "\\|",
+        "\\rVert": "\\|",
+        "\\vert": "|",
+        "\\lvert": "|",
+        "\\rvert": "|",
+        "\\colon": ":",
+        "\\Ddot": "\\ddot",
+        "\\Bar": "\\bar",
+        "\\Vec": "\\vec",
+        "\\parallel": "\\|",
+        "\\dag": "\\dagger",
+        "\\ddag": "\\ddagger",
+        "\\textlangle": "<",
+        "\\textrangle": ">",
+        "\\textgreater": ">",
+        "\\textless": "<",
+        "\\textbackslash": "n",
+        "\\textunderscore": "_",
+        "\\=": "_",
+        "\\neg": "\\lnot",
+        "\\neq": "\\not=",
     }
-    if token in special_map.keys():
-        token = special_map[token]
     if token.startswith('\\left') or token.startswith('\\right'):
-        token = token.replace("\\left", "").replace("\\right", "")
+        if "arrow" not in token and "<" not in token and ">" not in token and "harpoon" not in token:
+            token = token.replace("\\left", "").replace("\\right", "")
     if token.startswith('\\big') or token.startswith('\\Big'):
         if "\\" in token[4:]:
             token = "\\"+token[4:].split("\\")[-1]
         else:
             token = token[-1]
-
-    if token in ['\\leq', '\\geq']:
-        return token[0:-1]
-    if token in ['\\lVert', '\\rVert', '\\Vert']:
-        return '\\|'
-    if token in ['\\lvert', '\\rvert', '\\vert']:
-        return '|'
-    if token.endswith("rightarrow"):
-        return "\\rightarrow"
-    if token.endswith("leftarrow"):
-        return "\\leftarrow"
+    if token in special_map.keys():
+        token = special_map[token]
     if token.startswith('\\wide'):
         return token.replace("wide", "")
     if token.startswith('\\var'):
-        return token.replace("\\var", "")
+        return token.replace("var", "")
+    if token.startswith('\\string'):
+        return token.replace("\\string", "")
     return token
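Note: the rewritten norm_same_token strips \left/\right only from plain delimiters and applies special_map after that stripping, so a form like \left\Vert first reduces to \Vert and then normalizes through the map; also, the committed special_map lists "\\neq" twice, so the later "\\not=" entry is the one the Python dict keeps. A minimal sketch of the new ordering with a trimmed, illustrative map (not the repo's full table):

special_map = {"\\rightarrow": "\\to", "\\Vert": "\\|", "\\leq": "\\le"}

def norm(token):
    # strip \left/\right only for plain delimiters; arrows and harpoons keep the prefix
    if token.startswith('\\left') or token.startswith('\\right'):
        if all(s not in token for s in ("arrow", "<", ">", "harpoon")):
            token = token.replace("\\left", "").replace("\\right", "")
    # map lookup runs after stripping, so "\left\Vert" -> "\Vert" -> "\|"
    return special_map.get(token, token)

assert norm("\\left\\Vert") == "\\|"
assert norm("\\leftarrow") == "\\leftarrow"  # contains "arrow": prefix kept
assert norm("\\rightarrow") == "\\to"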