File size: 20,588 Bytes
dfb1341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab40c3e
 
 
 
dfb1341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab40c3e
 
 
 
 
 
dfb1341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
""" 
Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
SPDX-License-Identifier: MIT
"""

import re
import base64
from typing import List, Dict, Any, Optional


"""
    Example input:
    [
        {"label": "tab", "bbox": [0.176, 0.74, 0.824, 0.82], "text": "<table><tr><td></td><td>HellaSwag</td><td>Obqa</td><td>WinoGrande</td><td>ARC-c</td><td>ARC-e</td><td>boolq</td><td>piqa</td><td>Avg</td></tr><tr><td>OPT-1.3B</td><td>53.65</td><td>33.40</td><td>59.59</td><td>29.44</td><td>50.80</td><td>60.83</td><td>72.36</td><td>51.44</td></tr><tr><td>Pythia-1.0B</td><td>47.16</td><td>31.40</td><td>53.43</td><td>27.05</td><td>48.99</td><td>57.83</td><td>69.21</td><td>48.30</td></tr><tr><td>Pythia-1.4B</td><td>52.01</td><td>33.20</td><td>57.38</td><td>28.50</td><td>54.00</td><td>63.27</td><td>70.95</td><td>51.33</td></tr><tr><td>TinyLlama-1.1B</td><td>59.20</td><td>36.00</td><td>59.12</td><td>30.10</td><td>55.25</td><td>57.83</td><td>73.29</td><td>52.99</td></tr></table>", "reading_order": 6},
        {"label": "cap", "bbox": [0.28, 0.729, 0.711, 0.74], "text": "Table 2: Zero-shot performance on commonsense reasoning tasks", "reading_order": 7}, 
        {"label": "para", "bbox": [0.176, 0.848, 0.826, 0.873], "text": "We of performance during training We tracked the accuracy of TinyLlama on common-\nsense reasoning benchmarks during its pre-training, as shown in Fig. 2 . Generally, the performance of", "reading_order": 8}, 
        {"label": "fnote", "bbox": [0.176, 0.88, 0.824, 0.912], "text": "${ }^{4}$ Due to a bug in the config file, the learning rate did not decrease immediately after warmup and remained at\nthe maximum value for several steps before we fixed this.", "reading_order": 9}, 
        {"label": "foot", "bbox": [0.496, 0.939, 0.501, 0.95], "text": "14", "reading_order": 10}
    ]
"""


def extract_table_from_html(html_string):
    """Extract every ``<table>...</table>`` element from an HTML string.

    Attributes on opening ``<table ...>`` tags are dropped, so each returned
    table starts with a bare ``<table>``. Tag matching is case-insensitive so
    that upper- or mixed-case markup (``<TABLE>``) is extracted as well.

    Args:
        html_string: HTML text that may contain one or more tables.

    Returns:
        The matched tables joined by newlines, or an empty string when no
        table is found. If matching raises, the error is printed and a
        one-cell HTML table describing the error is returned instead.
    """
    try:
        # DOTALL lets a table span several lines; IGNORECASE accepts <TABLE>.
        flags = re.DOTALL | re.IGNORECASE
        tables = re.findall(r'<table.*?>.*?</table>', html_string, flags)
        # Normalise every opening tag to a bare <table> (attributes removed).
        tables = [
            re.sub(r'<table[^>]*>', '<table>', table, flags=re.IGNORECASE)
            for table in tables
        ]
        return '\n'.join(tables)
    except Exception as e:
        print(f"extract_table_from_html error: {str(e)}")
        return f"<table><tr><td>Error extracting table: {str(e)}</td></tr></table>"


class MarkdownConverter:
    """Convert structured recognition results to Markdown format"""
    
    def __init__(self):
        # Define heading levels for different section types
        self.heading_levels = {
            'title': '#',
            'sec': '##',
            'sub_sec': '###'
        }
        
        # Define which labels need special handling
        self.special_labels = {
            'tab', 'fig', 'title', 'sec', 'sub_sec', 
            'list', 'formula', 'reference', 'alg'
        }
    
    def try_remove_newline(self, text: str) -> str:
        try:
            # Preprocess text to handle line breaks
            text = text.strip()
            text = text.replace('-\n', '')
            
            # Handle Chinese text line breaks
            def is_chinese(char):
                return '\u4e00' <= char <= '\u9fff'

            lines = text.split('\n')
            processed_lines = []
            
            # Process all lines except the last one
            for i in range(len(lines)-1):
                current_line = lines[i].strip()
                next_line = lines[i+1].strip()
                
                # Always add the current line, but determine if we need a newline
                if current_line:  # If current line is not empty
                    if next_line:  # If next line is not empty
                        # For Chinese text handling
                        if is_chinese(current_line[-1]) and is_chinese(next_line[0]):
                            processed_lines.append(current_line)
                        else:
                            processed_lines.append(current_line + ' ')
                    else:
                        # Next line is empty, add current line with newline
                        processed_lines.append(current_line + '\n')
                else:
                    # Current line is empty, add an empty line
                    processed_lines.append('\n')
            
            # Add the last line
            if lines and lines[-1].strip():
                processed_lines.append(lines[-1].strip())
            
            text = ''.join(processed_lines)
            
            return text
        except Exception as e:
            print(f"try_remove_newline error: {str(e)}")
            return text  # Return original text on error

    def _handle_text(self, text: str) -> str:
        """
        Process regular text content, preserving paragraph structure
        """
        try:
            if not text:
                return ""
            
            if text.strip().startswith("\\begin{array}") and text.strip().endswith("\\end{array}"):
                text = "$$" + text + "$$"
            elif ("_{" in text or "^{" in text or "\\" in text or "_ {" in text or "^ {" in text) and ("$" not in text) and ("\\begin" not in text):
                text = "$" + text + "$"

            # Process formulas in text before handling other text processing
            text = self._process_formulas_in_text(text)
            
            text = self.try_remove_newline(text)
            
            # Return processed text
            return text
        except Exception as e:
            print(f"_handle_text error: {str(e)}")
            return text  # Return original text on error
    
    def _process_formulas_in_text(self, text: str) -> str:
        """
        Process mathematical formulas in text by iteratively finding and replacing formulas.
        - Identify inline and block formulas
        - Replace newlines within formulas with \\
        """
        try:
            # Define formula delimiters and their corresponding patterns
            delimiters = [
                ('$$', '$$'),  # Block formula with $$
                ('\\[', '\\]'),  # Block formula with \[ \]
                ('$', '$'),  # Inline formula with $
                ('\\(', '\\)')  # Inline formula with \( \)
            ]
            
            # Process the text by iterating through each delimiter type
            result = text
            
            for start_delim, end_delim in delimiters:
                # Create a pattern that matches from start to end delimiter
                # Using a custom approach to avoid issues with nested delimiters
                current_pos = 0
                processed_parts = []
                
                while current_pos < len(result):
                    # Find the next start delimiter
                    start_pos = result.find(start_delim, current_pos)
                    if start_pos == -1:
                        # No more formulas of this type
                        processed_parts.append(result[current_pos:])
                        break
                    
                    # Add text before the formula
                    processed_parts.append(result[current_pos:start_pos])
                    
                    # Find the matching end delimiter
                    end_pos = result.find(end_delim, start_pos + len(start_delim))
                    if end_pos == -1:
                        # No matching end delimiter, treat as regular text
                        processed_parts.append(result[start_pos:])
                        break
                    
                    # Extract the formula content (without delimiters)
                    formula_content = result[start_pos + len(start_delim):end_pos]
                    
                    # Process the formula content - replace newlines with \\
                    processed_formula = formula_content.replace('\n', ' \\\\ ')
                    
                    # Add the processed formula with its delimiters
                    processed_parts.append(f"{start_delim}{processed_formula}{end_delim}")
                    
                    # Move past this formula
                    current_pos = end_pos + len(end_delim)
                
                # Update the result with processed text
                result = ''.join(processed_parts)
            return result
        except Exception as e:
            print(f"_process_formulas_in_text error: {str(e)}")
            return text  # Return original text on error
    
    def _remove_newline_in_heading(self, text: str) -> str:
        """
        Remove newline in heading
        """
        try:
            # Handle Chinese text line breaks
            def is_chinese(char):
                return '\u4e00' <= char <= '\u9fff'
            
            # Check if the text contains Chinese characters
            if any(is_chinese(char) for char in text):
                return text.replace('\n', '')
            else:
                return text.replace('\n', ' ')

        except Exception as e:
            print(f"_remove_newline_in_heading error: {str(e)}")
            return text
    
    def _handle_heading(self, text: str, label: str) -> str:
        """
        Convert section headings to appropriate markdown format
        """
        try:
            level = self.heading_levels.get(label, '#')
            text = text.strip()
            text = self._remove_newline_in_heading(text)
            text = self._handle_text(text)
            return f"{level} {text}\n\n"
        except Exception as e:
            print(f"_handle_heading error: {str(e)}")
            return f"# Error processing heading: {text}\n\n"
    
    def _handle_list_item(self, text: str) -> str:
        """
        Convert list items to markdown list format
        """
        try:
            return f"- {text.strip()}\n"
        except Exception as e:
            print(f"_handle_list_item error: {str(e)}")
            return f"- Error processing list item: {text}\n"
    
    def _handle_figure(self, text: str, section_count: int) -> str:
        """
        Convert base64 encoded image to markdown image syntax
        """
        try:
            # Check if text is empty (fallback case)
            if not text.strip():
                return f"![Figure {section_count}](data:image/png;base64,)\n\n"
            
            # Determine image format (assuming PNG if not specified)
            img_format = "png"
            if text.startswith("data:image/"):
                # Extract format from data URI
                img_format = text.split(";")[0].split("/")[1]
            elif ";" in text and "," in text:
                # Already in data URI format
                return f"![Figure {section_count}]({text})\n\n"
            else:
                # Raw base64, convert to data URI
                data_uri = f"data:image/{img_format};base64,{text}"
                return f"![Figure {section_count}]({data_uri})\n\n"
        except Exception as e:
            print(f"_handle_figure error: {str(e)}")
            return f"*[Error processing figure: {str(e)}]*\n\n"

    def _handle_table(self, text: str) -> str:
        """
        Convert table content to markdown format
        """
        try:
            markdown_content = []
            if '<table' in text.lower() or '<tr' in text.lower():
                markdown_table = extract_table_from_html(text)
                markdown_content.append(markdown_table + "\n")
            else:
                table_lines = text.split('\n')
                if table_lines:
                    col_count = len(table_lines[0].split()) if table_lines[0] else 1
                    header = '| ' + ' | '.join(table_lines[0].split()) + ' |'
                    markdown_content.append(header)
                    markdown_content.append('| ' + ' | '.join(['---'] * col_count) + ' |')
                    for line in table_lines[1:]:
                        cells = line.split()
                        while len(cells) < col_count:
                            cells.append('')
                        markdown_content.append('| ' + ' | '.join(cells) + ' |')
            return '\n'.join(markdown_content) + '\n\n'
        except Exception as e:
            print(f"_handle_table error: {str(e)}")
            return f"*[Error processing table: {str(e)}]*\n\n"

    def _handle_algorithm(self, text: str) -> str:
        """
        Process algorithm blocks with proper formatting
        """
        try:
            # Remove algorithm environment tags if present
            text = re.sub(r'\\begin\{algorithm\}(.*?)\\end\{algorithm\}', r'\1', text, flags=re.DOTALL)
            text = text.replace('\\begin{algorithm}', '').replace('\\end{algorithm}', '')
            text = text.replace('\\begin{algorithmic}', '').replace('\\end{algorithmic}', '')
            
            # Process the algorithm text
            lines = text.strip().split('\n')
            
            # Check if there's a caption or label
            caption = ""
            algorithm_text = []
            
            for line in lines:
                if '\\caption' in line:
                    # Extract caption text
                    caption_match = re.search(r'\\caption\{(.*?)\}', line)
                    if caption_match:
                        caption = f"**{caption_match.group(1)}**\n\n"
                    continue
                elif '\\label' in line:
                    continue  # Skip label lines
                else:
                    algorithm_text.append(line)
            
            # Join the algorithm text and wrap in code block
            formatted_text = '\n'.join(algorithm_text)
            
            # Return the formatted algorithm with caption
            return f"{caption}```\n{formatted_text}\n```\n\n"
        except Exception as e:
            print(f"_handle_algorithm error: {str(e)}")
            return f"*[Error processing algorithm: {str(e)}]*\n\n{text}\n\n"

    def _handle_formula(self, text: str) -> str:
        """
        Handle formula-specific content
        """
        try:
            # Process the formula content
            processed_text = self._process_formulas_in_text(text)
            
            # For formula blocks, ensure they're properly formatted in markdown
            if '$$' not in processed_text and '\\[' not in processed_text:
                # If no block formula delimiters are present, wrap in $$ for block formula
                processed_text = f'$${processed_text}$$'
            
            return f"{processed_text}\n\n"
        except Exception as e:
            print(f"_handle_formula error: {str(e)}")
            return f"*[Error processing formula: {str(e)}]*\n\n"

    def convert(self, recognition_results: List[Dict[str, Any]]) -> str:
        """
        Convert recognition results to markdown format
        """
        try:
            markdown_content = []
            
            for section_count, result in enumerate(recognition_results):
                try:
                    label = result.get('label', '')
                    text = result.get('text', '').strip()
                    
                    # 处理图片,即使文本为空也要处理
                    if label == 'fig':
                        markdown_content.append(self._handle_figure(text, section_count))
                        continue
                    
                    # Skip empty text for non-figure elements
                    if not text:
                        continue
                        
                    # Handle different content types
                    if label in {'title', 'sec', 'sub_sec'}:
                        markdown_content.append(self._handle_heading(text, label))
                    elif label == 'list':
                        markdown_content.append(self._handle_list_item(text))
                    elif label == 'tab':
                        markdown_content.append(self._handle_table(text))
                    elif label == 'alg':
                        markdown_content.append(self._handle_algorithm(text))
                    elif label == 'formula':
                        markdown_content.append(self._handle_formula(text))
                    elif label not in self.special_labels:
                        # Handle regular text (paragraphs, etc.)
                        processed_text = self._handle_text(text)
                        markdown_content.append(f"{processed_text}\n\n")
                except Exception as e:
                    print(f"Error processing item {section_count}: {str(e)}")
                    # Add a placeholder for the failed item
                    markdown_content.append(f"*[Error processing content]*\n\n")
            
            # Join all content and apply post-processing
            result = ''.join(markdown_content)
            return self._post_process(result)
        except Exception as e:
            print(f"convert error: {str(e)}")
            return f"Error generating markdown content: {str(e)}"

    def _post_process(self, markdown_content: str) -> str:
        """
        Apply post-processing fixes to the generated markdown content
        """
        try:
            # Handle author information
            author_pattern = re.compile(r'\\author\{(.*?)\}', re.DOTALL)
            
            def process_author_match(match):
                # Extract author content
                author_content = match.group(1)
                # Process the author content
                return self._handle_text(author_content)
            
            # Replace \author{...} with processed content
            markdown_content = author_pattern.sub(process_author_match, markdown_content)
            
            # Handle special case where author is inside math environment
            math_author_pattern = re.compile(r'\$(\\author\{.*?\})\$', re.DOTALL)
            match = math_author_pattern.search(markdown_content)
            if match:
                # Extract the author command
                author_cmd = match.group(1)
                # Extract content from author command
                author_content_match = re.search(r'\\author\{(.*?)\}', author_cmd, re.DOTALL)
                if author_content_match:
                    # Get author content and process it
                    author_content = author_content_match.group(1)
                    processed_content = self._handle_text(author_content)
                    # Replace the entire $\author{...}$ block with processed content
                    markdown_content = markdown_content.replace(match.group(0), processed_content)
            
            # Replace LaTeX abstract environment with plain text
            markdown_content = re.sub(r'\\begin\{abstract\}(.*?)\\end\{abstract\}', 
                                     r'**Abstract** \1', 
                                     markdown_content, 
                                     flags=re.DOTALL)
            
            # Replace standalone \begin{abstract} (without matching end)
            markdown_content = re.sub(r'\\begin\{abstract\}', 
                                     r'**Abstract**', 
                                     markdown_content)
            
            # Replace LaTeX equation numbers with tag format, handling cases with extra backslashes
            markdown_content = re.sub(r'\\eqno\{\((.*?)\)\}',
                                    r'\\tag{\1}',
                                    markdown_content)

            # Find the starting tag of the formula
            markdown_content = markdown_content.replace("\[ \\\\", "$$ \\\\")

            # Find the ending tag of the formula (ensure this is the only ending tag)
            markdown_content = markdown_content.replace("\\\\ \]", "\\\\ $$")

            # Fix other common LaTeX issues
            replacements = [
                # Fix spacing issues in subscripts and superscripts
                (r'_ {', r'_{'),
                (r'^ {', r'^{'),
                
                # Fix potential issues with multiple consecutive newlines
                (r'\n{3,}', r'\n\n')
            ]
            
            for old, new in replacements:
                markdown_content = re.sub(old, new, markdown_content)
            
            return markdown_content
        except Exception as e:
            print(f"_post_process error: {str(e)}")
            return markdown_content  # Return original content if post-processing fails