"""
Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
SPDX-License-Identifier: MIT
"""

import re
import base64
from typing import List, Dict, Any, Optional


"""
Example input:
[
    {"label": "tab", "bbox": [0.176, 0.74, 0.824, 0.82], "text": "<table><tr><td></td><td>HellaSwag</td><td>Obqa</td><td>WinoGrande</td><td>ARC-c</td><td>ARC-e</td><td>boolq</td><td>piqa</td><td>Avg</td></tr><tr><td>OPT-1.3B</td><td>53.65</td><td>33.40</td><td>59.59</td><td>29.44</td><td>50.80</td><td>60.83</td><td>72.36</td><td>51.44</td></tr><tr><td>Pythia-1.0B</td><td>47.16</td><td>31.40</td><td>53.43</td><td>27.05</td><td>48.99</td><td>57.83</td><td>69.21</td><td>48.30</td></tr><tr><td>Pythia-1.4B</td><td>52.01</td><td>33.20</td><td>57.38</td><td>28.50</td><td>54.00</td><td>63.27</td><td>70.95</td><td>51.33</td></tr><tr><td>TinyLlama-1.1B</td><td>59.20</td><td>36.00</td><td>59.12</td><td>30.10</td><td>55.25</td><td>57.83</td><td>73.29</td><td>52.99</td></tr></table>", "reading_order": 6},
    {"label": "cap", "bbox": [0.28, 0.729, 0.711, 0.74], "text": "Table 2: Zero-shot performance on commonsense reasoning tasks", "reading_order": 7},
    {"label": "para", "bbox": [0.176, 0.848, 0.826, 0.873], "text": "We of performance during training We tracked the accuracy of TinyLlama on common-\nsense reasoning benchmarks during its pre-training, as shown in Fig. 2 . Generally, the performance of", "reading_order": 8},
    {"label": "fnote", "bbox": [0.176, 0.88, 0.824, 0.912], "text": "${ }^{4}$ Due to a bug in the config file, the learning rate did not decrease immediately after warmup and remained at\nthe maximum value for several steps before we fixed this.", "reading_order": 9},
    {"label": "foot", "bbox": [0.496, 0.939, 0.501, 0.95], "text": "14", "reading_order": 10}
]
"""


def _is_chinese(char: str) -> bool:
    """Return True when *char* lies in the U+4E00..U+9FFF range."""
    return '\u4e00' <= char <= '\u9fff'


def extract_table_from_html(html_string: str) -> str:
    """Extract every ``<table>...</table>`` fragment from an HTML string.

    Attributes on the opening ``<table ...>`` tag are dropped so each returned
    fragment starts with a bare ``<table>``. Fragments are joined with
    newlines; an empty string is returned when no table is found.

    On any error a one-cell HTML table carrying the error message is returned
    instead of raising.

    NOTE(review): the tag literals in this function were restored after
    markup stripping damaged the file (the patterns had degraded to ``.*?``
    and ``]*>``); confirm them against the upstream revision.
    """
    try:
        table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
        tables = table_pattern.findall(html_string)
        # Normalise the opening tag by discarding its attributes.
        tables = [re.sub(r'<table[^>]*>', '<table>', table) for table in tables]
        return '\n'.join(tables)
    except Exception as e:
        print(f"extract_table_from_html error: {str(e)}")
        return f"<table><tr><td>Error extracting table: {str(e)}</td></tr></table>"


class MarkdownConverter:
    """Convert structured recognition results to Markdown format.

    Each recognition result is a dict with at least a ``label`` and a ``text``
    key; :meth:`convert` dispatches on the label and concatenates the
    per-item Markdown. Every handler is best-effort: failures are printed and
    replaced by a placeholder or the unprocessed text rather than raised.
    """

    def __init__(self):
        # Markdown heading prefix for each heading-like label.
        self.heading_levels = {
            'title': '#',
            'sec': '##',
            'sub_sec': '###'
        }

        # Labels that are NOT rendered as plain paragraphs by convert().
        self.special_labels = {
            'tab', 'fig', 'title', 'sec', 'sub_sec',
            'list', 'formula', 'reference', 'alg'
        }

    def try_remove_newline(self, text: str) -> str:
        """Join hard-wrapped lines of a paragraph into flowing text.

        - A hyphen directly followed by a newline is removed (de-hyphenation).
        - Two consecutive non-empty lines are joined with a space, or with
          nothing when the boundary is between two Chinese characters.
        - A non-empty line followed by an empty one keeps a newline; an empty
          line contributes a newline.

        Returns the text as it stood when the error occurred if processing
        fails.
        """
        try:
            text = text.strip()
            text = text.replace('-\n', '')

            stripped = [line.strip() for line in text.split('\n')]
            pieces = []

            # Decide, for every line but the last, what follows it.
            for current_line, next_line in zip(stripped, stripped[1:]):
                if not current_line:
                    pieces.append('\n')
                elif not next_line:
                    pieces.append(current_line + '\n')
                elif _is_chinese(current_line[-1]) and _is_chinese(next_line[0]):
                    # Chinese text is not separated by spaces.
                    pieces.append(current_line)
                else:
                    pieces.append(current_line + ' ')

            # The last line never needs a trailing separator.
            if stripped and stripped[-1]:
                pieces.append(stripped[-1])

            text = ''.join(pieces)
            return text
        except Exception as e:
            print(f"try_remove_newline error: {str(e)}")
            return text  # Return the text as-is on error

    def _handle_text(self, text: str) -> str:
        """Process regular text content, preserving paragraph structure.

        Bare LaTeX is wrapped in math delimiters (``$$`` for a standalone
        array environment, ``$`` for text that contains a backslash or
        sub/superscript braces but no ``$`` and no ``\\begin``), newlines
        inside formulas are escaped, and wrapped lines are joined.

        Returns an empty string for falsy input and the text as-is on error.
        """
        try:
            if not text:
                return ""

            stripped = text.strip()
            if stripped.startswith("\\begin{array}") and stripped.endswith("\\end{array}"):
                text = "$$" + text + "$$"
            elif ("_{" in text or "^{" in text or "\\" in text or "_ {" in text or "^ {" in text) and ("$" not in text) and ("\\begin" not in text):
                text = "$" + text + "$"

            # Formulas must be handled first: the newline joining below
            # would otherwise flatten line breaks inside them.
            text = self._process_formulas_in_text(text)
            text = self.try_remove_newline(text)

            return text
        except Exception as e:
            print(f"_handle_text error: {str(e)}")
            return text  # Return the text as-is on error

    def _process_formulas_in_text(self, text: str) -> str:
        """Escape newlines that occur inside math delimiters.

        For each delimiter pair in turn (``$$``, backslash-bracket, ``$``,
        backslash-paren) the text is scanned left to right; within every
        delimited span each newline is replaced by a LaTeX line break (two
        backslashes surrounded by spaces). An opening delimiter without a
        matching close leaves the remainder of the text untouched.

        Returns the original text on error.
        """
        try:
            delimiters = [
                ('$$', '$$'),      # Block formula with $$
                ('\\[', '\\]'),    # Block formula with \[ \]
                ('$', '$'),        # Inline formula with $
                ('\\(', '\\)')     # Inline formula with \( \)
            ]

            result = text

            for start_delim, end_delim in delimiters:
                # Manual scan rather than one regex, to avoid issues with
                # nested delimiters.
                current_pos = 0
                processed_parts = []

                while current_pos < len(result):
                    start_pos = result.find(start_delim, current_pos)
                    if start_pos == -1:
                        # No more formulas of this type.
                        processed_parts.append(result[current_pos:])
                        break

                    # Text before the formula is copied through unchanged.
                    processed_parts.append(result[current_pos:start_pos])

                    end_pos = result.find(end_delim, start_pos + len(start_delim))
                    if end_pos == -1:
                        # Unbalanced delimiter: treat the rest as plain text.
                        processed_parts.append(result[start_pos:])
                        break

                    formula_content = result[start_pos + len(start_delim):end_pos]
                    processed_formula = formula_content.replace('\n', ' \\\\ ')
                    processed_parts.append(f"{start_delim}{processed_formula}{end_delim}")

                    current_pos = end_pos + len(end_delim)

                result = ''.join(processed_parts)

            return result
        except Exception as e:
            print(f"_process_formulas_in_text error: {str(e)}")
            return text  # Return original text on error

    def _remove_newline_in_heading(self, text: str) -> str:
        """Flatten a heading onto one line.

        Newlines are deleted when the heading contains any Chinese character
        and replaced by a single space otherwise. Returns the text unchanged
        on error.
        """
        try:
            if any(_is_chinese(char) for char in text):
                return text.replace('\n', '')
            return text.replace('\n', ' ')
        except Exception as e:
            print(f"_remove_newline_in_heading error: {str(e)}")
            return text

    def _handle_heading(self, text: str, label: str) -> str:
        """Render a heading for *label* ('title', 'sec' or 'sub_sec').

        Unknown labels fall back to a top-level ``#`` heading. The heading
        text goes through the same processing as regular text.
        """
        try:
            level = self.heading_levels.get(label, '#')
            text = text.strip()
            text = self._remove_newline_in_heading(text)
            text = self._handle_text(text)
            return f"{level} {text}\n\n"
        except Exception as e:
            print(f"_handle_heading error: {str(e)}")
            return f"# Error processing heading: {text}\n\n"

    def _handle_list_item(self, text: str) -> str:
        """Render *text* as a single Markdown bullet item."""
        try:
            return f"- {text.strip()}\n"
        except Exception as e:
            print(f"_handle_list_item error: {str(e)}")
            return f"- Error processing list item: {text}\n"

    def _handle_figure(self, text: str, section_count: int) -> str:
        """Render an image as Markdown image syntax with an inline data URI.

        Args:
            text: Either a complete data URI or a raw base64 payload; raw
                payloads are wrapped as PNG. Blank text yields an image with
                an empty payload.
            section_count: Number used in the ``Figure N`` alt text.

        Returns:
            The Markdown image followed by a blank line, or an error
            placeholder if processing fails.
        """
        try:
            # Empty text is the fallback case: emit an empty placeholder image.
            if not text.strip():
                return f"![Figure {section_count}](data:image/png;base64,)\n\n"

            # Anything that already is a data URI is embedded verbatim.
            # Previously the "data:image/" branch fell through without a
            # return, yielding None and breaking the join in convert().
            if text.startswith("data:image/") or (";" in text and "," in text):
                return f"![Figure {section_count}]({text})\n\n"

            # Raw base64: assume PNG since no format is specified.
            data_uri = f"data:image/png;base64,{text}"
            return f"![Figure {section_count}]({data_uri})\n\n"
        except Exception as e:
            print(f"_handle_figure error: {str(e)}")
            return f"*[Error processing figure: {str(e)}]*\n\n"

    def _handle_table(self, text: str) -> str:
        """Convert table content to Markdown.

        HTML tables are passed through (cleaned by
        ``extract_table_from_html``); any other text is treated as
        whitespace-separated cells, one row per line, with the first line
        as the header.

        NOTE(review): most of this method's body was lost to markup
        stripping in the damaged file and has been reconstructed; verify it
        against the upstream revision before relying on the plain-text
        branch.
        """
        try:
            markdown_content = []
            if '<table' in text.lower():
                markdown_table = extract_table_from_html(text)
                markdown_content.append(markdown_table + "\n")
            else:
                table_lines = text.split('\n')
                header_cells = table_lines[0].split()
                col_count = len(header_cells) if table_lines[0] else 1

                markdown_content.append('| ' + ' | '.join(header_cells) + ' |')
                markdown_content.append('| ' + ' | '.join(['---'] * col_count) + ' |')

                for line in table_lines[1:]:
                    cells = line.split()
                    # Pad short rows so every row has the header's width.
                    cells.extend([''] * (col_count - len(cells)))
                    markdown_content.append('| ' + ' | '.join(cells) + ' |')

            return '\n'.join(markdown_content) + '\n\n'
        except Exception as e:
            print(f"_handle_table error: {str(e)}")
            return f"*[Error processing table: {str(e)}]*\n\n"

    def _handle_algorithm(self, text: str) -> str:
        """Render an algorithm block as a fenced code block.

        The LaTeX ``algorithm`` / ``algorithmic`` environment tags are
        removed, a caption line becomes a bold line above the block, and
        label lines are dropped. On error, a placeholder followed by the
        text is returned.
        """
        try:
            # Unwrap complete environments first, then remove stray tags.
            text = re.sub(r'\\begin\{algorithm\}(.*?)\\end\{algorithm\}', r'\1', text, flags=re.DOTALL)
            text = text.replace('\\begin{algorithm}', '').replace('\\end{algorithm}', '')
            text = text.replace('\\begin{algorithmic}', '').replace('\\end{algorithmic}', '')

            caption = ""
            algorithm_text = []

            for line in text.strip().split('\n'):
                if '\\caption' in line:
                    caption_match = re.search(r'\\caption\{(.*?)\}', line)
                    if caption_match:
                        caption = f"**{caption_match.group(1)}**\n\n"
                    continue  # Caption lines never appear in the code block
                if '\\label' in line:
                    continue  # Skip label lines
                algorithm_text.append(line)

            formatted_text = '\n'.join(algorithm_text)
            return f"{caption}```\n{formatted_text}\n```\n\n"
        except Exception as e:
            print(f"_handle_algorithm error: {str(e)}")
            return f"*[Error processing algorithm: {str(e)}]*\n\n{text}\n\n"

    def _handle_formula(self, text: str) -> str:
        """Render a formula item as a block formula.

        Newlines inside existing math delimiters are escaped; if the text
        carries no block delimiter it is wrapped in ``$$``.
        """
        try:
            processed_text = self._process_formulas_in_text(text)

            # Wrap only when no block delimiter is present already.
            if '$$' not in processed_text and '\\[' not in processed_text:
                processed_text = f'$${processed_text}$$'

            return f"{processed_text}\n\n"
        except Exception as e:
            print(f"_handle_formula error: {str(e)}")
            return f"*[Error processing formula: {str(e)}]*\n\n"

    def convert(self, recognition_results: List[Dict[str, Any]]) -> str:
        """Convert recognition results to a Markdown document.

        Args:
            recognition_results: Items with a ``label`` and a ``text`` key,
                processed in list order.

        Returns:
            The post-processed Markdown. An item that fails is replaced by a
            placeholder; if the conversion as a whole fails, an error
            message string is returned instead.
        """
        try:
            markdown_content = []

            for section_count, result in enumerate(recognition_results):
                try:
                    label = result.get('label', '')
                    text = result.get('text', '').strip()

                    # Figures are handled even when their text is empty.
                    if label == 'fig':
                        markdown_content.append(self._handle_figure(text, section_count))
                        continue

                    # Skip empty text for non-figure elements.
                    if not text:
                        continue

                    if label in {'title', 'sec', 'sub_sec'}:
                        markdown_content.append(self._handle_heading(text, label))
                    elif label == 'list':
                        markdown_content.append(self._handle_list_item(text))
                    elif label == 'tab':
                        markdown_content.append(self._handle_table(text))
                    elif label == 'alg':
                        markdown_content.append(self._handle_algorithm(text))
                    elif label == 'formula':
                        markdown_content.append(self._handle_formula(text))
                    elif label not in self.special_labels:
                        # Regular text (paragraphs, captions, footnotes, ...).
                        # NOTE(review): 'reference' is in special_labels but
                        # has no handler, so such items are dropped here -
                        # confirm that is intended.
                        processed_text = self._handle_text(text)
                        markdown_content.append(f"{processed_text}\n\n")
                except Exception as e:
                    print(f"Error processing item {section_count}: {str(e)}")
                    # Keep a visible placeholder for the failed item.
                    markdown_content.append("*[Error processing content]*\n\n")

            result = ''.join(markdown_content)
            return self._post_process(result)
        except Exception as e:
            print(f"convert error: {str(e)}")
            return f"Error generating markdown content: {str(e)}"

    def _post_process(self, markdown_content: str) -> str:
        """Apply LaTeX clean-ups to the assembled Markdown.

        Unwraps author commands, turns the abstract environment into a bold
        label, rewrites eqno numbers as tags, converts bracket block
        delimiters adjacent to a line break into ``$$``, removes the stray
        space in ``_ {`` / ``^ {`` and collapses runs of three or more
        newlines. Returns the content as it stood when the error occurred
        if a step fails.
        """
        try:
            # Replace \author{...} with its processed content.
            author_pattern = re.compile(r'\\author\{(.*?)\}', re.DOTALL)

            def process_author_match(match):
                return self._handle_text(match.group(1))

            markdown_content = author_pattern.sub(process_author_match, markdown_content)

            # Special case: an author command wrapped in inline math.
            math_author_pattern = re.compile(r'\$(\\author\{.*?\})\$', re.DOTALL)
            match = math_author_pattern.search(markdown_content)
            if match:
                author_cmd = match.group(1)
                author_content_match = re.search(r'\\author\{(.*?)\}', author_cmd, re.DOTALL)
                if author_content_match:
                    processed_content = self._handle_text(author_content_match.group(1))
                    # Replace the entire $\author{...}$ block.
                    markdown_content = markdown_content.replace(match.group(0), processed_content)

            # Replace the LaTeX abstract environment with a bold label.
            markdown_content = re.sub(r'\\begin\{abstract\}(.*?)\\end\{abstract\}',
                                      r'**Abstract** \1',
                                      markdown_content,
                                      flags=re.DOTALL)

            # Standalone \begin{abstract} without a matching end.
            markdown_content = re.sub(r'\\begin\{abstract\}',
                                      r'**Abstract**',
                                      markdown_content)

            # \eqno{(N)} -> \tag{N}
            markdown_content = re.sub(r'\\eqno\{\((.*?)\)\}',
                                      r'\\tag{\1}',
                                      markdown_content)

            # Block delimiters next to a LaTeX line break: \[ \\ -> $$ \\
            # and \\ \] -> \\ $$. Backslashes are fully escaped so the
            # literals do not rely on invalid escape sequences.
            markdown_content = markdown_content.replace("\\[ \\\\", "$$ \\\\")
            markdown_content = markdown_content.replace("\\\\ \\]", "\\\\ $$")

            # Remove the stray space in subscripts/superscripts. Literal
            # replacement is required: as a regex, '^ {' anchors at the start
            # of the string and never matches a caret.
            markdown_content = markdown_content.replace('_ {', '_{')
            markdown_content = markdown_content.replace('^ {', '^{')

            # Collapse three or more consecutive newlines into a blank line.
            markdown_content = re.sub(r'\n{3,}', '\n\n', markdown_content)

            return markdown_content
        except Exception as e:
            print(f"_post_process error: {str(e)}")
            return markdown_content  # Return content as-is if post-processing fails