Spaces:

jiangjiechen
/

tiktoken_count

Running

App Files Community

jiangjiechen committed 11 days ago

Commit

54f08eb

1 Parent(s): b6d36be

update layout

Browse files

Files changed (1) hide show

app.py +367 -41

app.py CHANGED Viewed

@@ -1,58 +1,384 @@
 import gradio as gr
 import tiktoken
 import json
def count_tokens(text):
    """Count the GPT-4 tokens in ``text`` and prepare the text for display.

    The token count is taken from the input exactly as received. For display,
    valid JSON is pretty-printed and literal backslash-n sequences become real
    line breaks.

    Args:
        text (str): Raw input text; may be a JSON document.

    Returns:
        tuple: ``(token_count, update)`` where ``token_count`` is an ``int``
        and ``update`` is a ``gr.update`` carrying the text to render.
    """
    encoding = tiktoken.encoding_for_model("gpt-4")
    # disallowed_special=() makes tiktoken encode special-token text such as
    # "<|endoftext|>" as ordinary text instead of raising ValueError.
    token_count = len(encoding.encode(text, disallowed_special=()))
    try:
        # Valid JSON is re-serialized with indentation for readability.
        text = json.dumps(json.loads(text), indent=4, ensure_ascii=False)
    except json.JSONDecodeError:
        # Not JSON: deliberately fall through and show the text as typed.
        pass
    # Turn literal backslash-n sequences into real line breaks for display.
    text = text.replace("\\n", "\n")
    return token_count, gr.update(value=text)
# Build the Gradio interface: one text input, a token count and the rendered text.
iface = gr.Interface(
    title="Token Counter with tiktoken",
    description="Enter text below to calculate the number of tokens using the tiktoken library. Supports LaTeX formulas using $ for inline and $$ for block formulas.",
    theme="default",
    fn=count_tokens,
    # NOTE: a checkbox input for toggling Markdown/LaTeX output is currently
    # disabled, so the text box is the only input.
    inputs=[
        gr.Textbox(lines=10, max_lines=1000000, placeholder="Enter your text here..."),
    ],
    outputs=["number", gr.Markdown(label="Beautified Text")],
    examples=[
        ["这是一个行内公式示例：$E=mc^2$"],
        ["这是一个块级公式示例：$$\\sum_{i=1}^n i = \\frac{n(n+1)}{2}$$"],
        ["这是混合示例：\n行内公式：$\\alpha + \\beta$\n块级公式：$$\\int_0^\\infty e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$"],
        ["普通文本示例：Hello, how are you doing today?"],
    ],
)
 # 启动应用
 if __name__ == "__main__":

 import gradio as gr
 import tiktoken
 import json
+import re
def highlight_json_keys(json_str):
    """Wrap every object key in a JSON string in a highlighting ``<span>``.

    A key is any double-quoted string literal that is followed, after optional
    whitespace, by a colon. Only the quoted key is wrapped; the colon and all
    other text in ``json_str`` are left exactly as they were.

    Args:
        json_str (str): JSON text to decorate.

    Returns:
        str: ``json_str`` with each key wrapped in a styled ``<span>``. No
        HTML escaping is applied to the text.
    """
    # The key body accepts backslash escapes (e.g. \") and may be empty. The
    # lookahead demands a following colon without consuming it, so the
    # replacement must not emit a colon of its own (that produced "key"::).
    key_pattern = r'"((?:[^"\\]|\\.)*)"(?=\s*:)'

    def replace_key(match):
        key = match.group(1)
        return f'<span style="color: #0066cc; font-weight: bold;">"{key}"</span>'

    return re.sub(key_pattern, replace_key, json_str)
def normalize_latex(text):
    """Return ``text`` with its LaTeX delimiters left untouched.

    This is a pass-through hook. The rewriting of ``\\( \\)`` into ``$ $`` and
    of ``\\[ \\]`` into ``$$ $$`` is switched off, so every delimiter style
    reaches the caller unchanged.
    """
    unchanged = text
    return unchanged
def format_json_with_syntax_highlighting(json_obj, indent=0):
    """Render a parsed JSON value as syntax-highlighted HTML.

    Objects and arrays put each member on its own line using ``<br>``; keys,
    strings, booleans, numbers and ``null`` each get their own colour. Keys
    and string values are HTML-escaped so that markup inside the data is shown
    as text instead of being interpreted.

    Args:
        json_obj: Parsed JSON value (dict, list, str, bool, int, float or
            None), arbitrarily nested.
        indent (int): Accepted for backward compatibility only; the generated
            markup does not depend on it.

    Returns:
        str: HTML fragment representing ``json_obj``.
    """
    import html  # local import: only this function needs it

    key_style = (
        "color: #0066cc; font-weight: bold; background-color: #f0f8ff; "
        "padding: 2px 4px; border-radius: 3px; margin: 2px 0; "
        "display: inline-block;"
    )

    def render(value):
        if isinstance(value, dict):
            if not value:
                return "{}"
            rows = [
                f'<br><span style="{key_style}">"{html.escape(str(key))}"</span>'
                f": {render(member)}"
                for key, member in value.items()
            ]
            return "{" + "".join(rows) + "<br>}"
        if isinstance(value, list):
            if not value:
                return "[]"
            return "[" + "".join(f"<br>{render(member)}" for member in value) + "<br>]"
        if isinstance(value, str):
            return f'<span style="color: #008000;">"{html.escape(value)}"</span>'
        # bool is tested before int because bool is a subclass of int.
        if isinstance(value, bool):
            literal = "true" if value else "false"
            return f'<span style="color: #0000ff;">{literal}</span>'
        if isinstance(value, (int, float)):
            return f'<span style="color: #ff0000;">{value}</span>'
        if value is None:
            return '<span style="color: #808080;">null</span>'
        return html.escape(str(value))

    return render(json_obj)
def count_tokens_and_format(text):
    """Count the GPT-4 tokens in ``text`` and build HTML and Markdown renderings.

    The token count is always computed on the input exactly as received. The
    renderings depend on whether the input parses as JSON:

    * JSON: the HTML is the syntax-highlighted value and the Markdown is a
      fenced ``json`` code block with two-space indentation.
    * Anything else: doubled backslashes are collapsed, literal backslash-n
      sequences become line breaks, and the result is returned as Markdown and
      (HTML-escaped) inside a text container.

    Args:
        text (str): Raw user input. ``None``, empty and whitespace-only input
            yield a zero count and empty renderings.

    Returns:
        tuple: ``(token_count, html_output, markdown_output)``.
    """
    import html  # local import: only this function needs it

    if text is None or not text.strip():
        return 0, "", ""

    encoding = tiktoken.encoding_for_model("gpt-4")
    # disallowed_special=() makes tiktoken encode special-token text such as
    # "<|endoftext|>" as ordinary text instead of raising ValueError.
    token_count = len(encoding.encode(text, disallowed_special=()))

    # Doubled backslashes (e.g. text pasted from a string literal) are
    # collapsed for display purposes.
    unescaped = text.replace('\\\\', '\\')

    # Parse the text as typed first: collapsing backslashes can corrupt valid
    # JSON escapes. The collapsed form is only a fallback, kept so that input
    # which parsed that way before still does.
    candidates = (text,) if unescaped == text else (text, unescaped)
    is_json = False
    parsed_json = None
    for candidate in candidates:
        try:
            parsed_json = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        is_json = True
        break

    if is_json:
        highlighted = format_json_with_syntax_highlighting(parsed_json)
        html_output = f'<div class="json-container">{highlighted}</div>'
        pretty = json.dumps(parsed_json, indent=2, ensure_ascii=False)
        markdown_output = f"```json\n{pretty}\n```"
        return token_count, html_output, markdown_output

    # Plain text / LaTeX: literal backslash-n sequences become line breaks.
    display_text = normalize_latex(unescaped.replace("\\n", "\n"))
    # Size is judged on the text as it will be displayed.
    is_long = len(display_text) > 1000 or display_text.count('\n') > 20
    container = "long-text-container" if is_long else "text-container"
    # Escape before embedding so user-typed markup is shown, not interpreted.
    html_output = f'<div class="{container}">{html.escape(display_text)}</div>'
    # The Markdown rendering keeps the text (and its LaTeX) unescaped.
    return token_count, html_output, display_text
def count_tokens(text):
    """Return the token count of ``text`` together with its single best rendering.

    JSON input and plain text use the HTML rendering; non-JSON input that
    contains a LaTeX marker (``$``, ``\\(`` or ``\\[``) uses the Markdown
    rendering.

    Args:
        text (str): Raw user input. ``None``, empty and whitespace-only input
            yield ``(0, "")``.

    Returns:
        tuple: ``(token_count, rendered_output)``.
    """
    if text is None or not text.strip():
        return 0, ""

    tokens, html_out, markdown_out = count_tokens_and_format(text)
    try:
        # Parse the text as typed. Rewriting "\\n" into raw newlines first
        # would turn valid JSON string escapes into control characters that
        # the JSON parser rejects, misclassifying real JSON as plain text.
        json.loads(text)
    except json.JSONDecodeError:
        has_latex = any(marker in text for marker in ('$', '\\(', '\\['))
        return tokens, (markdown_out if has_latex else html_out)
    return tokens, html_out
def create_custom_css():
    """Return the stylesheet applied to the Gradio app.

    The sheet styles the page container, the input and output areas, the
    row/column layout with its small-screen overrides, and the three result
    containers (long text, plain text and JSON) including their scrollbars.
    """
    stylesheet = """
    .gradio-container {
        max-width: 100% !important;
        width: 100% !important;
        margin: 0 auto !important;
        padding: 20px !important;
        font-family: Arial, sans-serif !important;
    }
    @media (min-width: 768px) {
        .gradio-container {
            max-width: 1200px !important;
        }
    }
    .input-container textarea {
        resize: vertical !important;
        min-height: 200px !important;
        max-height: 500px !important;
    }
    .output-container {
        min-height: 200px !important;
        max-height: 600px !important;
        overflow-y: auto !important;
    }
    .gradio-row {
        display: flex !important;
        flex-wrap: wrap !important;
        gap: 20px !important;
        align-items: flex-start !important;
    }
    .gradio-column {
        flex: 1 !important;
        min-width: 300px !important;
    }
    /* 响应式设计 */
    @media (max-width: 768px) {
        .gradio-column {
            min-width: 100% !important;
        }
        .gradio-container {
            padding: 10px !important;
        }
    }
    /* 长文本容器样式优化 */
    .long-text-container {
        max-height: 500px !important;
        overflow-y: auto !important;
        padding: 15px !important;
        border: 1px solid #ddd !important;
        border-radius: 8px !important;
        background-color: #f8f9fa !important;
        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace !important;
        font-size: 14px !important;
        line-height: 1.5 !important;
        white-space: pre-wrap !important;
        word-wrap: break-word !important;
        box-sizing: border-box !important;
    }
    .long-text-container::-webkit-scrollbar {
        width: 8px !important;
    }
    .long-text-container::-webkit-scrollbar-track {
        background: #f1f1f1 !important;
        border-radius: 4px !important;
    }
    .long-text-container::-webkit-scrollbar-thumb {
        background: #888 !important;
        border-radius: 4px !important;
    }
    .long-text-container::-webkit-scrollbar-thumb:hover {
        background: #555 !important;
    }
    /* 普通文本容器样式 */
    .text-container {
        padding: 15px !important;
        border: 1px solid #ddd !important;
        border-radius: 8px !important;
        background-color: #fff !important;
        font-family: Arial, sans-serif !important;
        font-size: 14px !important;
        line-height: 1.6 !important;
        white-space: pre-wrap !important;
        word-wrap: break-word !important;
        box-sizing: border-box !important;
    }
    /* JSON容器样式 */
    .json-container {
        max-height: 500px !important;
        overflow-y: auto !important;
        padding: 15px !important;
        border: 1px solid #ddd !important;
        border-radius: 8px !important;
        background-color: #f8f9fa !important;
        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace !important;
        font-size: 14px !important;
        line-height: 1.8 !important;
        white-space: normal !important;
        word-wrap: break-word !important;
        box-sizing: border-box !important;
    }
    .json-container::-webkit-scrollbar {
        width: 8px !important;
    }
    .json-container::-webkit-scrollbar-track {
        background: #f1f1f1 !important;
        border-radius: 4px !important;
    }
    .json-container::-webkit-scrollbar-thumb {
        background: #888 !important;
        border-radius: 4px !important;
    }
    .json-container::-webkit-scrollbar-thumb:hover {
        background: #555 !important;
    }
    """
    return stylesheet
# Processing function used by the UI callbacks.
def process_text(text):
    """Count tokens and choose which of the two renderings to show.

    Exactly one of the returned renderings is populated; the other is an empty
    string. JSON and plain text use the HTML rendering, while non-JSON text
    containing ``$``, ``\\(`` or ``\\[`` uses the Markdown rendering.

    Returns:
        tuple: ``(token_count, html_output, markdown_output)``; all-empty with
        a zero count when ``text`` is blank.
    """
    if text.strip() == "":
        return 0, "", ""

    tokens, html_out, markdown_out = count_tokens_and_format(text)

    # Classify the content: JSON wins over LaTeX detection.
    try:
        json.loads(text)
        looks_like_json = True
    except json.JSONDecodeError:
        looks_like_json = False

    latex_markers = ('$', '\\(', '\\[')
    if not looks_like_json and any(marker in text for marker in latex_markers):
        # LaTeX: show Markdown, hide HTML.
        return tokens, "", markdown_out

    # JSON or plain text: show HTML, hide Markdown.
    return tokens, html_out, ""
# Lay the app out with Blocks: input and token count on the left, rendering on the right.
with gr.Blocks(theme="soft", css=create_custom_css(), title="🔢 智能 Token 计数器") as iface:
    # Page header.
    gr.HTML("""
    <div style="text-align: center; margin: 10px 0;">
        <h1 style="color: #333; margin-bottom: 10px;">🔢 智能 Token 计数器</h1>
        <p style="font-size: 16px; color: #666; margin-bottom: 20px;">
            使用 tiktoken 库计算文本的 token 数量，支持：
        </p>
    </div>
    """)

    with gr.Row():
        # Left column: text input and the token counter.
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="输入文本",
                show_label=True,
                lines=15,
                max_lines=50,
                container=True,
                elem_classes=["input-container"],
                placeholder="""输入您的文本，支持JSON格式和LaTeX公式...
示例:
• JSON: {"key": "value"}
• LaTeX行内公式: \\(E=mc^2\\) 或 $E=mc^2$
• LaTeX块级公式: \\[\\sum_{i=1}^n i\\] 或 $$\\sum_{i=1}^n i$$""",
            )
            token_count = gr.Number(
                label="Token 数量",
                show_label=True,
                precision=0,
                interactive=False,
            )

        # Right column: the rendered output. Only one of the two components
        # is visible at a time.
        with gr.Column(scale=1):
            # HTML view, used for JSON and plain text.
            html_output = gr.HTML(
                label="格式化文本",
                show_label=True,
                visible=True,
                elem_classes=["output-container"],
            )
            # Markdown view, used for LaTeX; starts hidden.
            markdown_output = gr.Markdown(
                label="格式化文本",
                show_label=True,
                visible=False,
                elem_classes=["output-container"],
                latex_delimiters=[
                    {"left": "$", "right": "$", "display": False},
                    {"left": "$$", "right": "$$", "display": True},
                    {"left": "\\(", "right": "\\)", "display": False},
                    {"left": "\\[", "right": "\\]", "display": True},
                ],
            )

    # Clickable examples that fill the input box.
    gr.Examples(
        label="点击示例快速体验",
        inputs=input_text,
        examples=[
            ['{"name": "Claude", "version": "3.5", "features": ["JSON解析", "LaTeX支持", "智能格式化"]}'],
            ["这是一个行内公式示例：\\(E=mc^2\\) 和另一个 $F=ma$"],
            ["这是一个块级公式示例：\\[\\sum_{i=1}^n i = \\frac{n(n+1)}{2}\\]"],
            ["混合示例包含文本、JSON和LaTeX：\n这里是普通文本。\n\n这里是JSON：{\"message\": \"Hello World\", \"count\": 42}\n\n这里是公式：\\(\\alpha + \\beta = \\gamma\\)\n\n块级公式：\\[\\int_0^\\infty e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}\\]"],
        ],
    )

    def update_outputs(text):
        """Recompute the count and toggle between the HTML and Markdown views."""
        count, html_body, markdown_body = process_text(text)

        if markdown_body and not html_body:
            # Markdown only: show it and hide the HTML view.
            return (
                count,
                gr.update(value="", visible=False),
                gr.update(value=markdown_body, visible=True),
            )

        # Otherwise the HTML view is shown. It carries content only when HTML
        # is the sole rendering returned; in every other case it is blank.
        html_value = html_body if (html_body and not markdown_body) else ""
        return (
            count,
            gr.update(value=html_value, visible=True),
            gr.update(value="", visible=False),
        )

    # Re-render on every change to the input text.
    input_text.change(
        fn=update_outputs,
        inputs=input_text,
        outputs=[token_count, html_output, markdown_output],
    )
 # 启动应用
 if __name__ == "__main__":