|
import base64 |
|
import io |
|
import json |
|
import os |
|
import re |
|
import tempfile |
|
import zipfile |
|
from datetime import datetime |
|
from pathlib import Path |
|
|
|
import gradio as gr |
|
import requests |
|
from PIL import Image |
|
import pdf2image |
|
|
|
# Endpoint of the layout-parsing service that process_file() posts documents to.
API_URL = "https://t707h6d9q6oftbx3.aistudio-app.com/layout-parsing"

# Access token sent in the Authorization header; the API_TOKEN environment
# variable takes precedence over the built-in fallback.
# NOTE(review): a credential is committed here as the fallback value. Rotate it
# and supply API_TOKEN through the environment instead of source control.
TOKEN = os.getenv("API_TOKEN", "c9e4aaf9634724e215690ba66a66dbdbdf3222a2")


# Stylesheet handed to gr.Blocks(css=...). The class names used below are the
# ones attached to components through elem_classes in the layout.
# NOTE(review): the #component-16/17/18 selectors look like generated element
# ids and may stop matching when the layout changes -- confirm they still hit
# the footer links.
CSS = """
:root {
    --sand-color: #D7B4F8;
    --white: #ffffff;
    --shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    --text-color: #F5ECFD;
    --black:#000000;
    --link-hover: #F5ECFD;
    --content-width: 1200px;
    --button-color: #cbbdff; /* 新增按钮颜色变量 */
}

body {
    display: flex;
    justify-content: center;
    background-color: var(--sand-color);
    color: var(--black);
    font-family: Arial, sans-serif;
}

.gradio-container {
    max-width: var(--content-width) !important;
    width: 100% !important;
    margin: 20px auto;
    padding: 20px;
    background-color: var(--white) !important;
}

/* 修改1: 优化header容器样式 */
.header-container {
    width: 100%;
    background-color: var(--text-color) !important;
    padding: 20px 0 10px 0; /* 减少底部padding */
    margin-bottom: 20px;
    border-radius: 8px;
}

.logo-container {
    width: 100%;
    margin-bottom: 15px; /* 减少logo与按钮间距 */
    text-align: center;
}

.logo-img {
    width: 100%;
    max-width: var(--content-width);
    margin: 0 auto;
    display: block;
}

/* 修改2: 优化导航按钮布局 */
.nav-buttons {
    display: flex;
    justify-content: center;
    gap: 20px; /* 减少按钮间距 */
    margin-top: 10px; /* 减少顶部间距 */
    width: 80%; /* 控制按钮区域宽度 */
    margin-left: auto;
    margin-right: auto;
}

.nav-button {
    background-color: var(--button-color) !important; /* 修改为FAF9F6 */
    color: var(--black) !important;
    text-decoration: none;
    font-weight: bold;
    font-size: 20px;
    padding: 6px 20px !important;
    border-radius: 24px !important;
    border: none !important;
    transition: opacity 0.2s;
    flex: 1;
    max-width: 200px;
    text-align: center;
}

.nav-button:hover {
    opacity: 0.8 !important;
}

.upload-section {
    width: 100%;
    margin: 0 auto 30px;
    padding: 20px;
    background-color: var(--text-color) !important;
    border-radius: 8px;
    box-shadow: var(--shadow);
}
/* 强制底部链接为白色背景 */
#component-16, /* Use via API 部分 */
#component-17, /* Settings 部分 */
#component-18 { /* Built with Gradio 部分 */
    background-color: var(--white) !important;
}

footer .gr-panel,
.gr-footer,
.gr-panel:has(a[href*="api"]), /* 针对Use via API */
.gr-panel:has(a[href*="settings"]) /* 针对Settings */ {
    background-color: var(--white) !important;
    color: var(--black) !important;
}


/* 其他样式保持不变 */
.result-container {
    display: flex;
    gap: 20px;
    margin-bottom: 30px;
    width: 100%;
}

.pdf-preview {
    flex: 1;
    min-width: 0;
}

.markdown-result {
    flex: 1;
    min-width: 0;
}

.gallery-container {
    width: 100% !important;
}

.gallery-item {
    width: 100% !important;
    height: auto !important;
    aspect-ratio: auto !important;
}

button {
    background-color: var(--text-color) !important;
    color: var(--black) !important;
    border: none !important;
    border-radius: 4px;
    padding: 8px 16px;
}
button:hover {
    opacity: 0.8 !important;
}

.radio-group {
    margin-bottom: 15px !important;
}

.file-download {
    margin-top: 15px !important;
}
.loader {
    border: 5px solid #f3f3f3;
    border-top: 5px solid #3498db;
    border-radius: 50%;
    width: 50px;
    height: 50px;
    animation: spin 1s linear infinite;
    margin: 20px auto;
}

@keyframes spin {
    0% { transform: rotate(0deg); }
    100% { transform: rotate(360deg); }
}

.loader-container {
    text-align: center;
    margin: 20px 0;
}
"""
|
|
|
def clean_markdown_text(text):
    """Return *text* with markup tags removed and blank-line runs collapsed.

    Anything of the form ``<...>`` is deleted, three or more consecutive
    newlines are reduced to two, and surrounding whitespace is trimmed.
    A falsy input (``None`` or ``""``) yields an empty string.
    """
    if text:
        without_tags = re.sub(r'<[^>]+>', '', text)
        collapsed = re.sub(r'\n{3,}', '\n\n', without_tags)
        return collapsed.strip()
    return ""
|
|
|
|
|
def pdf_to_images(pdf_path):
    """Render the pages of a PDF as a list of images.

    Args:
        pdf_path: Path of the PDF handed to ``pdf2image.convert_from_path``.

    Returns:
        A list with the converted pages, or ``None`` when the conversion
        fails. The failure is logged rather than raised, so callers can still
        show the analysis result without a preview.
    """
    import logging

    try:
        return list(pdf2image.convert_from_path(pdf_path))
    except Exception:
        # Best effort by design, but never swallow KeyboardInterrupt/SystemExit
        # and keep a trace of why the preview is missing.
        logging.getLogger(__name__).exception(
            "Could not render PDF %r to images", pdf_path
        )
        return None
|
|
|
|
|
def _markdown_text(markdown):
    """Return the markdown string of one layout result, or "" if there is none."""
    if isinstance(markdown, str):
        return markdown
    if isinstance(markdown, dict):
        # A JSON null under "text" would otherwise leak None into the page list.
        return markdown.get("text") or ""
    return ""


def process_file(file_path, file_type):
    """Send a document to the layout-parsing API and gather what the UI needs.

    Args:
        file_path: Path of the uploaded file on disk.
        file_type: ``"pdf"`` posts ``fileType`` 0 and previews the file through
            ``pdf_to_images``; any other value posts ``fileType`` 1 and opens
            the file as a single image.

    Returns:
        A dict with the keys ``original_file`` (the input path),
        ``markdown_contents`` and ``clean_markdown_contents`` (one entry per
        layout result), ``pdf_images`` (preview images, ``None`` if PDF
        rendering failed) and ``api_response`` (the decoded JSON reply).

    Raises:
        gr.Error: If no file was supplied, or if reading the file, calling the
            API or building the preview fails.
    """
    if not file_path:
        # Without this guard the user sees a cryptic open(None) TypeError.
        raise gr.Error("Please upload a file before starting the analysis")

    try:
        with open(file_path, "rb") as f:
            file_data = base64.b64encode(f.read()).decode("ascii")

        response = requests.post(
            API_URL,
            json={"file": file_data, "fileType": 0 if file_type == "pdf" else 1},
            headers={
                "Authorization": f"token {TOKEN}",
                "Content-Type": "application/json",
            },
            timeout=60,
        )
        response.raise_for_status()

        result = response.json()
        # Tolerate null values as well as missing keys in the reply.
        layout_results = (result.get("result") or {}).get("layoutParsingResults") or []

        markdown_contents = []
        clean_markdown_contents = []
        for res in layout_results:
            # Always derive a fresh value so a page with an unexpected markdown
            # payload cannot reuse the previous page's text.
            original = _markdown_text(res.get("markdown"))
            markdown_contents.append(original)
            clean_markdown_contents.append(clean_markdown_text(original))

        if file_type == "pdf":
            images = pdf_to_images(file_path)
        else:
            images = [Image.open(file_path)]

        return {
            "original_file": file_path,
            "markdown_contents": markdown_contents,
            "clean_markdown_contents": clean_markdown_contents,
            "pdf_images": images,
            "api_response": result,
        }

    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}") from e
|
|
|
|
|
def create_zip_file(results):
    """Bundle the analysis results into a timestamped ZIP archive.

    The archive contains the uploaded file under ``original/`` (when it still
    exists on disk), every non-empty markdown page under
    ``markdown/original/markdown_<n>.md`` and the raw API reply as
    ``api_response.json``.

    Args:
        results: The dict produced by ``process_file``.

    Returns:
        Path of the ZIP file, created inside a fresh temporary directory.
        The directory is left in place so the file can be served for download.

    Raises:
        gr.Error: If there are no results yet or the archive cannot be written.
    """
    if results is None:
        # The state is empty until a document has been analysed.
        raise gr.Error("No analysis results to export; analyze a document first")

    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        zip_path = os.path.join(tempfile.mkdtemp(), f"analysis_results_{timestamp}.zip")

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            original_path = results.get("original_file", "")
            if original_path and Path(original_path).exists():
                zipf.write(original_path, f"original/{Path(original_path).name}")

            # "or []" also covers a key that is present but set to None.
            for i, md_content in enumerate(results.get("markdown_contents") or []):
                if md_content:
                    zipf.writestr(f"markdown/original/markdown_{i + 1}.md", md_content)

            api_response = results.get("api_response", {})
            zipf.writestr(
                "api_response.json",
                json.dumps(api_response, indent=2, ensure_ascii=False),
            )

        return zip_path

    except Exception as e:
        raise gr.Error(f"Error creating ZIP file: {str(e)}") from e
|
|
|
|
|
def export_markdown(results):
    """Write all original markdown pages to one timestamped ``.md`` file.

    Pages are joined with a blank line between them and written as UTF-8.

    Args:
        results: The dict produced by ``process_file``; only
            ``markdown_contents`` is read.

    Returns:
        Path of the markdown file, created inside a fresh temporary directory.
        The directory is left in place so the file can be served for download.

    Raises:
        gr.Error: If there is nothing to export or the file cannot be written.
    """
    markdowns = (results or {}).get("markdown_contents") or []
    if not markdowns:
        # Raised outside the try block so the message reaches the user as-is
        # instead of being re-wrapped by the generic handler below.
        raise gr.Error("No markdown content to export")

    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_path = os.path.join(tempfile.mkdtemp(), f"original_markdown_{timestamp}.md")

        # A page without text may be None; treat it as empty so join() succeeds.
        content = "\n\n".join(page or "" for page in markdowns)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

        return file_path

    except Exception as e:
        raise gr.Error(f"Error exporting markdown: {str(e)}") from e
|
|
|
|
|
# Labels of the display-mode radio. The render callback compares against the
# same constants so the choices and the comparison cannot drift apart.
_MODE_ORIGINAL = "Original Markdown"
_MODE_CLEANED = "Cleaned Text"


def _render_markdown(mode, res):
    """Return the first page of the result text that matches *mode*.

    Yields "" when there are no results yet or the selected page list is empty.
    Only the first entry is shown, as in the original handlers.
    """
    if not res:
        return ""
    key = "markdown_contents" if mode == _MODE_ORIGINAL else "clean_markdown_contents"
    pages = res.get(key) or []
    return pages[0] if pages else ""


# NOTE(review): the layout nesting below was reconstructed from the component
# order and CSS class names because the source had lost its indentation --
# confirm it against the intended page structure.
with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
    # Holds the dict returned by process_file between events.
    results_state = gr.State()

    with gr.Column(elem_classes=["header-container"]):
        with gr.Column(elem_classes=["logo-container"]):
            gr.Image("pp-structurev3_altered.png", elem_classes=["logo-img"], show_label=False)

        with gr.Row(elem_classes=["nav-buttons"]):
            gr.Button("GitHub", link="https://github.com/PaddlePaddle/PaddleOCR", elem_classes=["nav-button"])
            gr.Button("Homepage", link="https://paddleocr.ai", elem_classes=["nav-button"])

    with gr.Column(elem_classes=["upload-section"]):
        file_type = gr.Radio(
            ["pdf", "image"],
            label="File type",
            value="pdf",
            interactive=True
        )
        file_input = gr.File(
            label="Upload document",
            file_types=[".pdf", ".jpg", ".jpeg", ".png"],
            type="filepath"
        )
        process_btn = gr.Button("Analyze document", variant="primary")

    # Hidden until an analysis starts; toggled by the handlers below.
    loading_spinner = gr.Column(
        visible=False,
        elem_classes=["loader-container"]
    )
    with loading_spinner:
        gr.HTML("""
        <div class="loader"></div>
        <p>Wait...</p>
        """)

    with gr.Row(elem_classes=["result-container"]):
        with gr.Column(elem_classes=["pdf-preview"]):
            gr.Markdown("### Original document preview")
            pdf_display = gr.Gallery(
                label="PDF page",
                show_label=False,
                elem_classes=["gallery-container"]
            )

        with gr.Column(elem_classes=["markdown-result"]):
            with gr.Row(elem_classes=["radio-group"]):
                display_mode = gr.Radio(
                    [_MODE_ORIGINAL, _MODE_CLEANED],
                    label="Display Mode",
                    value=_MODE_ORIGINAL,
                    interactive=True
                )
            markdown_display = gr.HTML(label="Analysis Results")
            with gr.Column(elem_classes=["download-section"]):
                gr.Markdown("### Result Export")
                with gr.Row():
                    download_md_btn = gr.Button("Download Original Markdown", variant="secondary")
                    download_all_btn = gr.Button("Download Complete Analysis Results (ZIP)", variant="primary")
                download_file = gr.File(visible=False, label="Download file", elem_classes=["file-download"])

    def toggle_spinner():
        """Make the loading indicator visible."""
        return gr.update(visible=True)

    def hide_spinner():
        """Hide the loading indicator again."""
        return gr.update(visible=False)

    # Show spinner -> call the API -> hide spinner (via .then, so it is hidden
    # even when the analysis fails) -> refresh the preview and the text.
    process_btn.click(
        toggle_spinner,
        outputs=[loading_spinner]
    ).then(
        process_file,
        inputs=[file_input, file_type],
        outputs=[results_state]
    ).then(
        hide_spinner,
        outputs=[loading_spinner]
    ).success(
        lambda res: res["pdf_images"] if res and res.get("pdf_images") else [],
        inputs=[results_state],
        outputs=[pdf_display]
    ).success(
        # Respect the currently selected mode instead of always showing the
        # original markdown.
        _render_markdown,
        inputs=[display_mode, results_state],
        outputs=[markdown_display]
    )

    # The previous handler compared against a label that is not one of the
    # radio choices, so "Original Markdown" always rendered the cleaned text.
    display_mode.change(
        _render_markdown,
        inputs=[display_mode, results_state],
        outputs=[markdown_display]
    )

    # Both download buttons write a file, then reveal the shared File component.
    download_md_btn.click(
        export_markdown,
        inputs=[results_state],
        outputs=[download_file]
    ).then(
        lambda x: gr.update(visible=True),
        inputs=[download_file],
        outputs=[download_file]
    )

    download_all_btn.click(
        create_zip_file,
        inputs=[results_state],
        outputs=[download_file]
    ).then(
        lambda x: gr.update(visible=True),
        inputs=[download_file],
        outputs=[download_file]
    )


if __name__ == "__main__":
    # share=True asks Gradio to create a public share link for the app.
    demo.launch(share=True)