|
import base64
|
|
import io
|
|
import json
|
|
import os
|
|
import re
|
|
import tempfile
|
|
import zipfile
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import gradio as gr
|
|
import requests
|
|
from PIL import Image
|
|
import pdf2image
|
|
|
|
# Endpoint that process_file posts the base64-encoded document to.
API_URL = "https://t707h6d9q6oftbx3.aistudio-app.com/layout-parsing"

# Token sent in the Authorization header; the API_TOKEN environment variable
# takes precedence over the built-in fallback.
# NOTE(review): the fallback is a credential committed to source control --
# consider rotating it and requiring API_TOKEN to be set in the environment.
TOKEN = os.environ.get("API_TOKEN", "c9e4aaf9634724e215690ba66a66dbdbdf3222a2")
|
|
|
|
CSS = """
|
|
:root {
|
|
--sand-color: #FAF9F6;
|
|
--white: #ffffff;
|
|
--shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
|
|
--text-color: #FAF9F6;
|
|
--black:#000000;
|
|
}
|
|
|
|
body {
|
|
display: flex;
|
|
justify-content: center;
|
|
background-color: var(--sand-color);
|
|
color: var(--text-color);
|
|
}
|
|
|
|
.gradio-container {
|
|
max-width: 1200px;
|
|
width: 100%;
|
|
margin: 20px auto;
|
|
padding: 20px;
|
|
background-color: var(--white);
|
|
border-radius: 8px;
|
|
box-shadow: var(--shadow);
|
|
}
|
|
|
|
|
|
#component-0,
|
|
#tabs,
|
|
#settings {
|
|
background-color: var(--white) !important;
|
|
border-radius: 8px;
|
|
padding: 15px;
|
|
}
|
|
|
|
.upload-section {
|
|
width: 100%;
|
|
max-width: 600px;
|
|
margin: 0 auto 30px;
|
|
padding: 20px;
|
|
background-color: var(--sand-color) !important;
|
|
border-radius: 8px;
|
|
box-shadow: var(--shadow);
|
|
}
|
|
|
|
.center-content {
|
|
display: flex;
|
|
flex-direction: column;
|
|
align-items: center;
|
|
text-align: center;
|
|
margin-bottom: 20px;
|
|
}
|
|
|
|
.header {
|
|
margin-bottom: 30px;
|
|
}
|
|
|
|
.result-container,
|
|
.pdf-preview,
|
|
.markdown-result,
|
|
.download-section {
|
|
background-color: var(--white);
|
|
border-radius: 8px;
|
|
box-shadow: var(--shadow);
|
|
padding: 20px;
|
|
}
|
|
|
|
.result-container {
|
|
display: flex;
|
|
gap: 20px;
|
|
margin-bottom: 30px;
|
|
}
|
|
|
|
.pdf-preview, .markdown-result {
|
|
flex: 1;
|
|
}
|
|
|
|
button {
|
|
background-color: var(--text-color) !important;
|
|
color: var(--black) !important;
|
|
border: none !important;
|
|
border-radius: 4px;
|
|
padding: 8px 16px;
|
|
}
|
|
|
|
button:hover {
|
|
opacity: 0.8 !important;
|
|
}
|
|
|
|
.radio-group {
|
|
margin-bottom: 15px !important;
|
|
}
|
|
|
|
.file-download {
|
|
margin-top: 15px !important;
|
|
}
|
|
.loader {
|
|
border: 5px solid #f3f3f3;
|
|
border-top: 5px solid #3498db;
|
|
border-radius: 50%;
|
|
width: 50px;
|
|
height: 50px;
|
|
animation: spin 1s linear infinite;
|
|
margin: 20px auto;
|
|
}
|
|
|
|
@keyframes spin {
|
|
0% { transform: rotate(0deg); }
|
|
100% { transform: rotate(360deg); }
|
|
}
|
|
|
|
.loader-container {
|
|
text-align: center;
|
|
margin: 20px 0;
|
|
}
|
|
"""
|
|
|
|
|
|
def clean_markdown_text(text):
    """Return *text* as plain text with markup tags removed.

    Falsy input (``None`` or an empty string) yields ``""``. Otherwise every
    ``<...>`` span is deleted, runs of three or more newlines are collapsed
    to a single blank line, and surrounding whitespace is trimmed.
    """
    if text:
        without_tags = re.sub(r'<[^>]+>', '', text)
        collapsed = re.sub(r'\n{3,}', '\n\n', without_tags)
        return collapsed.strip()
    return ""
|
|
|
|
|
|
def pdf_to_images(pdf_path):
    """Render the pages of a PDF as images for the preview gallery.

    Args:
        pdf_path: Path of the PDF file handed to ``pdf2image.convert_from_path``.

    Returns:
        A list with the converted page images, or ``None`` when the
        conversion fails for any reason.
    """
    try:
        return list(pdf2image.convert_from_path(pdf_path))
    except Exception:
        # Best-effort preview: a failed conversion is reported as None rather
        # than raised. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
|
|
|
|
|
|
def _extract_markdown_text(markdown):
    """Return the markdown text held by one layout result's ``markdown`` field.

    The field is handled as either a plain string or a dict with a ``text``
    key; any other shape (including a missing or ``None`` text) yields ``""``.
    """
    if isinstance(markdown, str):
        return markdown
    if isinstance(markdown, dict):
        return markdown.get("text") or ""
    return ""


def process_file(file_path, file_type):
    """Send a document to the layout-parsing API and collect its results.

    The file is read, base64-encoded and posted as JSON to ``API_URL`` with
    ``TOKEN`` in the Authorization header. Each entry of the response's
    ``result.layoutParsingResults`` contributes one markdown string and one
    cleaned (tag-stripped) counterpart.

    Args:
        file_path: Path of the uploaded file on disk.
        file_type: ``"pdf"`` is sent as ``fileType`` 0 and previewed through
            ``pdf_to_images``; any other value is sent as ``fileType`` 1 and
            opened as a single image.

    Returns:
        A dict with the keys ``original_file``, ``markdown_contents``,
        ``clean_markdown_contents``, ``pdf_images`` and ``api_response``.

    Raises:
        gr.Error: If no file was provided, or if reading the file, calling
            the API or building the preview fails.
    """
    if not file_path:
        # Without this guard open(None) produces an opaque TypeError message.
        raise gr.Error("Please upload a file before starting the analysis")

    try:
        with open(file_path, "rb") as f:
            file_bytes = f.read()

        file_data = base64.b64encode(file_bytes).decode("ascii")
        headers = {
            "Authorization": f"token {TOKEN}",
            "Content-Type": "application/json"
        }

        response = requests.post(
            API_URL,
            json={"file": file_data, "fileType": 0 if file_type == "pdf" else 1},
            headers=headers,
            timeout=60
        )
        response.raise_for_status()

        result = response.json()
        # Tolerate a null "result" / "layoutParsingResults" in the payload.
        layout_results = (result.get("result") or {}).get("layoutParsingResults") or []

        markdown_contents = []
        clean_markdown_contents = []
        for res in layout_results:
            # The helper always returns a string, so an unexpected markdown
            # shape can neither raise nor reuse the previous item's text.
            original = _extract_markdown_text(res.get("markdown", {}))
            markdown_contents.append(original)
            clean_markdown_contents.append(clean_markdown_text(original))

        if file_type == "pdf":
            images = pdf_to_images(file_path)
        else:
            images = [Image.open(file_path)]

        return {
            "original_file": file_path,
            "markdown_contents": markdown_contents,
            "clean_markdown_contents": clean_markdown_contents,
            "pdf_images": images,
            "api_response": result
        }

    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}") from e
|
|
|
|
|
|
def create_zip_file(results):
    """Bundle the analysis results into a ZIP archive and return its path.

    The archive is written into a fresh temporary directory (which this
    function does not remove) and contains:

    * ``original/<name>`` -- the uploaded file, when it still exists on disk;
    * ``markdown/original/markdown_<n>.md`` -- one file per non-empty entry of
      ``markdown_contents``, numbered by its 1-based position;
    * ``api_response.json`` -- the stored API response, pretty-printed.

    Args:
        results: The dict produced by ``process_file``.

    Returns:
        Path of the created ``analysis_results_<timestamp>.zip`` file.

    Raises:
        gr.Error: If there are no results yet, or if writing the archive fails.
    """
    if not results:
        # The state is empty until a document has been analysed; report that
        # directly instead of an AttributeError on None.
        raise gr.Error("No analysis results to export. Analyze a document first.")

    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        zip_filename = f"analysis_results_{timestamp}.zip"
        zip_path = os.path.join(tempfile.mkdtemp(), zip_filename)

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            original_path = results.get("original_file", "")
            if original_path and Path(original_path).exists():
                zipf.write(original_path, f"original/{Path(original_path).name}")

            markdowns = results.get("markdown_contents") or []
            for page_number, md_content in enumerate(markdowns, start=1):
                if md_content:
                    zipf.writestr(f"markdown/original/markdown_{page_number}.md", md_content)

            api_response = results.get("api_response", {})
            zipf.writestr("api_response.json", json.dumps(api_response, indent=2, ensure_ascii=False))

        return zip_path

    except Exception as e:
        raise gr.Error(f"Error creating ZIP file: {str(e)}") from e
|
|
|
|
|
|
def export_markdown(results):
    """Write all original markdown pages to one ``.md`` file and return its path.

    The entries of ``markdown_contents`` are joined with a blank line between
    them and written as UTF-8 to ``original_markdown_<timestamp>.md`` inside a
    fresh temporary directory (which this function does not remove).

    Args:
        results: The dict produced by ``process_file``; may be ``None`` when
            no document has been analysed yet.

    Returns:
        Path of the written markdown file.

    Raises:
        gr.Error: If there is no markdown to export, or if writing fails.
    """
    markdowns = (results or {}).get("markdown_contents") or []
    if not markdowns:
        # Raised outside the try block so the message is shown as-is instead
        # of being re-wrapped with the generic "Error exporting" prefix.
        raise gr.Error("No markdown content to export")

    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"original_markdown_{timestamp}.md"
        content = "\n\n".join(markdowns)

        file_path = os.path.join(tempfile.mkdtemp(), filename)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

        return file_path

    except Exception as e:
        raise gr.Error(f"Error exporting markdown: {str(e)}") from e
|
|
|
|
|
|
# Labels of the display-mode radio. The widget and its handler share these
# constants so the comparison cannot drift from the choices shown to the user
# (the handler previously compared against a label that was never offered).
ORIGINAL_MODE = "Original Markdown"
CLEANED_MODE = "Cleaned Text"


def _first_page_text(mode, res):
    """Return the first result page's text for the selected display mode.

    Yields ``""`` when there are no results yet or the relevant list is empty,
    instead of failing on ``[0]``.
    """
    if not res:
        return ""
    key = "markdown_contents" if mode == ORIGINAL_MODE else "clean_markdown_contents"
    pages = res.get(key) or []
    return pages[0] if pages else ""


def _preview_images(res):
    """Return the stored preview images, or an empty list when there are none."""
    if res and res.get("pdf_images"):
        return res["pdf_images"]
    return []


# NOTE(review): the nesting of widgets inside the layout containers below was
# reconstructed from the CSS class names -- confirm against the intended layout.
with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
    # Holds the dict returned by process_file between events.
    results_state = gr.State()

    with gr.Column(elem_classes=["center-content", "header"]):
        gr.Markdown("# Document Parsing System")
        gr.Markdown("### Upload PDF or image files for analysis")

    with gr.Column(elem_classes=["center-content", "upload-section"]):
        file_type = gr.Radio(
            ["pdf", "image"],
            label="File type",
            value="pdf",
            interactive=True
        )
        file_input = gr.File(
            label="Upload document",
            file_types=[".pdf", ".jpg", ".jpeg", ".png"],
            type="filepath"
        )
        process_btn = gr.Button("Analyze document", variant="primary")

    loading_spinner = gr.Column(
        visible=False,
        elem_classes=["loader-container"]
    )
    with loading_spinner:
        gr.HTML("""
            <div class="loader"></div>
            <p>Wait...</p>
        """)

    with gr.Row(elem_classes=["result-container"]):
        with gr.Column(elem_classes=["pdf-preview"]):
            gr.Markdown("### Original document preview")
            pdf_display = gr.Gallery(label="PDF page", show_label=False)

        with gr.Column(elem_classes=["markdown-result"]):
            with gr.Row(elem_classes=["radio-group"]):
                display_mode = gr.Radio(
                    [ORIGINAL_MODE, CLEANED_MODE],
                    label="Display Mode",
                    value=ORIGINAL_MODE,
                    interactive=True
                )
            markdown_display = gr.HTML(label="Analysis Results")

    with gr.Column(elem_classes=["download-section"]):
        gr.Markdown("### Result Export")
        with gr.Row():
            download_md_btn = gr.Button("Download Original Markdown", variant="secondary")
            download_all_btn = gr.Button("Download Complete Analysis Results (ZIP)", variant="primary")
        download_file = gr.File(visible=False, label="Download file", elem_classes=["file-download"])

    def toggle_spinner():
        """Make the loading spinner visible."""
        return gr.update(visible=True)

    def hide_spinner():
        """Hide the loading spinner."""
        return gr.update(visible=False)

    # Show spinner -> run the analysis -> hide spinner -> refresh both panes.
    process_btn.click(
        toggle_spinner,
        outputs=[loading_spinner]
    ).then(
        process_file,
        inputs=[file_input, file_type],
        outputs=[results_state]
    ).then(
        hide_spinner,
        outputs=[loading_spinner]
    ).success(
        _preview_images,
        inputs=[results_state],
        outputs=[pdf_display]
    ).success(
        # Honour the currently selected display mode after a new analysis.
        _first_page_text,
        inputs=[display_mode, results_state],
        outputs=[markdown_display]
    )

    display_mode.change(
        _first_page_text,
        inputs=[display_mode, results_state],
        outputs=[markdown_display]
    )

    download_md_btn.click(
        export_markdown,
        inputs=[results_state],
        outputs=[download_file]
    ).then(
        lambda x: gr.update(visible=True),
        inputs=[download_file],
        outputs=[download_file]
    )

    download_all_btn.click(
        create_zip_file,
        inputs=[results_state],
        outputs=[download_file]
    ).then(
        lambda x: gr.update(visible=True),
        inputs=[download_file],
        outputs=[download_file]
    )


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)