# Repository page header (not part of the program): XinyueZhou - "Update app.py" - commit 8767ac6 (verified)
import base64
import io
import json
import os
import re
import tempfile
import zipfile
import shutil
import atexit
from datetime import datetime
from pathlib import Path
import gradio as gr
import requests
from PIL import Image
# Optional PDF rasterisation backends. Each flag records whether the
# corresponding import succeeded so later code can choose a converter.
try:
    import pdf2image
except ImportError:
    PDF2IMAGE_AVAILABLE = False
else:
    PDF2IMAGE_AVAILABLE = True

try:
    import fitz  # PyMuPDF
except ImportError:
    PYGMUPDF_AVAILABLE = False
else:
    PYGMUPDF_AVAILABLE = True

# API Configuration
API_URL = "https://t707h6d9q6oftbx3.aistudio-app.com/layout-parsing"
# Taken from the API_TOKEN environment variable; None when it is not set.
TOKEN = os.environ.get("API_TOKEN")
# Temporary directory management
temp_dirs = []


def cleanup():
    """Remove every temporary directory recorded in ``temp_dirs``.

    Registered with ``atexit`` below, so it runs when the interpreter exits.
    Removal is best-effort: directories that are already gone or cannot be
    deleted are skipped silently.
    """
    for dir_path in temp_dirs:
        # ignore_errors keeps this best-effort without a bare ``except`` that
        # would also swallow KeyboardInterrupt / SystemExit.
        shutil.rmtree(dir_path, ignore_errors=True)
    # Forget the paths so a repeated call does not retry them.
    temp_dirs.clear()


atexit.register(cleanup)
def image_to_base64(image_path):
    """Return the image at ``image_path`` as a base64 ``data:`` URI.

    Args:
        image_path: Path (``str`` or ``Path``) of the image file to encode.

    Returns:
        A ``data:<mime>;base64,...`` string, or ``""`` when ``image_path`` is
        empty or does not exist.
    """
    import mimetypes

    if not image_path or not Path(image_path).exists():
        return ""
    # Derive the MIME type from the file extension instead of always claiming
    # PNG; fall back to PNG (the previous fixed value) for unknown or
    # non-image extensions.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/png"
    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"
# Locate the logo file next to this script and pre-encode it as a data URI
# for the page header (empty string when the file is absent).
current_dir = Path(__file__).parent
logo_path = current_dir.joinpath("pp-structurev3.png")
logo_base64 = image_to_base64(logo_path)
# Stylesheet passed to gr.Blocks(css=...). It defines the colour variables,
# the page/container layout, and the classes referenced through
# ``elem_classes`` and inline HTML in the UI (upload-section, logo-container,
# nav-bar, result-container, loader, no-preview-message, ...).
CSS = """
:root {
--sand-color: #FAF9F6;
--white: #ffffff;
--shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
--text-color: #F3F4F7;
--black:#000000;
--link-hover: #2b6cb0;
--content-width: 1200px;
}
body {
display: flex;
justify-content: center;
background-color: var(--sand-color);
color: var(--text-color);
font-family: Arial, sans-serif;
}
.gradio-container {
max-width: var(--content-width) !important;
width: 100% !important;
margin: 20px auto;
padding: 20px;
background-color: var(--white);
}
#component-0,
#tabs,
#settings {
background-color: var(--white) !important;
padding: 15px;
}
.upload-section {
width: 100%;
margin: 0 auto 30px;
padding: 20px;
background-color: var(--sand-color) !important;
border-radius: 8px;
box-shadow: var(--shadow);
}
.center-content {
display: flex;
flex-direction: column;
align-items: center;
text-align: center;
margin-bottom: 20px;
}
.header {
margin-bottom: 30px;
width: 100%;
}
.logo-container {
width: 100%;
margin-bottom: 20px;
}
.logo-img {
width: 100%;
max-width: var(--content-width);
margin: 0 auto;
display: block;
}
.nav-bar {
display: flex;
justify-content: center;
background-color: var(--white);
padding: 15px 0;
box-shadow: var(--shadow);
margin-bottom: 20px;
}
.nav-links {
display: flex;
gap: 30px;
width: 100%;
justify-content: center;
}
.nav-link {
color: var(--black);
text-decoration: none;
font-weight: bold;
font-size: 24px;
transition: color 0.2s;
}
.nav-link:hover {
color: var(--link-hover);
text-decoration: none;
}
.result-container {
display: flex;
gap: 20px;
margin-bottom: 30px;
width: 100%;
}
.pdf-preview {
flex: 1;
min-width: 0;
}
.markdown-result {
flex: 1;
min-width: 0;
}
.gallery-container {
width: 100% !important;
}
.gallery-item {
width: 100% !important;
height: auto !important;
aspect-ratio: auto !important;
}
button {
background-color: var(--text-color) !important;
color: var(--black) !important;
border: none !important;
border-radius: 4px;
padding: 8px 16px;
}
button:hover {
opacity: 0.8 !important;
}
.radio-group {
margin-bottom: 15px !important;
}
.file-download {
margin-top: 15px !important;
}
.loader {
border: 5px solid #f3f3f3;
border-top: 5px solid #3498db;
border-radius: 50%;
width: 50px;
height: 50px;
animation: spin 1s linear infinite;
margin: 20px auto;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.loader-container {
text-align: center;
margin: 20px 0;
}
/* PDF Viewer specific styles */
.pdf-viewer-container {
width: 100%;
height: 600px;
border: 1px solid #ddd;
margin-top: 15px;
background-color: #f9f9f9;
display: flex;
justify-content: center;
align-items: center;
}
.pdf-viewer-container embed {
width: 100%;
height: 100%;
}
.no-preview-message {
color: #666;
font-size: 16px;
text-align: center;
padding: 20px;
}
"""
def clean_markdown_text(text):
    """Strip HTML-style tags and collapse long runs of newlines.

    Anything matching ``<...>`` is removed, three or more consecutive
    newlines become exactly two, and the result is trimmed. Falsy input
    yields an empty string.
    """
    if text:
        without_tags = re.sub(r'<[^>]+>', '', text)
        collapsed = re.sub(r'\n{3,}', '\n\n', without_tags)
        return collapsed.strip()
    return ""
def pdf_to_images(pdf_path, dpi=150):
    """Render the pages of a PDF to PIL images, trying two backends.

    pdf2image is tried first when it was importable; PyMuPDF is the fallback.
    A failure in either backend is printed and the next option is tried.

    Args:
        pdf_path: Path of the PDF file.
        dpi: Rendering resolution passed to the backend.

    Returns:
        A list of PIL images, or ``None`` when no backend is available or
        every attempted backend raised.
    """
    if PDF2IMAGE_AVAILABLE:
        try:
            return pdf2image.convert_from_path(pdf_path, dpi=dpi)
        except Exception as e:
            print(f"pdf2image conversion failed: {str(e)}")
    if PYGMUPDF_AVAILABLE:
        try:
            # The context manager closes the document even when a page fails
            # to render, so the underlying file handle is not leaked.
            with fitz.open(pdf_path) as doc:
                images = []
                for page in doc:
                    pix = page.get_pixmap(dpi=dpi)
                    images.append(
                        Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    )
                return images
        except Exception as e:
            print(f"PyMuPDF conversion failed: {str(e)}")
    return None
def create_pdf_preview(pdf_path):
    """Build an HTML snippet that embeds the PDF inline as a base64 data URI.

    Returns a "no PDF" message div when the path is empty or does not exist,
    and a failure message div (after printing the error) when reading or
    encoding the file raises.
    """
    if not (pdf_path and Path(pdf_path).exists()):
        return '<div class="no-preview-message">No PDF file available</div>'
    try:
        # The whole document is inlined into the <embed> src attribute
        with open(pdf_path, "rb") as pdf_file:
            encoded_pdf = base64.b64encode(pdf_file.read()).decode("ascii")
        return (
            '\n<div class="pdf-viewer-container">\n'
            '<embed\n'
            f'src="data:application/pdf;base64,{encoded_pdf}"\n'
            'type="application/pdf"\n'
            'width="100%"\n'
            'height="100%"\n'
            '>\n'
            '</div>\n'
        )
    except Exception as e:
        print(f"Failed to create PDF preview: {str(e)}")
        return '<div class="no-preview-message">PDF preview generation failed</div>'
def process_file(file_path, file_type):
    """Send an uploaded document to the layout-parsing API and collect results.

    Args:
        file_path: Path of the uploaded file.
        file_type: ``"pdf"`` or ``"image"``; must agree with the file extension.

    Returns:
        A dict with the keys ``original_file``, ``file_type``,
        ``markdown_contents`` (markdown text per layout result),
        ``clean_markdown_contents`` (the same text passed through
        ``clean_markdown_text``), ``pdf_images`` (preview images, or ``None``
        when PDF rendering failed), ``pdf_preview`` (HTML snippet) and
        ``api_response`` (the decoded JSON response).

    Raises:
        gr.Error: If validation fails, the API request fails, or any other
            error occurs while processing.
    """
    try:
        if not file_path:
            raise ValueError("Please upload a file first")
        lowered_path = str(file_path).lower()
        if file_type == "pdf" and not lowered_path.endswith('.pdf'):
            raise ValueError("Please upload a valid PDF file")
        if file_type == "image" and not lowered_path.endswith(('.jpg', '.jpeg', '.png')):
            raise ValueError("Please upload a valid image file (JPG/JPEG/PNG)")
        if not TOKEN:
            # Otherwise the request would be sent with "token None" and fail
            # with an opaque authorization error from the server.
            raise ValueError("API_TOKEN environment variable is not set")

        # Read file content
        with open(file_path, "rb") as f:
            file_bytes = f.read()

        # Call API for processing
        file_data = base64.b64encode(file_bytes).decode("ascii")
        headers = {
            "Authorization": f"token {TOKEN}",
            "Content-Type": "application/json"
        }
        response = requests.post(
            API_URL,
            json={"file": file_data, "fileType": 0 if file_type == "pdf" else 1},
            headers=headers,
            timeout=60
        )
        response.raise_for_status()

        # Parse API response; ``or`` guards against keys present with a
        # null value, which ``.get(key, default)`` would pass through.
        result = response.json()
        layout_results = (result.get("result") or {}).get("layoutParsingResults") or []
        markdown_contents = []
        clean_markdown_contents = []
        for res in layout_results:
            markdown = res.get("markdown") or {}
            original = markdown if isinstance(markdown, str) else (markdown.get("text") or "")
            markdown_contents.append(original)
            clean_markdown_contents.append(clean_markdown_text(original))

        # Generate preview content
        if file_type == "pdf":
            images = pdf_to_images(file_path)
            pdf_preview = create_pdf_preview(file_path)
        else:
            # Image.open is lazy; copy() loads the pixel data so the preview
            # does not depend on the upload file staying open or on disk.
            with Image.open(file_path) as img:
                images = [img.copy()]
            pdf_preview = '<div class="no-preview-message">Image file preview</div>'

        return {
            "original_file": file_path,
            "file_type": file_type,
            "markdown_contents": markdown_contents,
            "clean_markdown_contents": clean_markdown_contents,
            "pdf_images": images,
            "pdf_preview": pdf_preview,
            "api_response": result
        }
    except requests.exceptions.RequestException as e:
        raise gr.Error(f"API request failed: {str(e)}") from e
    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}") from e
def create_zip_file(results):
    """Bundle all analysis results into a ZIP archive and return its path.

    The archive contains the original upload (``original/``), per-page
    markdown in original and cleaned form (``markdown/original/`` and
    ``markdown/clean/``), the raw API response (``api_response.json``) and,
    for PDFs with rendered pages, one JPEG per page (``images/``).

    Args:
        results: The dict produced by ``process_file``.

    Returns:
        Path of the ZIP file, created in a fresh temporary directory that is
        recorded in ``temp_dirs``.

    Raises:
        gr.Error: If ``results`` is empty or the archive cannot be written.
    """
    try:
        if not results:
            raise ValueError("No results to export")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        zip_filename = f"analysis_results_{timestamp}.zip"
        temp_dir = tempfile.mkdtemp()
        temp_dirs.append(temp_dir)
        zip_path = os.path.join(temp_dir, zip_filename)
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Add original file
            original_path = results.get("original_file", "")
            if original_path and Path(original_path).exists():
                zipf.write(original_path, f"original/{Path(original_path).name}")

            # Add markdown content
            page_markdown = zip(
                results.get("markdown_contents", []),
                results.get("clean_markdown_contents", [])
            )
            for page_no, (orig_md, clean_md) in enumerate(page_markdown, start=1):
                if orig_md:
                    zipf.writestr(f"markdown/original/page_{page_no}.md", orig_md)
                if clean_md:
                    zipf.writestr(f"markdown/clean/page_{page_no}.md", clean_md)

            # Add API response
            api_response = results.get("api_response", {})
            zipf.writestr("api_response.json", json.dumps(api_response, indent=2, ensure_ascii=False))

            # Add PDF images if available
            if results.get("file_type") == "pdf" and results.get("pdf_images"):
                for page_no, img in enumerate(results["pdf_images"], start=1):
                    # JPEG cannot store alpha/palette modes; convert those.
                    if img.mode not in ("RGB", "L", "CMYK"):
                        img = img.convert("RGB")
                    # Encode in memory so no loose page_N.jpg files are left
                    # next to the archive in the temporary directory.
                    buffer = io.BytesIO()
                    img.save(buffer, "JPEG", quality=85)
                    zipf.writestr(f"images/page_{page_no}.jpg", buffer.getvalue())
        return zip_path
    except Exception as e:
        raise gr.Error(f"Error creating ZIP file: {str(e)}") from e
def export_markdown(results):
    """Write all original markdown pages to one ``.md`` file and return its path.

    Pages are joined with a blank line between them.

    Args:
        results: The dict produced by ``process_file``.

    Returns:
        Path of the markdown file, created in a fresh temporary directory
        that is recorded in ``temp_dirs``.

    Raises:
        gr.Error: If there are no results, no markdown content, or the file
            cannot be written.
    """
    try:
        if not results:
            raise ValueError("No results to export")
        markdowns = results.get("markdown_contents", [])
        if not markdowns:
            # Raised as ValueError like the check above; the handler below
            # is the single place that converts failures to gr.Error.
            raise ValueError("No markdown content to export")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"markdown_export_{timestamp}.md"
        # ``or ""`` tolerates a page whose markdown is None.
        content = "\n\n".join(md or "" for md in markdowns)
        temp_dir = tempfile.mkdtemp()
        temp_dirs.append(temp_dir)
        file_path = os.path.join(temp_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return file_path
    except Exception as e:
        raise gr.Error(f"Error exporting markdown: {str(e)}") from e
# UI definition: upload controls, a two-column result view (page images on
# the left, markdown on the right) and export buttons, plus the event wiring.
with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
    # Holds the dict returned by process_file for the current document
    results_state = gr.State()

    # Header with logo
    with gr.Column(elem_classes=["logo-container"]):
        gr.HTML(f'<img src="{logo_base64}" class="logo-img">')

    # Navigation bar
    with gr.Row(elem_classes=["nav-bar"]):
        gr.HTML(
            '\n<div class="nav-links">\n'
            '<a href="https://github.com/PaddlePaddle/PaddleOCR" class="nav-link" target="_blank">GitHub</a>\n'
            '<a href="https://paddleocr.ai" class="nav-link" target="_blank">paddleocr.ai</a>\n'
            '</div>\n'
        )

    # Upload section
    with gr.Column(elem_classes=["upload-section"]):
        file_type = gr.Radio(
            ["pdf", "image"],
            label="File Type",
            value="pdf",
            interactive=True
        )
        file_input = gr.File(
            label="Upload Document",
            file_types=[".pdf", ".jpg", ".jpeg", ".png"],
            type="filepath"
        )
        process_btn = gr.Button("Analyze Document", variant="primary")
        loading_spinner = gr.Column(
            visible=False,
            elem_classes=["loader-container"]
        )
        with loading_spinner:
            gr.HTML(
                '\n<div class="loader"></div>\n'
                '<p>Processing, please wait...</p>\n'
            )

    # Results display section
    with gr.Row(elem_classes=["result-container"]):
        with gr.Column(elem_classes=["pdf-preview"]):
            gr.Markdown("### Original Document Preview")
            pdf_preview = gr.HTML(label="PDF Preview")
            pdf_gallery = gr.Gallery(
                label="PDF Pages",
                show_label=False,
                elem_classes=["gallery-container"],
                columns=[1],
                object_fit="contain",
                visible=False
            )
        with gr.Column(elem_classes=["markdown-result"]):
            with gr.Row(elem_classes=["radio-group"]):
                display_mode = gr.Radio(
                    ["Original Markdown", "Cleaned Text"],
                    label="Display Mode",
                    value="Original Markdown",
                    interactive=True
                )
            markdown_display = gr.Markdown(label="Analysis Results")

    # Download section
    with gr.Column(elem_classes=["download-section"]):
        gr.Markdown("### Result Export")
        with gr.Row():
            download_md_btn = gr.Button("Download Markdown", variant="secondary")
            download_all_btn = gr.Button("Download Full Results (ZIP)", variant="primary")
        download_file = gr.File(visible=False, label="Download File")

    # Interaction logic
    def toggle_spinner():
        """Show the loading spinner."""
        return gr.update(visible=True)

    def hide_spinner():
        """Hide the loading spinner."""
        return gr.update(visible=False)

    def render_markdown(mode, results):
        """Return the text to display for ``mode`` across all pages.

        Pages are joined with a blank line, matching what export_markdown
        writes, so multi-page documents are not truncated to the first page.
        """
        if not results or not results.get("markdown_contents"):
            return "No content"
        key = "markdown_contents" if mode == "Original Markdown" else "clean_markdown_contents"
        pages = results.get(key) or []
        return "\n\n".join(page for page in pages if page) or "No content"

    def update_display(results, mode="Original Markdown"):
        """Build the updates for the preview message, gallery and markdown.

        Returns one update per output component, in the order
        ``[pdf_preview, pdf_gallery, markdown_display]``.
        """
        if not results:
            return [
                gr.update(value='<div class="no-preview-message">No file to display</div>'),
                gr.update(visible=False, value=[]),
                gr.update(value="No content"),
            ]
        images = results.get("pdf_images") or []
        return [
            gr.update(value='<div class="no-preview-message">Preview rendered as images</div>'),
            # Visibility and value go in a single update: listing the gallery
            # twice in ``outputs`` makes the two updates compete.
            gr.update(visible=bool(images), value=images),
            gr.update(value=render_markdown(mode, results)),
        ]

    process_btn.click(
        toggle_spinner,
        outputs=[loading_spinner]
    ).then(
        process_file,
        inputs=[file_input, file_type],
        outputs=[results_state]
    ).then(
        hide_spinner,
        outputs=[loading_spinner]
    ).then(
        update_display,
        # The current display mode is passed so a pre-selected
        # "Cleaned Text" choice is honoured right after analysis.
        inputs=[results_state, display_mode],
        outputs=[pdf_preview, pdf_gallery, markdown_display]
    )

    display_mode.change(
        render_markdown,
        inputs=[display_mode, results_state],
        outputs=[markdown_display]
    )

    download_md_btn.click(
        export_markdown,
        inputs=[results_state],
        outputs=[download_file]
    ).then(
        lambda: gr.update(visible=True),
        outputs=[download_file]
    )

    download_all_btn.click(
        create_zip_file,
        inputs=[results_state],
        outputs=[download_file]
    ).then(
        lambda: gr.update(visible=True),
        outputs=[download_file]
    )
if __name__ == "__main__":
    # Report which optional PDF backends are missing before starting the UI
    backend_warnings = (
        (PDF2IMAGE_AVAILABLE, "Warning: pdf2image not available, PDF to image conversion limited"),
        (PYGMUPDF_AVAILABLE, "Warning: PyMuPDF not available, PDF fallback conversion disabled"),
    )
    for is_available, warning in backend_warnings:
        if not is_available:
            print(warning)

    # Use the logo as favicon only when the file is actually present
    favicon = str(logo_path) if logo_path.exists() else None
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        favicon_path=favicon
    )