|
import base64 |
|
import io |
|
import json |
|
import os |
|
import re |
|
import tempfile |
|
import zipfile |
|
import shutil |
|
import atexit |
|
from datetime import datetime |
|
from pathlib import Path |
|
|
|
import gradio as gr |
|
import requests |
|
from PIL import Image |
|
|
|
# Optional PDF rasterisation backends. Each import is attempted once at module
# load and the matching flag records whether that backend can be used later.
try:
    import pdf2image
    PDF2IMAGE_AVAILABLE = True
except ImportError:
    PDF2IMAGE_AVAILABLE = False

try:
    # PyMuPDF is imported under the module name ``fitz``.
    import fitz
    # NOTE(review): the flag is spelled PYGMUPDF (not PYMUPDF); other code in
    # this file reads it under this exact name, so it must not be renamed here.
    PYGMUPDF_AVAILABLE = True
except ImportError:
    PYGMUPDF_AVAILABLE = False
|
|
|
|
|
# Endpoint of the remote layout-parsing service that receives uploaded documents.
API_URL = "https://t707h6d9q6oftbx3.aistudio-app.com/layout-parsing"

# Access token taken from the API_TOKEN environment variable (None when unset).
TOKEN = os.environ.get("API_TOKEN")
|
|
|
|
|
# Registry of temporary directories created while exporting results; every
# entry is removed by ``cleanup`` when the interpreter exits.
temp_dirs = []


def cleanup():
    """Remove every temporary directory recorded in ``temp_dirs``.

    Deletion is best-effort: directories that are already gone or cannot be
    removed are skipped silently via ``ignore_errors=True`` instead of a bare
    ``except``, so interpreter-level exceptions such as ``KeyboardInterrupt``
    are no longer swallowed. The registry is drained while it is processed,
    which makes repeated calls harmless.
    """
    while temp_dirs:
        shutil.rmtree(temp_dirs.pop(), ignore_errors=True)


atexit.register(cleanup)
|
|
|
def image_to_base64(image_path):
    """Encode an image file as a base64 ``data:`` URI.

    Args:
        image_path: Location of the image (``str`` or ``Path``).

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``. The MIME type
        is guessed from the file suffix; when the suffix is unknown or does
        not map to an ``image/*`` type, ``image/png`` is used. Returns ``""``
        when ``image_path`` is empty or does not refer to an existing regular
        file.
    """
    import mimetypes  # local import: only this helper needs it

    if not image_path:
        return ""

    path = Path(image_path)
    # is_file() rather than exists(): a directory would pass exists() and then
    # fail when opened for reading.
    if not path.is_file():
        return ""

    guessed, _ = mimetypes.guess_type(path.name)
    mime = guessed if guessed and guessed.startswith("image/") else "image/png"
    payload = base64.b64encode(path.read_bytes()).decode("ascii")
    return f"data:{mime};base64,{payload}"
|
|
|
|
|
# Locate the logo shipped next to this module and embed it once at import
# time; image_to_base64 yields "" when the file is not present.
current_dir = Path(__file__).parent
logo_path = current_dir.joinpath("pp-structurev3.png")
logo_base64 = image_to_base64(logo_path)
|
|
|
# Custom stylesheet handed to ``gr.Blocks(css=CSS)``. The class selectors
# correspond to the ``elem_classes`` used by the layout (for example
# ``logo-container``, ``nav-bar``, ``upload-section``, ``result-container``)
# and to the class names used in the HTML snippets this module generates
# (``loader``, ``pdf-viewer-container``, ``no-preview-message``).
CSS = """
:root {
    --sand-color: #FAF9F6;
    --white: #ffffff;
    --shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    --text-color: #F3F4F7;
    --black:#000000;
    --link-hover: #2b6cb0;
    --content-width: 1200px;
}

body {
    display: flex;
    justify-content: center;
    background-color: var(--sand-color);
    color: var(--text-color);
    font-family: Arial, sans-serif;
}

.gradio-container {
    max-width: var(--content-width) !important;
    width: 100% !important;
    margin: 20px auto;
    padding: 20px;
    background-color: var(--white);
}

#component-0,
#tabs,
#settings {
    background-color: var(--white) !important;
    padding: 15px;
}

.upload-section {
    width: 100%;
    margin: 0 auto 30px;
    padding: 20px;
    background-color: var(--sand-color) !important;
    border-radius: 8px;
    box-shadow: var(--shadow);
}

.center-content {
    display: flex;
    flex-direction: column;
    align-items: center;
    text-align: center;
    margin-bottom: 20px;
}

.header {
    margin-bottom: 30px;
    width: 100%;
}

.logo-container {
    width: 100%;
    margin-bottom: 20px;
}

.logo-img {
    width: 100%;
    max-width: var(--content-width);
    margin: 0 auto;
    display: block;
}

.nav-bar {
    display: flex;
    justify-content: center;
    background-color: var(--white);
    padding: 15px 0;
    box-shadow: var(--shadow);
    margin-bottom: 20px;
}

.nav-links {
    display: flex;
    gap: 30px;
    width: 100%;
    justify-content: center;
}

.nav-link {
    color: var(--black);
    text-decoration: none;
    font-weight: bold;
    font-size: 24px;
    transition: color 0.2s;
}

.nav-link:hover {
    color: var(--link-hover);
    text-decoration: none;
}

.result-container {
    display: flex;
    gap: 20px;
    margin-bottom: 30px;
    width: 100%;
}

.pdf-preview {
    flex: 1;
    min-width: 0;
}

.markdown-result {
    flex: 1;
    min-width: 0;
}

.gallery-container {
    width: 100% !important;
}

.gallery-item {
    width: 100% !important;
    height: auto !important;
    aspect-ratio: auto !important;
}

button {
    background-color: var(--text-color) !important;
    color: var(--black) !important;
    border: none !important;
    border-radius: 4px;
    padding: 8px 16px;
}
button:hover {
    opacity: 0.8 !important;
}

.radio-group {
    margin-bottom: 15px !important;
}

.file-download {
    margin-top: 15px !important;
}
.loader {
    border: 5px solid #f3f3f3;
    border-top: 5px solid #3498db;
    border-radius: 50%;
    width: 50px;
    height: 50px;
    animation: spin 1s linear infinite;
    margin: 20px auto;
}

@keyframes spin {
    0% { transform: rotate(0deg); }
    100% { transform: rotate(360deg); }
}

.loader-container {
    text-align: center;
    margin: 20px 0;
}

/* PDF Viewer specific styles */
.pdf-viewer-container {
    width: 100%;
    height: 600px;
    border: 1px solid #ddd;
    margin-top: 15px;
    background-color: #f9f9f9;
    display: flex;
    justify-content: center;
    align-items: center;
}

.pdf-viewer-container embed {
    width: 100%;
    height: 100%;
}

.no-preview-message {
    color: #666;
    font-size: 16px;
    text-align: center;
    padding: 20px;
}
"""
|
|
|
def clean_markdown_text(text):
    """Return ``text`` without HTML tags and without long runs of newlines.

    Anything matching ``<...>`` is removed, three or more consecutive
    newlines are collapsed to exactly two, and surrounding whitespace is
    trimmed. Falsy input (``None`` or ``""``) yields ``""``.
    """
    if text:
        without_tags = re.sub(r'<[^>]+>', '', text)
        collapsed = re.sub(r'\n{3,}', '\n\n', without_tags)
        return collapsed.strip()
    return ""
|
|
|
def pdf_to_images(pdf_path, dpi=150):
    """Render every page of a PDF to a PIL image, trying two backends in turn.

    ``pdf2image`` is tried first when it was importable; if it is missing or
    raises, PyMuPDF (``fitz``) is tried. Failures are printed rather than
    raised so that the caller can continue without page images.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Rendering resolution passed to whichever backend is used.

    Returns:
        A list of PIL images, one per page, or ``None`` when no backend is
        available or every available backend failed.
    """
    if PDF2IMAGE_AVAILABLE:
        try:
            return pdf2image.convert_from_path(pdf_path, dpi=dpi)
        except Exception as e:
            print(f"pdf2image conversion failed: {str(e)}")

    if PYGMUPDF_AVAILABLE:
        try:
            doc = fitz.open(pdf_path)
            try:
                # Collect into a local list so a failure half-way through
                # never exposes a partially converted document.
                pages = []
                for page in doc:
                    pix = page.get_pixmap(dpi=dpi)
                    pages.append(
                        Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                    )
                return pages
            finally:
                # Always release the document, even when a page fails to render.
                doc.close()
        except Exception as e:
            print(f"PyMuPDF conversion failed: {str(e)}")

    return None
|
|
|
def create_pdf_preview(pdf_path):
    """Build an HTML fragment that embeds the PDF as a base64 ``data:`` URI.

    Returns a "no preview" placeholder ``<div>`` when ``pdf_path`` is empty
    or does not exist, and a "generation failed" placeholder (after printing
    the error) when reading or encoding the file raises.
    """
    has_file = bool(pdf_path) and Path(pdf_path).exists()
    if not has_file:
        return '<div class="no-preview-message">No PDF file available</div>'

    try:
        raw_pdf = Path(pdf_path).read_bytes()
        pdf_base64 = base64.b64encode(raw_pdf).decode("ascii")
        return f"""
        <div class="pdf-viewer-container">
            <embed
                src="data:application/pdf;base64,{pdf_base64}"
                type="application/pdf"
                width="100%"
                height="100%"
            >
        </div>
        """
    except Exception as e:
        print(f"Failed to create PDF preview: {str(e)}")
        return '<div class="no-preview-message">PDF preview generation failed</div>'
|
|
|
def _page_markdown(page_result):
    """Return the markdown text of one layout-parsing entry ("" when absent)."""
    markdown = page_result.get("markdown") if isinstance(page_result, dict) else None
    if isinstance(markdown, str):
        return markdown
    if isinstance(markdown, dict):
        return markdown.get("text") or ""
    return ""


def process_file(file_path, file_type):
    """Send an uploaded document to the layout-parsing API and collect results.

    Args:
        file_path: Path of the uploaded file on disk.
        file_type: ``"pdf"`` or ``"image"``; it must agree with the file
            extension (``.pdf``, or ``.jpg``/``.jpeg``/``.png``).

    Returns:
        A dict with the keys ``original_file``, ``file_type``,
        ``markdown_contents`` (one string per parsed entry),
        ``clean_markdown_contents`` (the same strings after
        ``clean_markdown_text``), ``pdf_images`` (list of PIL images, or
        ``None`` when PDF rendering failed), ``pdf_preview`` (HTML fragment)
        and ``api_response`` (the decoded JSON reply).

    Raises:
        gr.Error: For invalid input, a missing API token, a failed HTTP
            request, or any other processing error. The original exception
            is attached as ``__cause__``.
    """
    try:
        if not file_path:
            raise ValueError("Please upload a file first")

        lower_name = str(file_path).lower()
        if file_type == "pdf" and not lower_name.endswith('.pdf'):
            raise ValueError("Please upload a valid PDF file")
        if file_type == "image" and not lower_name.endswith(('.jpg', '.jpeg', '.png')):
            raise ValueError("Please upload a valid image file (JPG/JPEG/PNG)")

        # Fail early with a clear message instead of sending the literal
        # header "token None" to the service.
        if not TOKEN:
            raise ValueError("API_TOKEN environment variable is not set")

        with open(file_path, "rb") as f:
            file_data = base64.b64encode(f.read()).decode("ascii")

        response = requests.post(
            API_URL,
            # The request encodes the document kind as 0 for PDF, 1 otherwise.
            json={"file": file_data, "fileType": 0 if file_type == "pdf" else 1},
            headers={
                "Authorization": f"token {TOKEN}",
                "Content-Type": "application/json",
            },
            timeout=60,
        )
        response.raise_for_status()

        result = response.json()
        # Tolerate a reply whose "result" / "layoutParsingResults" is missing
        # or null rather than failing with an AttributeError.
        payload = result.get("result") if isinstance(result, dict) else None
        layout_results = (payload or {}).get("layoutParsingResults") or []

        markdown_contents = [_page_markdown(res) for res in layout_results]
        clean_markdown_contents = [clean_markdown_text(md) for md in markdown_contents]

        if file_type == "pdf":
            images = pdf_to_images(file_path)
            pdf_preview = create_pdf_preview(file_path)
        else:
            images = [Image.open(file_path)]
            pdf_preview = '<div class="no-preview-message">Image file preview</div>'

        return {
            "original_file": file_path,
            "file_type": file_type,
            "markdown_contents": markdown_contents,
            "clean_markdown_contents": clean_markdown_contents,
            "pdf_images": images,
            "pdf_preview": pdf_preview,
            "api_response": result,
        }

    except requests.exceptions.RequestException as e:
        raise gr.Error(f"API request failed: {str(e)}") from e
    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}") from e
|
|
|
def create_zip_file(results):
    """Bundle every artefact of an analysis into a ZIP archive.

    The archive contains the original upload (``original/``), the per-page
    markdown in raw and cleaned form (``markdown/original/`` and
    ``markdown/clean/``), the API reply (``api_response.json``) and, for PDF
    inputs with rendered pages, one JPEG per page (``images/``).

    Args:
        results: The dict produced by ``process_file``.

    Returns:
        Path of the created archive. It lives in a fresh temporary directory
        that is recorded in ``temp_dirs``.

    Raises:
        gr.Error: When ``results`` is empty or the archive cannot be built.
            The original exception is attached as ``__cause__``.
    """
    try:
        if not results:
            raise ValueError("No results to export")

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        zip_filename = f"analysis_results_{timestamp}.zip"

        temp_dir = tempfile.mkdtemp()
        temp_dirs.append(temp_dir)
        zip_path = os.path.join(temp_dir, zip_filename)

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            original_path = results.get("original_file", "")
            if original_path and Path(original_path).exists():
                zipf.write(original_path, f"original/{Path(original_path).name}")

            page_pairs = zip(
                results.get("markdown_contents") or [],
                results.get("clean_markdown_contents") or [],
            )
            for page_no, (orig_md, clean_md) in enumerate(page_pairs, start=1):
                if orig_md:
                    zipf.writestr(f"markdown/original/page_{page_no}.md", orig_md)
                if clean_md:
                    zipf.writestr(f"markdown/clean/page_{page_no}.md", clean_md)

            api_response = results.get("api_response", {})
            zipf.writestr(
                "api_response.json",
                json.dumps(api_response, indent=2, ensure_ascii=False),
            )

            if results.get("file_type") == "pdf" and results.get("pdf_images"):
                for page_no, img in enumerate(results["pdf_images"], start=1):
                    # JPEG cannot store alpha or palette images; convert first.
                    if img.mode not in ("RGB", "L", "CMYK"):
                        img = img.convert("RGB")
                    # Encode in memory so no stray page_N.jpg files are left
                    # on disk next to the archive.
                    buffer = io.BytesIO()
                    img.save(buffer, "JPEG", quality=85)
                    zipf.writestr(f"images/page_{page_no}.jpg", buffer.getvalue())

        return zip_path

    except Exception as e:
        raise gr.Error(f"Error creating ZIP file: {str(e)}") from e
|
|
|
def export_markdown(results):
    """Write the raw markdown of every page to a single ``.md`` file.

    Pages are joined with a blank line between them.

    Args:
        results: The dict produced by ``process_file``.

    Returns:
        Path of the written file. It lives in a fresh temporary directory
        that is recorded in ``temp_dirs``.

    Raises:
        gr.Error: When there is nothing to export or the file cannot be
            written. The original exception is attached as ``__cause__``.
    """
    try:
        if not results:
            raise ValueError("No results to export")

        markdowns = results.get("markdown_contents") or []
        if not markdowns:
            # ValueError, like the guard above, so the message is wrapped
            # exactly once by the handler below.
            raise ValueError("No markdown content to export")

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"markdown_export_{timestamp}.md"
        # Coerce missing pages to "" so str.join never sees a non-string.
        content = "\n\n".join(md or "" for md in markdowns)

        temp_dir = tempfile.mkdtemp()
        temp_dirs.append(temp_dir)
        file_path = os.path.join(temp_dir, filename)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

        return file_path

    except Exception as e:
        raise gr.Error(f"Error exporting markdown: {str(e)}") from e
|
|
|
# User interface: upload form, side-by-side preview/result panes, export
# buttons, and the event wiring between them.
with gr.Blocks(css=CSS, title="Document Analysis System") as demo:
    # Holds the dict returned by process_file for the most recent analysis.
    results_state = gr.State()

    with gr.Column(elem_classes=["logo-container"]):
        gr.HTML(f'<img src="{logo_base64}" class="logo-img">')

    with gr.Row(elem_classes=["nav-bar"]):
        gr.HTML("""
            <div class="nav-links">
                <a href="https://github.com/PaddlePaddle/PaddleOCR" class="nav-link" target="_blank">GitHub</a>
                <a href="https://paddleocr.ai" class="nav-link" target="_blank">paddleocr.ai</a>
            </div>
        """)

    with gr.Column(elem_classes=["upload-section"]):
        file_type = gr.Radio(
            ["pdf", "image"],
            label="File Type",
            value="pdf",
            interactive=True
        )
        file_input = gr.File(
            label="Upload Document",
            file_types=[".pdf", ".jpg", ".jpeg", ".png"],
            type="filepath"
        )
        process_btn = gr.Button("Analyze Document", variant="primary")

    loading_spinner = gr.Column(
        visible=False,
        elem_classes=["loader-container"]
    )
    with loading_spinner:
        gr.HTML("""
            <div class="loader"></div>
            <p>Processing, please wait...</p>
        """)

    with gr.Row(elem_classes=["result-container"]):
        with gr.Column(elem_classes=["pdf-preview"]):
            gr.Markdown("### Original Document Preview")
            pdf_preview = gr.HTML(label="PDF Preview")
            pdf_gallery = gr.Gallery(
                label="PDF Pages",
                show_label=False,
                elem_classes=["gallery-container"],
                columns=[1],
                object_fit="contain",
                visible=False
            )

        with gr.Column(elem_classes=["markdown-result"]):
            with gr.Row(elem_classes=["radio-group"]):
                display_mode = gr.Radio(
                    ["Original Markdown", "Cleaned Text"],
                    label="Display Mode",
                    value="Original Markdown",
                    interactive=True
                )
            markdown_display = gr.Markdown(label="Analysis Results")

    with gr.Column(elem_classes=["download-section"]):
        gr.Markdown("### Result Export")
        with gr.Row():
            download_md_btn = gr.Button("Download Markdown", variant="secondary")
            download_all_btn = gr.Button("Download Full Results (ZIP)", variant="primary")
        download_file = gr.File(visible=False, label="Download File")

    def toggle_spinner():
        """Show the loading spinner."""
        return gr.update(visible=True)

    def hide_spinner():
        """Hide the loading spinner."""
        return gr.update(visible=False)

    def markdown_for_mode(mode, results):
        """Return the text for the results pane in the selected display mode.

        All pages are joined with a blank line, matching what the markdown
        export writes; "No content" is returned when nothing is available.
        """
        if not results or not results.get("markdown_contents"):
            return "No content"
        key = "markdown_contents" if mode == "Original Markdown" else "clean_markdown_contents"
        text = "\n\n".join(page for page in (results.get(key) or []) if page)
        return text or "No content"

    def update_display(results, mode):
        """Refresh the preview message, the page gallery and the results pane.

        Returns one update per output component, in the order
        ``pdf_preview``, ``pdf_gallery``, ``markdown_display``.
        """
        if not results:
            return [
                gr.update(value='<div class="no-preview-message">No file to display</div>'),
                gr.update(visible=False, value=[]),
                gr.update(value="No content"),
            ]

        images = results.get("pdf_images") or []
        return [
            gr.update(value='<div class="no-preview-message">Preview rendered as images</div>'),
            # Visibility and value go to the gallery in a single update, so
            # the component appears only once in the outputs list.
            gr.update(visible=bool(images), value=images),
            gr.update(value=markdown_for_mode(mode, results)),
        ]

    process_btn.click(
        toggle_spinner,
        outputs=[loading_spinner]
    ).then(
        process_file,
        inputs=[file_input, file_type],
        outputs=[results_state]
    ).then(
        hide_spinner,
        outputs=[loading_spinner]
    ).then(
        update_display,
        inputs=[results_state, display_mode],
        outputs=[pdf_preview, pdf_gallery, markdown_display]
    )

    display_mode.change(
        markdown_for_mode,
        inputs=[display_mode, results_state],
        outputs=[markdown_display]
    )

    download_md_btn.click(
        export_markdown,
        inputs=[results_state],
        outputs=[download_file]
    ).then(
        lambda: gr.update(visible=True),
        outputs=[download_file]
    )

    download_all_btn.click(
        create_zip_file,
        inputs=[results_state],
        outputs=[download_file]
    ).then(
        lambda: gr.update(visible=True),
        outputs=[download_file]
    )
|
|
|
if __name__ == "__main__":
    # Tell the operator which optional PDF backends could not be imported.
    backend_warnings = (
        (PDF2IMAGE_AVAILABLE, "Warning: pdf2image not available, PDF to image conversion limited"),
        (PYGMUPDF_AVAILABLE, "Warning: PyMuPDF not available, PDF fallback conversion disabled"),
    )
    for available, warning in backend_warnings:
        if not available:
            print(warning)

    # Use the logo as favicon only when the file is actually present.
    favicon = str(logo_path) if logo_path.exists() else None
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        favicon_path=favicon,
    )