Spaces:
Running
Running
import os | |
import re | |
import sys | |
import json | |
import time | |
import copy | |
import base64 | |
import asyncio | |
import tempfile | |
import subprocess | |
from pathlib import Path | |
from datetime import datetime | |
import zipfile | |
import httpx, aiofiles, os, asyncio | |
import numpy as np | |
import gradio as gr | |
from PIL import Image | |
from pdf2image import convert_from_path | |
from loguru import logger | |
from openai import OpenAI, AsyncOpenAI | |
from gradio_pdf import PDF | |
import certifi | |
import httpx | |
import aiohttp | |
import uuid | |
import tqdm | |
import base64, pathlib | |
from io import BytesIO | |
from pdf2image import convert_from_bytes, convert_from_path # pip install pdf2image | |
import requests | |
def setup_poppler_linux():
    """Install the poppler command-line tools through apt-get on first run.

    The directory ``/tmp/poppler`` is used as a "setup already attempted"
    marker: when it exists the function returns without doing anything.
    Otherwise the marker is created, then ``apt-get update`` and
    ``apt-get install -y poppler-utils`` are executed in that order.

    Raises:
        subprocess.CalledProcessError: if either apt-get command exits with
            a non-zero status (both are run with ``check=True``).
    """
    marker_dir = "/tmp/poppler"
    if os.path.exists(marker_dir):
        return

    # NOTE(review): the marker is created before the install runs, so a
    # failed install is not retried on the next start -- confirm intended.
    os.makedirs(marker_dir, exist_ok=True)
    for apt_command in (
        ["apt-get", "update"],
        ["apt-get", "install", "-y", "poppler-utils"],
    ):
        subprocess.run(apt_command, check=True)
# Runs at import time: pdf2image's converters need the poppler binaries.
setup_poppler_linux()

# Prompt presets offered in the UI dropdown; the first entry is the default
# selection.
preset_prompts = [
    "Please convert the document into Markdown format.",
    "Generate a clean and structured Markdown version of the document.",
    "Transform this content into Markdown with proper headings and bullet points.",
    "Convert the text to Markdown, preserving structure and formatting.",
    "Reformat this document as Markdown with clear sections and lists.",
]
def send_pdf_to_parse(file_path, server_ip, port, route="/upload", api_key=None):
    """Upload a PDF to the parsing service with a blocking multipart POST.

    Args:
        file_path: Path of the PDF on disk.
        server_ip: Accepted but not used when the URL is built.
        port: Accepted but not used when the URL is built.
        route: Path appended to the base URL.
        api_key: Optional token, sent as ``Authorization: Bearer <api_key>``.

    Returns:
        The ``requests.Response`` object returned by ``requests.post``.
    """
    # NOTE(review): the target is the module-level ``openai_api_base`` rather
    # than ``server_ip``/``port`` -- confirm which one is intended.
    endpoint = f"{openai_api_base}{route}"
    auth_headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    upload_name = os.path.basename(file_path)

    # The request is issued while the file is still open so that the body
    # can be read from the handle.
    with open(file_path, "rb") as pdf_stream:
        multipart = {"file": (upload_name, pdf_stream, "application/pdf")}
        return requests.post(endpoint, files=multipart, headers=auth_headers)
async def send_pdf_async_aiohttp(file_path, server_ip, route="/upload", Authorization=None):
    """Upload a PDF asynchronously with aiohttp (best effort).

    Args:
        file_path: Path of the PDF on disk.
        server_ip: Base URL of the server; ``route`` is appended to it.
        route: Path appended to ``server_ip``.
        Authorization: Optional token, sent as a ``Bearer`` header.

    Returns:
        The aiohttp response object on success, or ``None`` when any
        exception occurred (the error is printed, not raised).
        NOTE(review): the response is returned after its context manager and
        the session have been exited -- presumably only ``status`` is
        meant to be used by callers; confirm.
    """
    target = f"{server_ip}{route}"
    auth_headers = {"Authorization": f"Bearer {Authorization}"} if Authorization else {}

    try:
        async with aiohttp.ClientSession() as http:
            with open(file_path, "rb") as pdf_stream:
                form = aiohttp.FormData()
                form.add_field(
                    'file',
                    pdf_stream,
                    filename=os.path.basename(file_path),
                    content_type='application/pdf',
                )
                async with http.post(target, data=form, headers=auth_headers) as reply:
                    print(f"PDF发送成功: {file_path}, 状态码: {reply.status}")
                    return reply
    except Exception as exc:
        # Deliberately broad: the upload must never break the caller.
        print(f"PDF发送失败: {file_path}, 错误: {exc}")
    return None
# Matches the first ```markdown fence. The closing fence is optional so that
# partially streamed or truncated model output is still unwrapped.
_MARKDOWN_FENCE = re.compile(r"```markdown\s*([\s\S]*?)(?:```|\Z)")


def extract_makrdown(text):
    """Return the content of the first ```` ```markdown ```` fence in ``text``.

    Args:
        text: Raw model output, possibly incomplete (streaming).

    Returns:
        The stripped text between the opening fence and the first following
        closing fence.  If the closing fence is missing, everything after
        the opening fence is returned (stripped).  If there is no opening
        fence at all, ``text`` is returned unchanged.
    """
    match = _MARKDOWN_FENCE.search(text)
    if match is None:
        return text
    return match.group(1).strip()
# --- Inference-service configuration, read from the environment ---
# The API key is a placeholder; the bearer token is sent per request instead.
openai_api_key = "EMPTY"
openai_api_base = os.getenv("openai_api_base")
IP = os.getenv("IP")
PORT = os.getenv("PORT")
Authorization = os.getenv("Authorization")

# Shared async OpenAI-compatible client.
# NOTE(review): if the ``openai_api_base`` variable is unset, the string
# concatenation below raises TypeError at import time.
# NOTE(review): ``verify=False`` disables TLS certificate verification --
# confirm this is required for the target server.
client = AsyncOpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base + "/v1",
    http_client=httpx.AsyncClient(verify=False),
)
async def request(messages):
    """Stream a chat completion and yield the generated text piece by piece.

    Args:
        messages: Chat messages in the OpenAI chat-completions format.

    Yields:
        Each non-empty ``delta.content`` string, in arrival order.

    The stream is abandoned as soon as a chunk reports a ``finish_reason``;
    the reason is printed for diagnostics.
    """
    stream = await client.chat.completions.create(
        messages=messages,
        extra_headers={"Authorization": f"Bearer {Authorization}"},
        model="Qwen2_5VL",
        max_completion_tokens=4096,
        stream=True,
        temperature=0.0,
        top_p=0.95,
    )

    async for chunk in stream:
        # Some servers emit chunks without choices (e.g. usage-only chunks).
        if not chunk.choices:
            continue
        choice = chunk.choices[0]

        text = choice.delta.content
        if text:
            # Yield before checking finish_reason so text carried by the
            # final chunk is not lost.
            yield text

        if choice.finish_reason is not None:
            print(f"end reason = {choice.finish_reason}")
            break
def images_to_pdf(img_paths, pdf_path):
    """Combine one or more image files into a single PDF, one page each.

    Args:
        img_paths: A single path (``str`` or ``Path``) or a sequence of paths.
        pdf_path: Destination file; missing parent directories are created.

    Returns:
        ``pdf_path`` as a ``Path``.

    Raises:
        ValueError: if ``img_paths`` is empty.
        FileNotFoundError: if any entry is not an existing file.
    """
    if isinstance(img_paths, (str, Path)):
        img_paths = [img_paths]
    if not img_paths:
        raise ValueError("img_paths is empty")

    sources = [Path(p) for p in img_paths]
    for source in sources:
        if not source.is_file():
            raise FileNotFoundError(source)

    pages = []
    for source in sources:
        # Image.open keeps the file handle open; the context manager closes
        # it once an independent in-memory copy has been taken.
        with Image.open(source) as opened:
            if opened.mode in ("RGBA", "P"):
                # Same normalisation as before: alpha/palette -> RGB.
                pages.append(opened.convert("RGB"))
            else:
                pages.append(opened.copy())

    pdf_path = Path(pdf_path)
    pdf_path.parent.mkdir(parents=True, exist_ok=True)
    first_page, *other_pages = pages
    first_page.save(
        pdf_path,
        save_all=True,
        append_images=other_pages,
        resolution=300.0,
    )
    return pdf_path
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes base64-encoded.

    Returns:
        The base64 text as a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw = handle.read()
    encoded = base64.b64encode(raw)
    return encoded.decode("utf-8")
def build_message(image_path, prompt):
    """Build the chat messages for one image plus a text prompt.

    The image is read and base64-encoded immediately, so the file does not
    need to exist after this call returns.

    Args:
        image_path: Path of the image file (``str`` or ``Path``).
        prompt: Instruction text sent alongside the image.

    Returns:
        A two-element list: a fixed system message and a user message whose
        content holds the image as a data URL followed by the prompt.
    """
    import mimetypes  # local import keeps this block self-contained

    # Declare the real media type (e.g. PNG pages rendered from a PDF)
    # instead of always claiming JPEG; JPEG stays the fallback.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    content = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encode_image(image_path)}"
            },
        },
        {"type": "text", "text": prompt},
    ]
    return [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": content},
    ]
def download_markdown_file(md_text):
    """Write ``md_text`` to a new file under ``downloads/`` and return its path.

    The file is named ``markdown_<8 hex chars>.md`` (random per call) and is
    written as UTF-8.  The ``downloads`` directory is created if missing.

    Returns:
        The file path as a ``str``.
    """
    target_dir = Path("downloads")
    target_dir.mkdir(exist_ok=True)

    target = target_dir / f"markdown_{uuid.uuid4().hex[:8]}.md"
    target.write_text(md_text, encoding="utf-8")
    return str(target)
async def doc_parser(doc_path, prompt):
    """Convert a PDF or image to Markdown, streaming partial results.

    A PDF is rendered to one PNG per page (300 dpi) and each page is sent as
    its own request; any other file is sent as a single image.

    Args:
        doc_path: Path of the uploaded document, or ``None`` when nothing
            has been uploaded.
        prompt: Instruction text sent with every page.

    Yields:
        ``(markdown, raw)`` tuples.  While a page is streaming these hold
        the text received so far for that page only; the final tuple holds
        all pages joined (``"\\n---\\n"`` for markdown, blank line for raw).

    Raises:
        gr.Error: if ``doc_path`` is ``None``.
        FileNotFoundError: if ``doc_path`` is not an existing file.
    """
    if doc_path is None:
        # Surface a readable message in the UI instead of a TypeError.
        raise gr.Error("Please upload a PDF or image before parsing.")

    doc_path = Path(doc_path)
    if not doc_path.is_file():
        raise FileNotFoundError(doc_path)

    queries = []
    if doc_path.suffix.lower() == ".pdf":
        # build_message base64-encodes each page image right away, so the
        # temporary directory is only needed while the queries are built.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            page_images = convert_from_path(doc_path, dpi=300)
            for idx, page_image in enumerate(page_images, start=1):
                img_path = tmpdir / f"page_{idx}.png"
                page_image.save(img_path, "PNG")
                queries.append(build_message(img_path, prompt))
    else:
        queries.append(build_message(doc_path, prompt))

    all_pages = []
    all_pages_raw = []
    for query in queries:
        page_text = ""
        async for chunk in request(query):
            page_text += chunk
            yield extract_makrdown(page_text), page_text
        all_pages.append(extract_makrdown(page_text))
        all_pages_raw.append(page_text)

    print(all_pages)
    yield "\n---\n".join(all_pages), "\n\n".join(all_pages_raw)
def compress_directory_to_zip(directory_path, output_zip_path):
    """Zip every file under ``directory_path`` into ``output_zip_path``.

    Archive member names are relative to ``directory_path``; DEFLATE
    compression is used.

    Returns:
        ``0`` on success, ``-1`` if any exception occurred (the exception is
        logged, not raised).
    """
    try:
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
            for folder, _subdirs, filenames in os.walk(directory_path):
                for filename in filenames:
                    absolute = os.path.join(folder, filename)
                    member = os.path.relpath(absolute, directory_path)
                    archive.write(absolute, member)
    except Exception as e:
        logger.exception(e)
        return -1
    return 0
# Delimiter pairs handed to the Markdown component for LaTeX rendering;
# ``display`` marks block-level (True) versus inline (False) math.
latex_delimiters = [
    {"left": left, "right": right, "display": display}
    for left, right, display in (
        ("$$", "$$", True),
        ("$", "$", False),
        ("\\(", "\\)", False),
        ("\\[", "\\]", True),
    )
]
def check_prompt(prompt):
    """Validate that a prompt was provided and pass it through unchanged.

    Raises:
        gr.Error: if ``prompt`` is empty, ``None`` or whitespace only.
    """
    if prompt and prompt.strip():
        return prompt
    raise gr.Error("Please select or enter a prompt before parsing.")
def to_file(image_path):
    """Map an example thumbnail path to the file that should be loaded.

    Paths ending in ``Academic_Papers.png`` have that name replaced by
    ``Academic_Papers.pdf``; every other path is returned unchanged.
    """
    thumbnail_name = "Academic_Papers.png"
    if not image_path.endswith(thumbnail_name):
        return image_path
    return image_path.replace(thumbnail_name, "Academic_Papers.pdf")
def render_img(b64_list, idx, scale):
    """Render the HTML viewer for image ``idx`` at zoom factor ``scale``.

    Args:
        b64_list: Image sources (data URLs) to page through.
        idx: Requested index; wrapped modulo ``len(b64_list)``.
        scale: Zoom factor; the image width is ``scale * 100`` percent of
            the container.

    Returns:
        An HTML string: a placeholder paragraph when ``b64_list`` is empty,
        otherwise an 800px-high scrollable container holding the image
        (horizontally centred when ``scale <= 1``).
    """
    if not b64_list:
        return "<p style='color:gray'>请先上传图片</p>"

    src = b64_list[idx % len(b64_list)]
    # Width is set as a percentage; height follows automatically.
    percent = scale * 100

    if scale > 1:
        # ---------- zoomed-in mode ----------
        return (
            f'<div style="overflow:auto;border:1px solid #ccc;'
            f'width:100%;height:800px;">'
            f' <img src="{src}" '
            f' style="width:{percent}%;max-width:none;'
            f' height:auto;display:block;" />'
            f'</div>'
        )

    # ---------- centred mode ----------
    return f"""
    <div style="
        width:100%;
        height:800px;
        overflow:auto;
        border:1px solid #ccc;
    ">
        <div style="
            min-width:100%; /* 保证外层 div 至少跟容器一样宽 */
            display:flex;
            justify-content:center; /* 小图水平居中 */
        ">
            <img src="{src}" style="
                width:{percent}%;
                height:auto;
                display:block;
            ">
        </div>
    </div>
    """
def _pages_to_data_urls(pages):
    """Encode rendered PDF pages (PIL images) as PNG data URLs."""
    urls = []
    for page in pages:
        buf = BytesIO()
        page.save(buf, format="PNG")
        b64 = base64.b64encode(buf.getvalue()).decode()
        urls.append(f"data:image/png;base64,{b64}")
    return urls


def _image_data_url(raw_bytes, suffix):
    """Wrap raw image bytes in a data URL whose subtype comes from ``suffix``."""
    subtype = suffix[1:]
    if subtype == "jpg":
        # "image/jpg" is not a registered media type; JPEG files are
        # "image/jpeg" regardless of which extension they carry.
        subtype = "jpeg"
    b64 = base64.b64encode(raw_bytes).decode()
    return f"data:image/{subtype};base64,{b64}"


def files_to_b64(file, pdf_dpi: int = 200):
    """Convert an uploaded PDF or image into a list of image data URLs.

    Args:
        file: Either an object exposing ``data`` (bytes) and ``name``, or a
            filesystem path.
        pdf_dpi: Resolution used when rendering PDF pages.

    Returns:
        One PNG data URL per page for a PDF; otherwise a single-element list
        with the file's own bytes, typed from its lower-cased extension.
    """
    if hasattr(file, "data"):
        raw_bytes = file.data
        suffix = pathlib.Path(file.name).suffix.lower()
        if suffix == ".pdf":
            return _pages_to_data_urls(convert_from_bytes(raw_bytes, dpi=pdf_dpi))
        return [_image_data_url(raw_bytes, suffix)]

    path = pathlib.Path(file)
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        return _pages_to_data_urls(convert_from_path(str(path), dpi=pdf_dpi))
    return [_image_data_url(path.read_bytes(), suffix)]
async def process_file(file_path): | |
"""使用asyncio的异步方案""" | |
if file_path is None: | |
return None | |
if not file_path.endswith(".pdf"): | |
tmp_file_path = Path(file_path) | |
tmp_file_path = tmp_file_path.with_suffix(".pdf") | |
images_to_pdf(file_path, tmp_file_path) | |
else: | |
tmp_file_path = file_path | |
asyncio.create_task(send_pdf_async_aiohttp(tmp_file_path, server_ip=openai_api_base, Authorization=Authorization)) | |
return str(tmp_file_path) | |
# Script entry point: builds the Gradio UI, wires the event handlers and
# launches the server.
# NOTE(review): the source this was recovered from had lost its indentation;
# the nesting of the layout containers below is a reconstruction -- confirm
# against the running app.
if __name__ == '__main__':
    with gr.Blocks() as demo:
        with gr.Row():
            # ---- Left panel: upload, prompt, controls and page viewer ----
            with gr.Column(variant='panel', scale=5):
                file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'], type="filepath")
                prompts = gr.Dropdown(
                    choices=preset_prompts,
                    label="Prompt",
                    info="Enter or select prompts...",
                    value=preset_prompts[0],
                    multiselect=False,
                    interactive=True,
                    allow_custom_value=True,
                )
                with gr.Row():
                    change_bu = gr.Button('Parse')
                    clear_bu = gr.ClearButton(value='Clear')
                zoom = gr.Slider(0.5, 3, value=1, step=0.1, label="Image Scale")
                with gr.Row():
                    prev_btn = gr.Button("⬅️ Pre")
                    next_btn = gr.Button("Next ➡️")
                viewer = gr.HTML()
                # NOTE(review): `images` is not referenced again in this
                # block, and `example_root` is reassigned below.
                example_root = os.path.join(os.path.dirname(__file__), 'examples')
                images = [
                    os.path.join(example_root, f)
                    for f in os.listdir(example_root)
                    if f.lower().endswith(('png', 'jpg', 'jpeg'))
                ]
            # ---- Right panel: examples, download and parse results ----
            with gr.Column(variant='panel', scale=5):
                with gr.Accordion("Examples", open=True):
                    example_root = "examples"
                    file_path = [
                        os.path.join(example_root, f)
                        for f in ["Financial_Reports.png", "Books.png", "Magazines.png", "Academic_Papers.png"]
                    ]
                    with gr.Row():
                        # One thumbnail + button per example; to_file swaps
                        # the Academic_Papers thumbnail for its PDF.
                        for i, label in enumerate(["Financial Reports(IMG)", "Books(IMG)", "Magazines(IMG)", "Academic Papers(PDF)"]):
                            with gr.Column(scale=1, min_width=120):
                                gr.Image(
                                    value=file_path[i],
                                    width=120,
                                    height=90,
                                    show_label=False,
                                    show_download_button=False
                                )
                                gr.Button(label).click(fn=to_file, inputs=gr.State(file_path[i]), outputs=file)
                download_btn = gr.Button("⬇️ Generate download link", size="sm")
                # Hidden until a download file has been generated.
                output_file = gr.File(label='Parse result', interactive=False, elem_id="down-file-box",visible=False)
                gr.HTML("""
                <style>
                #down-file-box {
                max-height: 300px;
                }
                </style>
                """)
                with gr.Tabs():
                    with gr.Tab('Markdown rendering'):
                        md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
                                         latex_delimiters=latex_delimiters,
                                         line_breaks=True)
                    with gr.Tab('Markdown text'):
                        md_text = gr.TextArea(lines=45, show_copy_button=True)
        # Viewer state: data URLs of the current document's pages and the
        # index of the page being shown.
        img_list_state = gr.State([])
        idx_state = gr.State(0)

        async def upload_handler(files):
            # Cleared upload: reset the page list, the index and the viewer.
            if files is None:
                return [], 0, ""
            # PDFs are additionally sent to the server in the background
            # (scheduled, not awaited).
            if files.lower().endswith(".pdf"):
                asyncio.create_task(send_pdf_async_aiohttp(files, server_ip=openai_api_base, Authorization=Authorization))
            b64s = files_to_b64(files)
            return b64s, 0, render_img(b64s, 0, 1)

        file.change(
            upload_handler,
            inputs=file,
            outputs=[img_list_state, idx_state, viewer],
        ).then(
            lambda: gr.update(value=1),  # no inputs; simply reset zoom to 1
            None,  # inputs=None
            zoom  # outputs=[zoom]
        )

        def show_prev(b64s, idx, scale):
            # The index is not clamped here; render_img wraps it modulo the
            # number of pages.
            idx -= 1
            return idx, render_img(b64s, idx, scale)

        prev_btn.click(
            show_prev,
            inputs=[img_list_state, idx_state, zoom],
            outputs=[idx_state, viewer],
        )

        def show_next(b64s, idx, scale):
            # See show_prev: wrapping is handled by render_img.
            idx += 1
            return idx, render_img(b64s, idx, scale)

        next_btn.click(
            show_next,
            inputs=[img_list_state, idx_state, zoom],
            outputs=[idx_state, viewer],
        )
        # Re-render the current page whenever the zoom slider changes.
        zoom.change(
            lambda b64s, idx, scale: render_img(b64s, idx, scale),
            inputs=[img_list_state, idx_state, zoom],
            outputs=viewer,
        )
        # Parse: validate the prompt, hide any previous download file, then
        # stream doc_parser's output into the two Markdown views.
        change_bu.click(
            fn=check_prompt,
            inputs=prompts,
            outputs=prompts
        ).then(
            lambda f: gr.update(visible=False),
            inputs=output_file,
            outputs=output_file
        ).then(
            fn=doc_parser,
            inputs=[file, prompts],
            outputs=[md, md_text]
        )
        clear_bu.add([file, md, md_text])
        # Download: write the Markdown text to a file, then reveal the file
        # component.
        download_btn.click(
            fn=download_markdown_file,
            inputs=md_text,
            outputs=output_file
        ).then(
            lambda f: gr.update(visible=True),
            inputs=output_file,
            outputs=output_file
        )
    demo.launch(server_name='0.0.0.0',share=True)