euler314 committed on
Commit
683fa93
·
verified ·
1 Parent(s): b54b02c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -177
app.py CHANGED
@@ -1,177 +1,56 @@
1
- """
2
- Streamlit Universal File-Format Converter
3
- ----------------------------------------
4
- A Streamlit app for Hugging Face Spaces that **actually converts** file
5
- contents across a wide array of formats, leveraging local libraries
6
- (no API keys needed):
7
-
8
- **Images** via Pillow (JPEG, PNG, GIF, BMP, TIFF, ICO, WebP)
9
- **Text & markup** via pypandoc (MD, HTML, LaTeX, DOCX, PDF, etc.)
10
- **Office docs** via unoconv + LibreOffice headless (PDF, DOCX, PPTX, XLSX)
11
- **Audio/video** via ffmpeg-python (MP3, WAV, MP4, AVI, MKV, MOV, etc.)
12
- **MIME detection** via python-magic
13
-
14
- Disallowed uploads: `.exe`, `.bin`
15
- All outputs are streamed into a ZIP for download.
16
-
17
- Created 2025-05-22 • v3
18
- """
19
- from __future__ import annotations
20
-
21
- # Set up a writable Streamlit home BEFORE importing streamlit
22
- import os, pathlib
23
- os.environ.setdefault("STREAMLIT_HOME", "/tmp/.streamlit")
24
- os.environ.setdefault("HOME", "/tmp")
25
- pathlib.Path(os.environ["STREAMLIT_HOME"]).mkdir(parents=True, exist_ok=True)
26
-
27
- import io
28
- import zipfile
29
- import tempfile
30
- import subprocess
31
- from datetime import datetime
32
- from pathlib import Path
33
-
34
- import streamlit as st
35
- from PIL import Image
36
- import pypandoc
37
- import ffmpeg
38
- import magic # python-magic for mime detection
39
-
40
- # -----------------------------------------------------------------------------
41
- # Supported extensions
42
- # -----------------------------------------------------------------------------
43
# Extension groups, one per converter family. All entries are lower-case and
# include the leading dot so they compare directly against Path.suffix.lower().
IMAGE_EXTS = {
    ".jpg", ".jpeg", ".png", ".gif",
    ".bmp", ".tiff", ".ico", ".webp",
}
TEXT_EXTS = {
    ".txt", ".md", ".csv", ".json",
    ".xml", ".html", ".css", ".js",
}
MEDIA_EXTS = {
    ".mp3", ".wav", ".mp4",
    ".avi", ".mkv", ".mov",
}
DOC_EXTS = {
    ".pdf", ".doc", ".docx",
    ".ppt", ".pptx",
    ".xls", ".xlsx",
    ".odt", ".ods",
}

# Every known extension is offered as a conversion target, alphabetically.
ALLOWED_TARGET_EXTS = sorted(set().union(IMAGE_EXTS, TEXT_EXTS, MEDIA_EXTS, DOC_EXTS))
# Uploads with these extensions are rejected outright.
DISALLOWED_SOURCE_EXTS = {".exe", ".bin"}
50
-
51
- # -----------------------------------------------------------------------------
52
- # UI elements
53
- # -----------------------------------------------------------------------------
54
def sidebar_target_extension() -> str:
    """Render the sidebar settings and return the selected target extension.

    A free-text filter narrows the offered extensions by substring match. When
    the filter matches nothing, an error is shown and the full list is offered
    instead. ``.pdf`` is preselected whenever it is among the choices.
    """
    st.sidebar.header("Settings")
    needle = st.sidebar.text_input("Filter extensions… (optional)").lower()

    matches = list(filter(lambda ext: needle in ext, ALLOWED_TARGET_EXTS))
    if not matches:
        st.sidebar.error("No extension matches that filter.")
        matches = ALLOWED_TARGET_EXTS

    # Default to PDF when available, otherwise the first entry.
    try:
        preselected = matches.index(".pdf")
    except ValueError:
        preselected = 0

    return st.sidebar.selectbox(
        "Target extension for **all** files",
        matches,
        index=preselected,
    )
64
-
65
def uploader():
    """Show the multi-file upload widget and return Streamlit's result for it.

    No file-type restriction is applied at the widget level (``type=None``).
    """
    widget_options = {"type": None, "accept_multiple_files": True}
    return st.file_uploader("Upload files to convert", **widget_options)
69
-
70
- # -----------------------------------------------------------------------------
71
- # Conversion functions
72
- # -----------------------------------------------------------------------------
73
def convert_image(data: bytes, target_ext: str) -> bytes:
    """Re-encode raw image bytes into the format implied by ``target_ext``.

    Args:
        data: Encoded source image (any format Pillow can open).
        target_ext: Dotted, lower-case target extension such as ``".png"``.

    Returns:
        The image encoded in the target format.

    Raises:
        KeyError: If ``target_ext`` is not one of the supported image extensions.
    """
    pillow_formats = {
        ".jpg": "JPEG", ".jpeg": "JPEG", ".png": "PNG", ".gif": "GIF",
        ".bmp": "BMP", ".tiff": "TIFF", ".ico": "ICO", ".webp": "WEBP",
    }
    fmt = pillow_formats[target_ext]
    # Modes the JPEG encoder accepts; anything else (RGBA, LA, P, ...) makes
    # Image.save raise "cannot write mode ... as JPEG".
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    buf = io.BytesIO()
    # The context manager releases the decoder once the image has been written.
    with Image.open(io.BytesIO(data)) as source:
        img = source
        if fmt == "JPEG" and source.mode not in jpeg_modes:
            # JPEG has no alpha channel or palette; flatten to plain RGB.
            img = source.convert("RGB")
        img.save(buf, format=fmt)
    return buf.getvalue()
80
-
81
-
82
def convert_text_markup(data: bytes, orig_ext: str, target_ext: str) -> bytes:
    """Convert text/markup bytes between formats using pypandoc.

    The input is decoded as UTF-8 (undecodable bytes are dropped). The source
    and target extensions, minus their leading dot, are passed to pandoc as
    the input and output format names. The result is returned UTF-8 encoded.
    """
    decoded = data.decode("utf-8", errors="ignore")
    target_format = target_ext.lstrip('.')
    source_format = orig_ext.lstrip('.')
    rendered = pypandoc.convert_text(decoded, to=target_format, format=source_format)
    return rendered.encode('utf-8')
85
-
86
-
87
def convert_office(temp_dir: str, data: bytes, orig_ext: str, target_ext: str) -> bytes:
    """Convert an office document by shelling out to ``unoconv``.

    The input bytes are written to ``input<orig_ext>`` inside ``temp_dir``,
    ``unoconv`` is asked to produce ``output<target_ext>`` next to it, and the
    bytes of that output file are returned. A non-zero exit status raises
    ``subprocess.CalledProcessError`` (``check=True``).
    """
    workdir = Path(temp_dir)
    source_file = workdir / f"input{orig_ext}"
    result_file = workdir / f"output{target_ext}"

    source_file.write_bytes(data)

    command = [
        "unoconv",
        "-f", target_ext.lstrip('.'),
        "-o", str(result_file),
        str(source_file),
    ]
    subprocess.run(command, check=True)

    return result_file.read_bytes()
96
-
97
-
98
def convert_media(data: bytes, target_ext: str) -> bytes:
    """Transcode audio/video bytes with ffmpeg, streaming through pipes.

    Args:
        data: Encoded source media, fed to ffmpeg on stdin.
        target_ext: Dotted target extension; without the dot it is passed to
            ffmpeg as the output ``format`` name.

    Returns:
        The bytes ffmpeg wrote to stdout.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status. Without this
            check a failed transcode would return empty or truncated output
            and be reported as a successful conversion.
    """
    process = (
        ffmpeg.input('pipe:0')
        .output('pipe:1', format=target_ext.lstrip('.'))
        .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True)
    )
    out, err = process.communicate(data)

    if process.returncode != 0:
        # ffmpeg's stderr is verbose; the last line normally carries the error.
        stderr_text = err.decode("utf-8", errors="replace").strip() if err else ""
        detail = stderr_text.splitlines()[-1] if stderr_text else "no error output"
        raise RuntimeError(f"ffmpeg exited with code {process.returncode}: {detail}")

    return out
107
-
108
-
109
def convert_file(file: st.runtime.uploaded_file_manager.UploadedFile, target_ext: str) -> tuple[bytes, str]:
    """Convert one uploaded file to ``target_ext`` using the first matching converter.

    Converters are tried in a fixed order: image, text/markup, office document,
    audio/video. The text and media converters are skipped when the source and
    target extensions are equal. If no converter applies, or the chosen one
    raises, the original bytes are returned unchanged.

    Returns:
        ``(data, note)`` where ``note`` is a short description of what was done.

    Raises:
        ValueError: If the source extension is disallowed.
    """
    source_ext = Path(file.name).suffix.lower()
    payload = file.read()

    if source_ext in DISALLOWED_SOURCE_EXTS:
        raise ValueError(f"Disallowed: {source_ext}")

    detected_mime = magic.from_buffer(payload, mime=True) or ''
    changes_ext = source_ext != target_ext

    try:
        # 1. Image to image.
        if source_ext in IMAGE_EXTS and target_ext in IMAGE_EXTS:
            return convert_image(payload, target_ext), "image converted"

        # 2. Text / markup, identified by MIME type or extension.
        looks_textual = detected_mime.startswith('text/') or source_ext in TEXT_EXTS
        if looks_textual and changes_ext:
            return convert_text_markup(payload, source_ext, target_ext), "text/markup converted"

        # 3. Office documents, when either side of the conversion is one.
        if source_ext in DOC_EXTS or target_ext in DOC_EXTS:
            with tempfile.TemporaryDirectory() as workdir:
                return convert_office(workdir, payload, source_ext, target_ext), "office/doc converted"

        # 4. Audio / video, identified by MIME type or extension.
        looks_media = detected_mime.startswith(('audio/', 'video/')) or source_ext in MEDIA_EXTS
        if looks_media and changes_ext:
            return convert_media(payload, target_ext), "media converted"
    except Exception as exc:
        # Best effort: report the failure in the UI and fall through to the rename.
        st.warning(f"⚠️ Conversion failed for {file.name}: {exc}. Falling back to rename.")

    # Fallback: no conversion, just rename
    return payload, "renamed only"
136
-
137
- # -----------------------------------------------------------------------------
138
- # ZIP packaging
139
- # -----------------------------------------------------------------------------
140
def package_zip(files: list[st.runtime.uploaded_file_manager.UploadedFile], target_ext: str) -> io.BytesIO:
    """Convert each upload and collect the results into an in-memory ZIP.

    Files with a disallowed extension are skipped with a warning. Every other
    file is converted, stored under its original stem plus ``target_ext``, and
    acknowledged with a success message.

    Returns:
        A ``BytesIO`` holding the finished archive, rewound to position 0.
    """
    archive_bytes = io.BytesIO()

    with zipfile.ZipFile(archive_bytes, 'w', zipfile.ZIP_DEFLATED) as archive:
        for upload in files:
            source_path = Path(upload.name)

            if source_path.suffix.lower() not in DISALLOWED_SOURCE_EXTS:
                payload, outcome = convert_file(upload, target_ext)
                entry_name = source_path.with_suffix(target_ext).name
                archive.writestr(entry_name, payload)
                st.success(f"{outcome}: {source_path.name} → {entry_name}")
            else:
                st.warning(f"Skipping disallowed file: {source_path.name}")

    # Rewind so the caller can read the archive from the start.
    archive_bytes.seek(0)
    return archive_bytes
154
-
155
- # -----------------------------------------------------------------------------
156
- # Main
157
- # -----------------------------------------------------------------------------
158
-
159
def main():
    """Build the Streamlit page: settings sidebar, uploader, and convert/download flow."""
    # Local import: the file-level import only brings in `datetime`.
    from datetime import timezone

    st.set_page_config("Universal Converter", page_icon="🔄", layout="centered")
    st.title("🔄 Universal File-Format Converter")
    st.write("Upload files of any format; choose a new extension; download a ZIP of converted files.")

    target_ext = sidebar_target_extension()
    files = uploader()

    if files and st.button("Convert & Download 🚀"):
        zip_buf = package_zip(files, target_ext)
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # the formatted timestamp is identical.
        ts = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
        st.download_button("⬇️ Download ZIP", zip_buf,
                           file_name=f"converted_{ts}.zip",
                           mime='application/zip')

    st.caption("© 2025 Universal Converter • Streamlit • Hugging Face Spaces")


if __name__ == '__main__':
    main()
 
1
+ # app.py
2
+ import fitz # PyMuPDF
3
+ from markdownify import markdownify as md
4
+ import json
5
+ import gradio as gr
6
+
7
def convert_pdf_to_markdown(path):
    """Convert every page of a PDF to Markdown.

    Each page is rendered to HTML by PyMuPDF and that HTML is passed through
    markdownify; leading and trailing whitespace is stripped from the result.

    Args:
        path: Filesystem path of the PDF to open.

    Returns:
        A list of ``{"page": <1-based page number>, "markdown": <str>}`` dicts
        in page order.
    """
    # The context manager closes the document (and its file handle) even when
    # a page fails to convert; previously the document was never closed.
    with fitz.open(path) as doc:
        return [
            {"page": number, "markdown": md(page.get_text("html") or "").strip()}
            for number, page in enumerate(doc, start=1)
        ]
17
+
18
def process_upload(pdf_file, output_format):
    """Convert an uploaded PDF and return it as Markdown or JSON text.

    Args:
        pdf_file: The upload as delivered by Gradio: either a path string or a
            tempfile-like object exposing the path as ``.name``. ``None`` when
            nothing was uploaded.
        output_format: ``"markdown"`` for one Markdown document with pages
            separated by horizontal rules; any other value yields JSON.

    Returns:
        The converted document as a string.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if pdf_file is None:
        # Show a readable message in the UI rather than failing on None.name.
        raise gr.Error("Please upload a PDF file before converting.")

    # Accept both shapes of upload: a plain path string or a file wrapper.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    pages = convert_pdf_to_markdown(pdf_path)

    if output_format == "markdown":
        # Join all pages, separated by a horizontal rule.
        return "\n\n---\n\n".join(p["markdown"] for p in pages)

    # Pretty-printed JSON; non-ASCII characters are kept as-is.
    return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
33
+
34
+ # Gradio interface
35
# Gradio UI: a PDF upload plus a format selector in, a code viewer out.
demo = gr.Interface(
    title="PDF Markdown/JSON Converter",
    description=(
        "Upload a PDF and get back a professionally converted Markdown "
        "or a structured JSON with each page’s Markdown. "
        "PDFs with images or complex tables may still need manual review."
    ),
    fn=process_upload,
    inputs=[
        gr.File(label="Upload your PDF", file_types=[".pdf"]),
        gr.Radio(
            choices=["markdown", "json"],
            value="markdown",
            label="Output format",
        ),
    ],
    outputs=gr.Code(label="Converted Output"),
    # Example PDFs can be listed here if desired.
    examples=[],
)

if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)