Adityadn committed on
Commit
73bb23a
·
verified ·
1 Parent(s): 492542c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -0
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pypandoc
3
+ import os
4
+ from pdf2docx import Converter
5
+
6
def ensure_pandoc_installed(pandoc_dir='/home/user/bin'):
    """Make sure a pandoc binary is available, downloading one if necessary.

    Probes for pandoc through ``pypandoc.get_pandoc_version()``. If that
    raises ``OSError``, pandoc is downloaded with
    ``pypandoc.download_pandoc()`` and ``pandoc_dir`` is appended to the
    ``PATH`` environment variable of the current process.

    Args:
        pandoc_dir: Directory to append to ``PATH`` after a download.
            Defaults to the location the original code hard-coded.
            NOTE(review): presumably this is where ``download_pandoc()``
            places the binary for this deployment -- confirm.

    Raises:
        Whatever ``pypandoc.download_pandoc()`` raises if the download fails.
    """
    try:
        # Try to access pandoc's version; this fails when pandoc is missing.
        pypandoc.get_pandoc_version()
    except OSError:
        # Attempt to download pandoc
        print("Pandoc not found, downloading...")
        pypandoc.download_pandoc()
        print("Pandoc downloaded successfully.")

        # Explicitly expose the downloaded pandoc on PATH. Read PATH with a
        # default so an unset variable does not raise KeyError, and skip the
        # append when the directory is already listed so repeated calls do
        # not keep growing PATH.
        current_path = os.environ.get('PATH', '')
        if pandoc_dir not in current_path.split(os.pathsep):
            if current_path:
                os.environ['PATH'] = current_path + os.pathsep + pandoc_dir
            else:
                os.environ['PATH'] = pandoc_dir
            print("Pandoc path added to system environment.")
    else:
        print("Pandoc is already installed and accessible.")
20
+
21
+ # Make sure Pandoc is installed
22
+ # ensure_pandoc_installed()
23
+
24
# Supported formats

# Input formats to offer when pandoc cannot be queried or reports nothing.
_FALLBACK_INPUT_FORMATS = [
    'BIBLATEX', 'BIBTEX', 'BITS', 'COMMONMARK', 'COMMONMARK_X', 'CREOLE', 'CSLJSON', 'CSV',
    'DJOT', 'DOCBOOK', 'DOCX', 'DOKUWIKI', 'ENDNOTEXML', 'EPUB', 'FB2', 'GFM', 'HADDOCK',
    'HTML', 'IPYNB', 'JATS', 'JIRA', 'JSON', 'LATEX', 'MAN', 'MARKDOWN', 'MARKDOWN_GITHUB',
    'MARKDOWN_MMD', 'MARKDOWN_PHPEXTRA', 'MARKDOWN_STRICT', 'MDOC', 'MEDIAWIKI', 'MUSE',
    'NATIVE', 'ODT', 'OPML', 'ORG', 'PDF', 'POD', 'RIS', 'RST', 'RTF', 'T2T', 'TEXTILE',
    'TIKIWIKI', 'TSV', 'TWIKI', 'TYPST', 'VIMWIKI',
]

try:
    # First element of the returned pair holds the input (reader) formats.
    _detected_input_formats = pypandoc.get_pandoc_formats()[0]
except OSError:
    # pypandoc signals a missing pandoc binary with OSError. Without this
    # guard the module would fail at import time and the fallback list
    # could never be used in the situation it exists for.
    _detected_input_formats = []

# Sort first, then upper-case, exactly as before (the order of names that
# contain "_" depends on the case they are sorted in).
input_supported_formats = [
    name.upper()
    for name in sorted(_detected_input_formats or _FALLBACK_INPUT_FORMATS)
]

# Output formats that are listed but deliberately not offered.
# NOTE(review): presumably PDF is excluded because pandoc needs an external
# PDF engine to write it -- confirm.
_EXCLUDED_OUTPUT_FORMATS = ['PDF']

output_supported_formats = [
    name.upper()
    for name in sorted([
        "ANSI", "ASCIIDOC", "ASCIIDOC_LEGACY", "ASCIIDOCTOR", "BEAMER", "BIBLATEX", "BIBTEX", "CHUNKEDHTML",
        "COMMONMARK", "COMMONMARK_X", "CONTEXT", "CSLJSON", "DJOT", "DOCBOOK", "DOCBOOK4", "DOCBOOK5",
        "DOCX", "DOKUWIKI", "DZSLIDES", "EPUB", "EPUB2", "EPUB3", "FB2", "GFM", "HADDOCK", "HTML",
        "HTML4", "HTML5", "ICML", "IPYNB", "JATS", "JATS_ARCHIVING", "JATS_ARTICLEAUTHORING",
        "JATS_PUBLISHING", "JIRA", "JSON", "LATEX", "MAN", "MARKDOWN", "MARKDOWN_GITHUB",
        "MARKDOWN_MMD", "MARKDOWN_PHPEXTRA", "MARKDOWN_STRICT", "MARKUA", "MEDIAWIKI", "MS",
        "MUSE", "NATIVE", "ODT", "OPENDOCUMENT", "OPML", "ORG", "PDF", "PLAIN", "PPTX", "REVEALJS",
        "RST", "RTF", "S5", "SLIDEOUS", "SLIDY", "TEI", "TEXINFO", "TEXTILE", "TYPST", "XWIKI", "ZIMWIKI",
    ])
    if name not in _EXCLUDED_OUTPUT_FORMATS
]
44
+
45
def convert_pdf_to_docx(pdf_file):
    """Convert a PDF to DOCX using pdf2docx.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing the path as ``.name`` (for example an uploaded
            temp-file wrapper).

    Returns:
        Path of the generated ``.docx`` file, written next to the source PDF
        with the same base name.
    """
    # Accept a plain path as well as a file-like object; the original only
    # handled the latter although its caller passed a string.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    output_docx = f"{os.path.splitext(pdf_path)[0]}.docx"
    cv = Converter(pdf_path)
    try:
        cv.convert(output_docx, start=0, end=None)
    finally:
        # Always release the underlying PDF handle, even if conversion fails.
        cv.close()
    return output_docx


def convert_document(doc_file, target_format):
    """Convert an uploaded document to ``target_format`` with pandoc.

    PDF inputs are first converted to DOCX via ``convert_pdf_to_docx`` and
    the resulting DOCX is then handed to pandoc.

    Args:
        doc_file: Path of the uploaded file, or an object exposing the path
            as ``.name``.
        target_format: Output format name; it is lower-cased before use.

    Returns:
        Path of the converted file, named
        ``document_converter_<base name>.<target format>`` and written
        relative to the current working directory. On any failure the
        string ``"Error: <message>"`` is returned instead of raising.
    """
    try:
        target_format = target_format.lower()

        # Reduce the upload to a plain path first, so the PDF check below
        # covers both path strings and file-like uploads.
        if not isinstance(doc_file, str) and hasattr(doc_file, 'name'):
            doc_file = doc_file.name

        # If the file is a PDF, convert it to DOCX first
        if isinstance(doc_file, str) and doc_file.lower().endswith('.pdf'):
            print("Converting PDF to DOCX...")
            doc_file = convert_pdf_to_docx(doc_file)
            print("PDF converted to DOCX.")

        # Base name of the (possibly intermediate) file, without extension
        base_name = os.path.splitext(os.path.basename(doc_file))[0]
        output_file = f"document_converter_{base_name}.{target_format}"

        # Use pypandoc to convert the file
        pypandoc.convert_file(doc_file, target_format, outputfile=output_file)

        return output_file
    except Exception as e:
        # NOTE(review): this string goes to a gr.File output, which expects
        # a file path; raising gr.Error would surface the message more
        # cleanly. Kept as-is to preserve the existing return contract.
        return f"Error: {e}"
80
+
81
# Gradio interface with a custom theme

# Extensions to accept in addition to the raw pandoc format names.
# Format names are not always the extension files actually carry
# (e.g. "markdown" vs ".md"), so without these such files cannot be picked
# in the upload dialog. ".pdf" is included because convert_document has a
# dedicated PDF path, while pandoc's own input list has no PDF reader.
# NOTE(review): .md/.tex/.dbk are the short forms pypandoc is understood to
# map to markdown/latex/docbook -- confirm against the installed version.
_EXTRA_UPLOAD_EXTENSIONS = ['.md', '.tex', '.dbk', '.pdf']

_upload_file_types = [f'.{ext.lower()}' for ext in input_supported_formats]
_upload_file_types += [
    ext for ext in _EXTRA_UPLOAD_EXTENSIONS if ext not in _upload_file_types
]

interface = gr.Interface(
    fn=convert_document,
    inputs=[
        gr.File(label="Upload Document", file_types=_upload_file_types),
        gr.Dropdown(label="Select Output Format", choices=output_supported_formats),
    ],
    outputs=gr.File(label="Converted Document"),
    title="Document Format Converter",
    description="Upload a document and select any target format for conversion.",
    # Hide the default Gradio footer.
    css="footer {visibility: hidden}",
)