Adityadn committed on
Commit
3d40c3e
·
verified ·
1 Parent(s): 929af28

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -21
app.py CHANGED
@@ -1,13 +1,26 @@
1
  import gradio as gr
2
  import pypandoc
3
  import os
4
- # from pdf2docx import Converter
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- try: pypandoc.get_pandoc_version()
7
- except OSError: pypandoc.download_pandoc()
8
-
9
  # Daftar format yang didukung
10
- input_supported_formats = [data.upper() for data in sorted(list(pypandoc.get_pandoc_formats()[0]) or [
11
  'BIBLATEX', 'BIBTEX', 'BITS', 'COMMONMARK', 'COMMONMARK_X', 'CREOLE', 'CSLJSON', 'CSV',
12
  'DJOT', 'DOCBOOK', 'DOCX', 'DOKUWIKI', 'ENDNOTEXML', 'EPUB', 'FB2', 'GFM', 'HADDOCK',
13
  'HTML', 'IPYNB', 'JATS', 'JIRA', 'JSON', 'LATEX', 'MAN', 'MARKDOWN', 'MARKDOWN_GITHUB',
@@ -25,26 +38,26 @@ output_supported_formats = [data.upper() for data in sorted([
25
  "MARKDOWN_MMD", "MARKDOWN_PHPEXTRA", "MARKDOWN_STRICT", "MARKUA", "MEDIAWIKI", "MS",
26
  "MUSE", "NATIVE", "ODT", "OPENDOCUMENT", "OPML", "ORG", "PDF", "PLAIN", "PPTX", "REVEALJS",
27
  "RST", "RTF", "S5", "SLIDEOUS", "SLIDY", "TEI", "TEXINFO", "TEXTILE", "TYPST", "XWIKI", "ZIMWIKI"
28
- ]) if data not in ['PDF']]
29
 
30
- # def convert_pdf_to_docx(pdf_file):
31
- # """Konversi PDF ke DOCX menggunakan pdf2docx"""
32
- # output_docx = f"{os.path.splitext(pdf_file.name)[0]}.docx"
33
- # cv = Converter(pdf_file.name)
34
- # cv.convert(output_docx, start=0, end=None)
35
- # return output_docx
36
 
37
  def convert_document(doc_file, target_format):
38
  try:
39
  target_format = target_format.lower()
40
 
41
- # # If the file is a PDF, convert it to DOCX first
42
- # if isinstance(doc_file, str) and doc_file.lower().endswith('.pdf'):
43
- # print("Converting PDF to DOCX...")
44
- # doc_file = convert_pdf_to_docx(doc_file) # Pass the file path directly
45
- # print("PDF converted to DOCX.")
46
- # elif hasattr(doc_file, 'name'): # If it's a file-like object
47
- doc_file = doc_file.name # Get the file path from the file-like object
48
 
49
  # Get the base name of the file (without extension)
50
  base_name = os.path.splitext(os.path.basename(doc_file))[0]
@@ -56,7 +69,10 @@ def convert_document(doc_file, target_format):
56
  pypandoc.convert_file(
57
  doc_file,
58
  target_format.lower(), # Convert the format to lowercase
59
- outputfile=output_file
 
 
 
60
  )
61
 
62
  return output_file
@@ -74,4 +90,9 @@ interface = gr.Interface(
74
  title="Document Format Converter",
75
  description="Upload a document and select any target format for conversion.",
76
  css="footer {visibility: hidden}"
77
- )
 
 
 
 
 
 
1
  import gradio as gr
2
  import pypandoc
3
  import os
4
+ from pdf2docx import Converter
5
+
6
+ os.system('sudo apt-get install texlive')
7
+
8
def ensure_pandoc_installed():
    """Make sure pandoc is available, downloading it when it is missing.

    Probes for pandoc with ``pypandoc.get_pandoc_version()``. If that
    raises ``OSError``, falls back to ``pypandoc.download_pandoc()``.
    Progress messages are printed to stdout.
    """
    try:
        # Probe for an existing pandoc; OSError is treated as "not found".
        pypandoc.get_pandoc_version()
        print("Pandoc is already installed and accessible.")
    except OSError:
        # No pandoc found: fetch one through pypandoc.
        print("Pandoc not found, downloading...")
        pypandoc.download_pandoc()
        print("Pandoc downloaded successfully.")
18
+
19
+ # Pastikan Pandoc terpasang
20
+ ensure_pandoc_installed()
21
 
 
 
 
22
  # Daftar format yang didukung
23
+ input_supported_formats = [data.upper() for data in sorted(list(pypandoc.get_pandoc_formats()[0]).append('PDF') or [
24
  'BIBLATEX', 'BIBTEX', 'BITS', 'COMMONMARK', 'COMMONMARK_X', 'CREOLE', 'CSLJSON', 'CSV',
25
  'DJOT', 'DOCBOOK', 'DOCX', 'DOKUWIKI', 'ENDNOTEXML', 'EPUB', 'FB2', 'GFM', 'HADDOCK',
26
  'HTML', 'IPYNB', 'JATS', 'JIRA', 'JSON', 'LATEX', 'MAN', 'MARKDOWN', 'MARKDOWN_GITHUB',
 
38
  "MARKDOWN_MMD", "MARKDOWN_PHPEXTRA", "MARKDOWN_STRICT", "MARKUA", "MEDIAWIKI", "MS",
39
  "MUSE", "NATIVE", "ODT", "OPENDOCUMENT", "OPML", "ORG", "PDF", "PLAIN", "PPTX", "REVEALJS",
40
  "RST", "RTF", "S5", "SLIDEOUS", "SLIDY", "TEI", "TEXINFO", "TEXTILE", "TYPST", "XWIKI", "ZIMWIKI"
41
+ ])]
42
 
43
def convert_pdf_to_docx(pdf_file):
    """Convert a PDF to DOCX using pdf2docx and return the DOCX path.

    Args:
        pdf_file: Either a filesystem path (``str`` or ``os.PathLike``) or
            an object exposing the path as ``.name`` (e.g. an upload
            wrapper).

    Returns:
        Path of the generated ``.docx`` file: the input path with its
        extension replaced by ``.docx``.
    """
    # convert_document() passes a plain path string, which has no ``.name``;
    # accept both forms so either kind of caller works.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    output_docx = f"{os.path.splitext(pdf_path)[0]}.docx"
    cv = Converter(pdf_path)
    try:
        # start=0, end=None converts every page.
        cv.convert(output_docx, start=0, end=None)
    finally:
        # Release the underlying PDF handle even if conversion fails.
        cv.close()
    return output_docx
49
 
50
  def convert_document(doc_file, target_format):
51
  try:
52
  target_format = target_format.lower()
53
 
54
+ # If the file is a PDF, convert it to DOCX first
55
+ if isinstance(doc_file, str) and doc_file.lower().endswith('.pdf'):
56
+ print("Converting PDF to DOCX...")
57
+ doc_file = convert_pdf_to_docx(doc_file) # Pass the file path directly
58
+ print("PDF converted to DOCX.")
59
+ elif hasattr(doc_file, 'name'): # If it's a file-like object
60
+ doc_file = doc_file.name # Get the file path from the file-like object
61
 
62
  # Get the base name of the file (without extension)
63
  base_name = os.path.splitext(os.path.basename(doc_file))[0]
 
69
  pypandoc.convert_file(
70
  doc_file,
71
  target_format.lower(), # Convert the format to lowercase
72
+ outputfile=output_file,
73
+ extra_args=['-V geometry:margin=1.5cm',
74
+ # '--pdf-engine=/usr/bin/xelatex',
75
+ '--metadata', 'title="Converted Document by Flowly AI"']
76
  )
77
 
78
  return output_file
 
90
  title="Document Format Converter",
91
  description="Upload a document and select any target format for conversion.",
92
  css="footer {visibility: hidden}"
93
+ )
94
+
95
+ # Jalankan aplikasi
96
+ if __name__ == "__main__":
97
+ interface.launch()
98
+