Spaces:

ChinmayBH
/

PDF_DATA_EXTRACTOR_PAGEWISE

Running

App Files Files Community

ChinmayBH committed on Aug 14

Commit

c343a33

•

1 Parent(s): 4516170

updated app.py

Browse files

Files changed (1) hide show

app.py +46 -28

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import fitz
 from io import BytesIO
 from PIL import Image
 import pandas as pd
 import tempfile
 def extract_text_images(
@@ -14,28 +15,11 @@ def extract_text_images(
         ) -> dict:
     """
     Extracts text and/or images from a PDF and organizes them by pages.
-    Params
-    -------
-    pdf_path: str
-        Path to the input PDF file.
-    output_folder: str
-        Path to the output folder where extracted data will be saved.
-    minimum_font_size: int
-        Minimum font size below which the text will be ignored.
-    extraction_type: str
-        Type of extraction, either 'text', 'images', or 'both'.
-    Returns
-    -------
-    dict
-        The extracted data organized by pages.
     """
     if not os.path.exists(output_folder):
         os.makedirs(output_folder)
     extraction_data = []
     pdf_document = fitz.open(pdf_path)
     for page_number in range(pdf_document.page_count):
@@ -125,6 +109,9 @@ def extract_text_images(
     return extraction_data
 def convert_to_xlsx(data: dict) -> BytesIO:
     rows = []
     for item in data:
@@ -152,12 +139,33 @@ def convert_to_xlsx(data: dict) -> BytesIO:
     output.seek(0)
     return output
 def main():
     st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
     st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
     st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
     pdf_file = st.file_uploader("Upload PDF", type="pdf")
     if pdf_file is not None:
@@ -185,42 +193,52 @@ def main():
         min_value=1, value=2
     )
     if st.button("Start Extraction"):
-        if pdf_file is not None:
-            with tempfile.TemporaryDirectory() as output_folder:
-                temp_pdf_path = os.path.join(output_folder, pdf_file.name)
                 with open(temp_pdf_path, "wb") as f:
                     f.write(pdf_file.getvalue())
                 extraction_data = extract_text_images(
                     temp_pdf_path,
-                    output_folder,
                     minimum_font_size,
                     extraction_type
                 )
                 st.json(extraction_data)
                 xlsx_data = convert_to_xlsx(extraction_data)
                 col1, col2 = st.columns(2)
                 with col1:
                     st.download_button(
                         label="Download JSON",
                         data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
                         file_name='extraction_data.json',
-                        mime='application/json')
                 with col2:
                     st.download_button(
                         label="Download XLSX",
                         data=xlsx_data,
                         file_name='extraction_data.xlsx',
-                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
         else:
-            st.error("Please upload a PDF file.")
     st.markdown(
         """

 from io import BytesIO
 from PIL import Image
 import pandas as pd
+import zipfile
 import tempfile
 def extract_text_images(
         ) -> dict:
     """
     Extracts text and/or images from a PDF and organizes them by pages.
     """
     if not os.path.exists(output_folder):
         os.makedirs(output_folder)
     extraction_data = []
     pdf_document = fitz.open(pdf_path)
     for page_number in range(pdf_document.page_count):
     return extraction_data
 def convert_to_xlsx(data: dict) -> BytesIO:
+    """
+    Converts the extracted data to an XLSX file.
+    """
     rows = []
     for item in data:
     output.seek(0)
     return output
+def create_zip_with_json_and_images(output_folder, extraction_data):  # NOTE(review): output_folder is never read in this body
+    """
+    Creates a ZIP file containing both images and JSON data.
+    """
+    zip_buffer = BytesIO()  # archive is assembled in memory; nothing is written to disk here
+    with zipfile.ZipFile(zip_buffer, "w") as zip_file:
+        # Serialize the full extraction result as UTF-8 JSON and store it as a single archive entry
+        json_data = json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8')
+        zip_file.writestr("extraction_data.json", json_data)
+        # Walk each item's 'content' list and add every entry whose 'type' is 'image'
+        for item in extraction_data:
+            for content in item['content']:
+                if content['type'] == 'image':
+                    image_path = content['path']  # file is read from this path when it is added below
+                    image_name = os.path.basename(image_path)  # directory part is dropped, so entries sit at the archive root
+                    zip_file.write(image_path, image_name)  # NOTE(review): two images sharing a basename would get the same entry name
+    zip_buffer.seek(0)  # rewind so the caller reads the archive from its first byte
+    return zip_buffer
 def main():
     st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
     st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
     st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
     pdf_file = st.file_uploader("Upload PDF", type="pdf")
     if pdf_file is not None:
         min_value=1, value=2
     )
+    output_folder = st.text_input("Output folder path:")
     if st.button("Start Extraction"):
+        if pdf_file is not None and output_folder:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_pdf_path = os.path.join(temp_dir, pdf_file.name)
                 with open(temp_pdf_path, "wb") as f:
                     f.write(pdf_file.getvalue())
                 extraction_data = extract_text_images(
                     temp_pdf_path,
+                    temp_dir,
                     minimum_font_size,
                     extraction_type
                 )
                 st.json(extraction_data)
+                if extraction_type == 'images' or extraction_type == 'both':
+                    zip_data = create_zip_with_json_and_images(temp_dir, extraction_data)
+                    st.download_button(
+                        label="Download ZIP",
+                        data=zip_data,
+                        file_name='extraction_data.zip',
+                        mime='application/zip'
+                    )
                 xlsx_data = convert_to_xlsx(extraction_data)
                 col1, col2 = st.columns(2)
                 with col1:
                     st.download_button(
                         label="Download JSON",
                         data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
                         file_name='extraction_data.json',
+                        mime='application/json'
+                    )
                 with col2:
                     st.download_button(
                         label="Download XLSX",
                         data=xlsx_data,
                         file_name='extraction_data.xlsx',
+                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+                    )
         else:
+            st.error("Please upload a PDF file and provide an output folder path.")
     st.markdown(
         """