Spaces:

ChinmayBH
/

PDF_DATA_EXTRACTOR_PAGEWISE

Running

App Files Community

ChinmayBH committed on Aug 14, 2024

Commit

96fadd5

verified ·

1 Parent(s): bcae609

updated app.py

Browse files

Files changed (1) hide show

app.py +36 -87

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import fitz
 from io import BytesIO
 from PIL import Image
 import pandas as pd
 def extract_text_images(
         pdf_path: str, output_folder: str,
@@ -42,19 +43,16 @@ def extract_text_images(
         elements = []
         if extraction_type in ('text', 'both'):
-            # Extract text blocks with their positions and font sizes
             text_blocks = page.get_text("dict")["blocks"]
             lines = {}
-            # Group text blocks by their vertical position (top) to form lines
             for block in text_blocks:
-                if block["type"] == 0:  # Text block
                     for line in block["lines"]:
                         for span in line["spans"]:
                             font_size = span["size"]
                             top = span["bbox"][1]
-                            # Skip text blocks with font size less than the minimum
                             if font_size < minimum_font_size:
                                 continue
@@ -62,7 +60,6 @@ def extract_text_images(
                                 lines[top] = []
                             lines[top].append(span)
-            # Process each line
             for top in sorted(lines.keys()):
                 line = lines[top]
                 line_text = " ".join([span['text'] for span in line])
@@ -77,7 +74,6 @@ def extract_text_images(
                 })
         if extraction_type in ('images', 'both'):
-            # Extract images using PyMuPDF
             image_list = page.get_images(full=True)
             for img_index, img in enumerate(image_list):
@@ -92,7 +88,6 @@ def extract_text_images(
                 with open(image_filename, "wb") as img_file:
                     img_file.write(image_bytes)
-                # Get the position of the image
                 img_rect = page.get_image_bbox(img)
                 elements.append({
                     'type': 'image',
@@ -102,10 +97,8 @@ def extract_text_images(
                     'top': img_rect.y0
                 })
-        # Sort elements by their vertical position (top) first, and then by horizontal position (x0)
         elements.sort(key=lambda e: (e['top'], e['x0']))
-        # Process elements to extract content pagewise
         page_content = []
         for element in elements:
             if element['type'] == 'text':
@@ -132,19 +125,6 @@ def extract_text_images(
     return extraction_data
 def convert_to_xlsx(data: dict) -> BytesIO:
-    """
-    Converts the extracted data to an XLSX file.
-    Params
-    -------
-    data: dict
-        The extracted data organized by pages.
-    Returns
-    -------
-    BytesIO
-        The XLSX file in memory.
-    """
     rows = []
     for item in data:
@@ -172,39 +152,20 @@ def convert_to_xlsx(data: dict) -> BytesIO:
     output.seek(0)
     return output
 def main():
     st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
     st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
-    # Sidebar for PDF preview
-    st.markdown(
-    """
-    <style>
-        .sidebar-header {
-            text-align: center;
-            color: blue;
-            padding: 5px 0;
-            font-size:30px;
-            font-weight: bold;
-        }
-    </style>
-    """,
-        unsafe_allow_html=True)
     st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
-    # File uploader
     pdf_file = st.file_uploader("Upload PDF", type="pdf")
     if pdf_file is not None:
-        # Slider to select number of pages to preview
         num_pages_to_preview = st.sidebar.slider(
             "Select number of pages to preview:",
             min_value=1, max_value=5, value=1
         )
-        # Display PDF preview for selected number of pages
         pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
         for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
             page = pdf_document.load_page(page_num)
@@ -212,66 +173,55 @@ def main():
             image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)
-    # Extraction type selector
     st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
     extraction_type = st.selectbox(
         "Choose extraction type:",
         ("text", "images", "both")
     )
-    # Minimum font size input
     st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
     minimum_font_size = st.number_input(
         "Minimum font size to extract:",
         min_value=1, value=2
     )
-    # Output folder path input (full path provided by the user)
-    output_folder = st.text_input(
-        "Output folder path:"
-    )
     if st.button("Start Extraction"):
-        if pdf_file is not None and output_folder:
-            # Save uploaded PDF to a temporary location
-            temp_pdf_path = os.path.join(output_folder, pdf_file.name)
-            with open(temp_pdf_path, "wb") as f:
-                f.write(pdf_file.getvalue())
-            # Call the extraction function
-            extraction_data = extract_text_images(
-                temp_pdf_path,
-                output_folder,
-                minimum_font_size,
-                extraction_type
-            )
-            # Display extracted JSON data
-            st.json(extraction_data)
-            # Convert data to XLSX
-            xlsx_data = convert_to_xlsx(extraction_data)
-            col1, col2 = st.columns(2)
-            with col1:
-                st.download_button(
-                    label="Download JSON",
-                    data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
-                    file_name='extraction_data.json',
-                    mime='application/json')
-            with col2:
-                st.download_button(
-                    label="Download XLSX",
-                    data=xlsx_data,
-                    file_name='extraction_data.xlsx',
-                    mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
         else:
-            st.error("Please upload a PDF file and provide an output folder path.")
-    # Footer (Fixed Position)
     st.markdown(
         """
         <style>
@@ -296,6 +246,5 @@ def main():
         unsafe_allow_html=True
     )
 if __name__ == "__main__":
     main()

 from io import BytesIO
 from PIL import Image
 import pandas as pd
+import tempfile
 def extract_text_images(
         pdf_path: str, output_folder: str,
         elements = []
         if extraction_type in ('text', 'both'):
             text_blocks = page.get_text("dict")["blocks"]
             lines = {}
             for block in text_blocks:
+                if block["type"] == 0:
                     for line in block["lines"]:
                         for span in line["spans"]:
                             font_size = span["size"]
                             top = span["bbox"][1]
                             if font_size < minimum_font_size:
                                 continue
                                 lines[top] = []
                             lines[top].append(span)
             for top in sorted(lines.keys()):
                 line = lines[top]
                 line_text = " ".join([span['text'] for span in line])
                 })
         if extraction_type in ('images', 'both'):
             image_list = page.get_images(full=True)
             for img_index, img in enumerate(image_list):
                 with open(image_filename, "wb") as img_file:
                     img_file.write(image_bytes)
                 img_rect = page.get_image_bbox(img)
                 elements.append({
                     'type': 'image',
                     'top': img_rect.y0
                 })
         elements.sort(key=lambda e: (e['top'], e['x0']))
         page_content = []
         for element in elements:
             if element['type'] == 'text':
     return extraction_data
 def convert_to_xlsx(data: dict) -> BytesIO:
     rows = []
     for item in data:
     output.seek(0)
     return output
 def main():
     st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
     st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
     st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
     pdf_file = st.file_uploader("Upload PDF", type="pdf")
     if pdf_file is not None:
         num_pages_to_preview = st.sidebar.slider(
             "Select number of pages to preview:",
             min_value=1, max_value=5, value=1
         )
         pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
         for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
             page = pdf_document.load_page(page_num)
             image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)
     st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
     extraction_type = st.selectbox(
         "Choose extraction type:",
         ("text", "images", "both")
     )
     st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
     minimum_font_size = st.number_input(
         "Minimum font size to extract:",
         min_value=1, value=2
     )
     if st.button("Start Extraction"):
+        if pdf_file is not None:
+            with tempfile.TemporaryDirectory() as output_folder:
+                temp_pdf_path = os.path.join(output_folder, pdf_file.name)
+                with open(temp_pdf_path, "wb") as f:
+                    f.write(pdf_file.getvalue())
+                extraction_data = extract_text_images(
+                    temp_pdf_path,
+                    output_folder,
+                    minimum_font_size,
+                    extraction_type
+                )
+                st.json(extraction_data)
+                xlsx_data = convert_to_xlsx(extraction_data)
+                col1, col2 = st.columns(2)
+                with col1:
+                    st.download_button(
+                        label="Download JSON",
+                        data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
+                        file_name='extraction_data.json',
+                        mime='application/json')
+                with col2:
+                    st.download_button(
+                        label="Download XLSX",
+                        data=xlsx_data,
+                        file_name='extraction_data.xlsx',
+                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
         else:
+            st.error("Please upload a PDF file.")
     st.markdown(
         """
         <style>
         unsafe_allow_html=True
     )
 if __name__ == "__main__":
     main()