Spaces:

wfranco
/

abstract-summary

Runtime error

App Files Files Community

wfranco committed on Dec 12, 2023

Commit

9e4f7af

1 Parent(s): f9d18db

Update app.py

Browse files

Files changed (1) hide show

app.py +131 -1

app.py CHANGED Viewed

@@ -8,4 +8,134 @@ iface.launch()
 demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

 demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+### Installing the packages ###
+# NOTE: "!pip install ..." and "!apt-get install ..." are IPython/Colab shell
+# escapes. They are a SyntaxError in a plain Python script such as app.py, so
+# the commands are recorded here as documentation only. On Hugging Face Spaces,
+# list the Python packages in requirements.txt and the system packages in
+# packages.txt so they are installed when the Space is built.
+#
+# requirements.txt
+#   PyPDF2        - To read the PDF file from the repository path.
+#   pdfminer.six  - To analyze the PDF layout and extract text.
+#   pdfplumber    - To identify tables in a PDF page and extract the information from them.
+#   pdf2image     - To convert the cropped PDF image to a PNG image.
+#   Pillow        - To read the PNG image.
+#   pytesseract   - To extract the text from the images using OCR technology.
+#
+# packages.txt
+#   poppler-utils
+#   tesseract-ocr
+#   libtesseract-dev
+###Importing libraries ###
+# To read the PDF
+import PyPDF2
+# To analyze the PDF layout and extract text
+from pdfminer.high_level import extract_pages, extract_text
+from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
+# To extract text from tables in PDF
+import pdfplumber
+# To extract the images from the PDFs
+from PIL import Image
+from pdf2image import convert_from_path
+# To perform OCR to extract text from images
+import pytesseract
+# To remove the additional created files
+import os
+### Create a function to extract text ###
+def text_extraction(element):
+    """Return the text of a layout element together with the formats used in it.
+
+    Args:
+        element: A layout object that exposes ``get_text()`` and can be
+            iterated to obtain its child lines.
+
+    Returns:
+        A ``(text, formats)`` tuple. ``text`` is whatever ``get_text()``
+        returns. ``formats`` is a list of the distinct font names and font
+        sizes seen on the element's characters, mixed together in a single
+        list; because it is built from a set, its order is not defined.
+    """
+    # Read the text first, before walking the children
+    content = element.get_text()
+    # Walk line -> character, keeping only LTTextContainer lines and LTChar
+    # characters, and record each character's font name followed by its size
+    observed = [
+        attribute
+        for child in element
+        if isinstance(child, LTTextContainer)
+        for glyph in child
+        if isinstance(glyph, LTChar)
+        for attribute in (glyph.fontname, glyph.size)
+    ]
+    # Collapse duplicates so each font name / size is reported once
+    distinct_formats = list(set(observed))
+    return (content, distinct_formats)
+### Step 4: Create a function that reads a PDF file ###
+def read_pdf(pdf_path):
+    """Extract the text of every page of a PDF, in top-to-bottom order.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        A dict keyed by ``'Page_<n>'`` (``n`` starts at 0). Each value is the
+        list ``[page_text, line_format, text_from_images, text_from_tables,
+        page_content]`` where ``page_text`` and ``page_content`` hold the text
+        of each text element, ``line_format`` holds the matching format list
+        returned by ``text_extraction``, and the image/table entries are
+        always empty lists because this function performs no image or table
+        extraction.
+
+    Earlier revisions also opened the file with PyPDF2 and pdfplumber but never
+    used the results, closed the PyPDF2 file handle inside the page loop and
+    never closed the pdfplumber handles. Those unused handles are no longer
+    opened, so nothing is leaked and nothing is closed too early.
+    """
+    text_per_page = {}
+    # We extract the pages from the PDF
+    for pagenum, page in enumerate(extract_pages(pdf_path)):
+        print("Elaborating Page_" + str(pagenum))
+        # Per-page accumulators
+        page_text = []
+        line_format = []
+        text_from_images = []
+        text_from_tables = []
+        page_content = []
+        # y1 is the top side of an element; sorting on it in descending order
+        # visits the elements as they appear on the page. Iterating the page
+        # replaces the original read of the private ``_objs`` attribute.
+        ordered_elements = sorted(page, key=lambda item: item.y1, reverse=True)
+        for element in ordered_elements:
+            # Only text elements contribute to the result
+            if not isinstance(element, LTTextContainer):
+                continue
+            # Use the function to extract the text and format for each text element
+            line_text, format_per_line = text_extraction(element)
+            page_text.append(line_text)
+            line_format.append(format_per_line)
+            page_content.append(line_text)
+        # Create the key of the dictionary
+        dctkey = 'Page_' + str(pagenum)
+        # Add the list of lists as the value of the page key
+        text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]
+    return text_per_page