Spaces:

barser65
/

assessment3

Build error

App Files Files Community

barser65 committed on Dec 11, 2023

Commit

ff92222

1 Parent(s): 8f0f72b

Delete myfunct.py

Browse files

Files changed (1) hide show

myfunct.py +0 -141

myfunct.py DELETED Viewed

@@ -1,141 +0,0 @@
def converti(path):
    """Summarize the abstract of a PDF paper.

    The PDF is read page by page with pdfminer, the text that follows the
    first ``Abstract`` heading is isolated, and that text is summarized with
    the ``facebook/bart-large-cnn`` model.

    Args:
        path: Location of the PDF. Inside Google Colab, Google Drive is
            mounted and ``path`` is taken relative to
            ``/content/drive/MyDrive/``. Anywhere else ``path`` is used
            as an ordinary filesystem path.

    Returns:
        The summary text produced by the summarization pipeline.

    Raises:
        ValueError: If no non-empty abstract can be located in the PDF.
    """
    import importlib
    import importlib.util
    import subprocess
    import sys

    def install(package):
        # pip offers no supported in-process API, so run it as a subprocess
        # of the current interpreter. check=False keeps this best-effort:
        # a failed install surfaces later as a normal ImportError.
        subprocess.run(
            [sys.executable, '-m', 'pip', 'install', package],
            check=False,
        )

    # Install only what this function imports, and only when it is missing,
    # so repeated calls do not reinstall packages every time. System packages
    # (poppler, tesseract) are not pip requirements and are not used here.
    requirements = (
        ('transformers', 'git+https://github.com/huggingface/transformers.git'),
        ('pdfminer', 'pdfminer.six'),
    )
    for module_name, requirement in requirements:
        if importlib.util.find_spec(module_name) is None:
            install(requirement)
    # Let the import system see packages installed a moment ago.
    importlib.invalidate_caches()

    # To analyze the PDF layout and extract text
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer

    def read_pdf(pdf_path):
        """Map 'Page_<n>' to the list of text blocks found on page n."""
        text_per_page = {}
        for pagenum, page in enumerate(extract_pages(pdf_path)):
            print("Elaborating Page_" + str(pagenum))
            # Sort the elements as they appear on the page: largest top
            # coordinate (y1) first.
            elements = sorted(
                page, key=lambda element: element.y1, reverse=True
            )
            text_per_page['Page_' + str(pagenum)] = [
                element.get_text()
                for element in elements
                if isinstance(element, LTTextContainer)
            ]
        return text_per_page

    try:
        from google.colab import drive
    except ImportError:
        # Not running in Colab: there is no Drive to mount, so treat the
        # argument as a regular path.
        pdf_path = path
    else:
        drive.mount('/content/drive')
        pdf_path = '/content/drive/MyDrive/' + path

    text_per_page = read_pdf(pdf_path)
    abstr = _extract_abstract(text_per_page)
    if not abstr:
        # Fail loudly instead of searching forever for a missing heading.
        raise ValueError('No abstract found in ' + repr(pdf_path))
    print(abstr)

    from transformers import pipeline
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    # truncation=True keeps an over-long extract within the model's input
    # limit instead of failing inside the model.
    summary = summarizer(abstr, max_length=56, truncation=True)
    return summary[0]['summary_text']


def _extract_abstract(text_per_page):
    """Return the abstract text found in the per-page text blocks, or ''.

    Pages are scanned in order. On each page the blocks are joined with a
    single space; the abstract is whatever follows the first line reading
    ``Abstract`` up to (not including) the next line that ends in ``1``.
    When that end marker is absent, the rest of the page is returned.
    """
    start_marker = 'Abstract\n'
    end_marker = '1\n'
    for page_blocks in text_per_page.values():
        page_text = ' '.join(page_blocks)
        start = page_text.find(start_marker)
        if start == -1:
            continue
        # Skip the heading itself plus the space introduced by the join.
        candidate = page_text[start + len(start_marker):].lstrip(' ')
        end = candidate.find(end_marker)
        if end != -1:
            candidate = candidate[:end]
        if candidate.strip():
            return candidate
    return ''