Spaces:

wfranco
/

abstract-summary

Runtime error

File size: 5,043 Bytes

#!pip install gradio
import gradio as gr
def read_pdf(pdf_path):
    """Extract the text elements of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict mapping ``'Page_<n>'`` (``n`` is the zero-based page index)
        to a five-item list ``[page_text, line_format, text_from_images,
        text_from_tables, page_content]``. ``page_text`` and
        ``page_content`` both hold the text returned by ``text_extraction``
        for each text element, in descending ``y1`` order; ``line_format``
        holds the matching format value. ``text_from_images`` and
        ``text_from_tables`` are always empty lists because this function
        performs no image or table extraction.
    """
    text_per_page = {}

    # Open each handle exactly once, outside the page loop, and let the
    # context managers close them even when extraction raises. Closing the
    # file object inside the loop would release it after the first page,
    # and opening pdfplumber per page would leak one handle per page.
    with open(pdf_path, 'rb') as pdf_file_obj, pdfplumber.open(pdf_path) as pdf:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)

        for pagenum, page in enumerate(extract_pages(pdf_path)):
            # NOTE(review): the PyPDF2 page and the pdfplumber tables are
            # looked up but nothing below consumes them. They are kept so a
            # page these libraries cannot handle still fails here; drop
            # them if table/image extraction is not going to be added.
            page_obj = pdf_reader.pages[pagenum]  # noqa: F841
            tables = pdf.pages[pagenum].find_tables()  # noqa: F841

            page_text = []
            line_format = []
            text_from_images = []
            text_from_tables = []
            page_content = []

            # Visit the layout elements by descending y1 so they are
            # processed in the order they appear on the page.
            page_elements = sorted(
                page._objs,
                key=lambda element: element.y1,
                reverse=True,
            )

            for element in page_elements:
                # Only text containers carry extractable text.
                if not isinstance(element, LTTextContainer):
                    continue
                line_text, format_per_line = text_extraction(element)
                page_text.append(line_text)
                line_format.append(format_per_line)
                page_content.append(line_text)

            text_per_page['Page_' + str(pagenum)] = [
                page_text,
                line_format,
                text_from_images,
                text_from_tables,
                page_content,
            ]

    return text_per_page
# Hard-coded input document.
# NOTE(review): the '/content/' prefix looks like a Colab path — confirm the
# file exists wherever this script is deployed.
pdf_path = '/content/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf'

# Runs the full extraction at import time (module-level side effect).
text_per_page = read_pdf(pdf_path)

# read_pdf keys each page as 'Page_<index>' with a zero-based index, so
# 'Page_0' is the first page. Raises KeyError if no page was extracted.
Page_0 = text_per_page['Page_0']

def nested_list_to_string(nested_list):
    """Concatenate every string found in an arbitrarily nested list.

    Nested lists are walked depth-first, left to right. Elements that are
    neither ``list`` nor ``str`` are ignored.

    Args:
        nested_list: An iterable whose items are strings, lists (nested to
            any depth), or other objects to skip.

    Returns:
        The strings joined in traversal order; ``''`` when there are none.
    """
    parts = []
    # An explicit stack of iterators replaces recursion, so nesting depth is
    # not bounded by the interpreter's recursion limit.
    pending = [iter(nested_list)]
    while pending:
        for element in pending[-1]:
            if isinstance(element, list):
                # Descend now; the parent iterator resumes where it stopped
                # once this child is exhausted.
                pending.append(iter(element))
                break
            if isinstance(element, str):
                parts.append(element)
        else:
            # The innermost iterator is exhausted: return to its parent.
            pending.pop()
    # A single join avoids re-copying the accumulated text on every append.
    return ''.join(parts)

# Re-binds the same first-page record and flattens it into a single string.
# NOTE(review): string_result is not referenced again in the visible part of
# this file.
Page_0 = text_per_page['Page_0']
string_result = nested_list_to_string(Page_0)

def extract_abstract(page_0):
    """Return the text between the 'Abstract' and 'Introduction' headings.

    Args:
        page_0: A (possibly nested) list of strings holding the page text.
            Items that are neither ``list`` nor ``str`` are ignored.

    Returns:
        The text after the first 'Abstract' and before the next
        'Introduction' that follows it, stripped of surrounding whitespace.
        If either marker is missing, the literal message
        ``"Abstract or Introduction section not found."`` is returned.
    """
    def _flatten(nested):
        """Yield every string in *nested*, depth-first, in order."""
        for item in nested:
            if isinstance(item, list):
                yield from _flatten(item)
            elif isinstance(item, str):
                yield item

    not_found = "Abstract or Introduction section not found."
    start_marker = 'Abstract'
    end_marker = 'Introduction'

    full_text = ''.join(_flatten(page_0))

    start_index = full_text.find(start_marker)
    if start_index == -1:
        return not_found

    # Look for the end marker only after the abstract heading: an
    # 'Introduction' earlier on the page (e.g. in the title) would otherwise
    # produce an inverted slice and silently return an empty string.
    body_start = start_index + len(start_marker)
    end_index = full_text.find(end_marker, body_start)
    if end_index == -1:
        return not_found

    return full_text[body_start:end_index].strip()

# Example usage: pull the abstract out of the first page's record.
# NOTE(review): extract_abstract returns the literal message
# "Abstract or Introduction section not found." when a heading is missing,
# and that message would then be summarized as if it were the abstract.
Page_0 = text_per_page['Page_0']
abstract_text = extract_abstract(Page_0)

wall_of_text = abstract_text

# NOTE(review): `summarizer` is not defined or imported in the visible part
# of this file; the keyword arguments and the result shape suggest a Hugging
# Face summarization pipeline — confirm and create it before this call.
result = summarizer(
    wall_of_text,
    min_length=1,
    max_length=30,
    no_repeat_ngram_size=3,
    encoder_no_repeat_ngram_size=3,
    repetition_penalty=3.5,
    num_beams=4,
    early_stopping=True,
)

# The result is indexed as a list whose first item is a dict; the summary
# text is stored under its 'summary_text' key.
summary_string = result[0]['summary_text']

print(summary_string)

def _show_summary():
    """Return the precomputed summary so the interface can display it."""
    return summary_string


# The gradio module is imported as `gr`. `inputs` and `outputs` take
# component specifiers (here the "text" shortcut), not the text to show;
# `inputs=None` builds an output-only interface for a zero-argument function.
app = gr.Interface(fn=_show_summary, inputs=None, outputs="text")
app.launch()