File size: 5,043 Bytes
1d37909
9bedcbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96f46f4
9bedcbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96f46f4
 
9bedcbe
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!pip install gradio
import gradio as gr
def read_pdf(pdf_path):
    """Extract the text of every page of a PDF, keyed by page.

    Each page layout yielded by ``extract_pages`` is walked from top to
    bottom and every ``LTTextContainer`` element is handed to
    ``text_extraction``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict mapping ``'Page_<n>'`` (``n`` starts at 0) to a five-item list
        ``[page_text, line_format, text_from_images, text_from_tables,
        page_content]``. ``page_text`` and ``page_content`` are separate lists
        that both hold the text returned by ``text_extraction`` for each text
        element; ``line_format`` holds the matching format value. The image
        and table slots are always empty lists because this function does not
        extract either.
    """
    # NOTE: no PyPDF2 / pdfplumber handles are opened here. Nothing in this
    # function consumed them, and keeping them meant one file handle closed
    # inside the page loop and one leaked pdfplumber document per page.
    text_per_page = {}

    for pagenum, page in enumerate(extract_pages(pdf_path)):
        page_text = []
        line_format = []
        page_content = []

        # Order the layout elements by descending top edge (y1) so they are
        # visited as they appear down the page. sorted() is stable, so
        # elements sharing the same y1 keep their layout order.
        elements = sorted(page._objs, key=lambda element: element.y1, reverse=True)

        for element in elements:
            # Only text containers carry extractable text; skip the rest.
            if not isinstance(element, LTTextContainer):
                continue
            line_text, format_per_line = text_extraction(element)
            page_text.append(line_text)
            line_format.append(format_per_line)
            page_content.append(line_text)

        # Slots 2 and 3 (images, tables) are kept so the per-page list keeps
        # its five-item layout.
        text_per_page[f'Page_{pagenum}'] = [page_text, line_format, [], [], page_content]

    return text_per_page
# Hard-coded location of the PDF to process.
pdf_path = '/content/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf'

# Parse the whole document at import time; the result maps 'Page_<n>' keys to per-page lists.
text_per_page = read_pdf(pdf_path)

# Keep the first page's entry; raises KeyError if read_pdf produced no 'Page_0' key.
Page_0 = text_per_page['Page_0']

def nested_list_to_string(nested_list):
    """Concatenate every string found in an arbitrarily nested list.

    Elements are visited depth-first, left to right. Strings are appended in
    that order, nested ``list`` objects are descended into, and every other
    element type (numbers, ``None``, tuples, ...) is ignored.

    The walk is iterative, so deeply nested input does not hit Python's
    recursion limit. Self-referencing lists are not supported.

    Args:
        nested_list: Iterable whose items are strings, lists, or values to skip.

    Returns:
        The concatenation of all strings found, or ``''`` if there are none.
    """
    parts = []
    # Each stack entry is a partially consumed iterator; the top one is the
    # list currently being walked.
    pending = [iter(nested_list)]

    while pending:
        for element in pending[-1]:
            if isinstance(element, list):
                # Descend now; the parent iterator resumes after the child
                # has been exhausted.
                pending.append(iter(element))
                break
            if isinstance(element, str):
                parts.append(element)
        else:
            # Current level fully consumed.
            pending.pop()

    return ''.join(parts)

# Re-reads the first page's entry (Page_0 is already bound to the same value earlier in the script).
Page_0 = text_per_page['Page_0']
# Flatten every string in the page entry into one string.
# NOTE(review): string_result is not referenced again in the visible part of this script.
string_result = nested_list_to_string(Page_0)

def extract_abstract(page_0):
    """Return the text between the 'Abstract' and 'Introduction' headings.

    The nested page structure is flattened into a single string. The abstract
    is the text after the first occurrence of ``'Abstract'`` up to the first
    occurrence of ``'Introduction'`` that follows it, with surrounding
    whitespace stripped.

    Args:
        page_0: Nested list of strings (non-string, non-list items are ignored).

    Returns:
        The abstract text, or the message
        ``"Abstract or Introduction section not found."`` when either heading
        is missing (including when 'Introduction' only occurs before
        'Abstract').
    """
    start_marker = 'Abstract'
    end_marker = 'Introduction'

    def _flatten(items):
        # Depth-first concatenation of all strings; other item types are skipped.
        return ''.join(
            _flatten(item) if isinstance(item, list) else item
            for item in items
            if isinstance(item, (list, str))
        )

    full_text = _flatten(page_0)

    start_index = full_text.find(start_marker)
    if start_index == -1:
        return "Abstract or Introduction section not found."

    body_start = start_index + len(start_marker)
    # Search only after the Abstract heading: an earlier 'Introduction'
    # (e.g. in the title) would otherwise give end < start and an empty slice.
    end_index = full_text.find(end_marker, body_start)
    if end_index == -1:
        return "Abstract or Introduction section not found."

    return full_text[body_start:end_index].strip()

# Extract the abstract from the first page's entry.
Page_0 = text_per_page['Page_0']
abstract_text = extract_abstract(Page_0)

# NOTE(review): extract_abstract returns a "not found" message instead of raising,
# so on failure that message is what gets passed to the summarizer below.
wall_of_text = abstract_text

# `summarizer` is not defined in the visible part of this file — presumably a
# Hugging Face transformers summarization pipeline; TODO confirm.
# The keyword arguments are passed straight through to it.
result = summarizer(
    wall_of_text,
    min_length=1,
    max_length=30,
    no_repeat_ngram_size=3,
    encoder_no_repeat_ngram_size=3,
    repetition_penalty=3.5,
    num_beams=4,
    early_stopping=True,
)

# The result is indexed as a list whose first item is a dict; take its 'summary_text' value.
summary_string = result[0]['summary_text']

print(summary_string)

# Build and launch the Gradio UI.
# The gradio module is imported as `gr`, and `inputs` / `outputs` must identify
# Gradio components (here the "text" shortcut) rather than carry the summary text.
# NOTE(review): `user_greeting` is not defined in the visible part of this file —
# confirm it exists (or substitute the intended callback) before launching.
app = gr.Interface(fn=user_greeting, inputs="text", outputs="text")
app.launch()