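"""PDF to IPython Notebook converter.

Extracts the text of each page of an uploaded PDF with pdfplumber, asks a
Gemini model to split it into "Code" and "Text" blocks, and assembles the
blocks into a .ipynb file with nbformat. A small Gradio UI wraps the
pipeline.
"""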
import ast
import os

import gradio as gr
import pdfplumber
import google.generativeai as genai
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell


def classify_page(statement):
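    """Segment extracted page text into ("Code" | "Text", block) tuples.

    Sends a one-shot prompt to a Gemini model and parses its reply with
    ast.literal_eval, so the reply is expected to be a valid Python list
    of 2-tuples; a malformed reply will raise ValueError/SyntaxError.
    """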
    # Read the API key from the environment instead of hard-coding it in
    # source (the variable name GEMINI_API_KEY is a convention used here).
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])
    
    # Deterministic generation settings: temperature 0 keeps the
    # classification stable across runs.
    generation_config = {
        "temperature": 0,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    # Create the model
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash-002",
        generation_config=generation_config,
    )
    
    chat_session = model.start_chat(history=[])

    prompt = f"""
    Group the following "Input" strings as substring blocks of "Code" or "Text". 
    The response content shall be strictly just a sequence of Python touples where the first element of each touple  either "Code" or "Text" and the second elemnt is the coressponding grouped substring block.
    
    Input:
    # Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide. 

    The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.

    The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.

    # First, we start with the loading the required packages.
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    import requests

    Then we access the website link, read the web page content and do some pre-processing. 

    fig, ax = plt.subplots()
    ax.get_yaxis().get_major_formatter().set_scientific(False)

    # Create a twin Axes object that shares the x-axis
    ax2 = ax.twinx()

    # Plot the new cumulative cases time-series in green
    plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases, 
                "green", "Date" , "Cumulative no. confirmed of cases")

    # Plot the new cumulative deaths data in green
    plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths, 
    "orange", "Date" , "Cumulative no. of deaths")

    # Plot the new daily cases time-series in blue
    plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")

    response_content:
    [("Text", '''# Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide. 

    The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.

    The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.'''),
    ("Code", '''# First, we start with the loading the required packages.
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    import requests'''),
    ("Text", '''Then we access the website link, read the web page content and do some pre-processing.'''),
    ("Code", '''fig, ax = plt.subplots()
    ax.get_yaxis().get_major_formatter().set_scientific(False)

    # Create a twin Axes object that shares the x-axis
    ax2 = ax.twinx()

    # Plot the new cumulative cases time-series in green
    plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases, 
                "green", "Date" , "Cumulative no. confirmed of cases")

    # Plot the new cumulative deaths data in green
    plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths, 
    "orange", "Date" , "Cumulative no. of deaths")

    # Plot the new daily cases time-series in blue
    plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")''')]
    
    Now, classify this string:
    Input: {statement}
    """
    response = chat_session.send_message(prompt)
    # Strip any Markdown code fences the model may add, then parse the
    # remaining text as a Python literal list of tuples.
    return ast.literal_eval(response.text.replace("```python\n", "").replace("```", "").strip())

def create_notebook(file, tc, bc):
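    """Convert the uploaded PDF into a Jupyter notebook.

    Each page is cropped by `tc` points at the top and `bc` points at the
    bottom (to drop headers/footers), its text is extracted with pdfplumber
    and segmented by classify_page, and the resulting blocks are appended
    as code or Markdown cells. Returns the path of the written .ipynb file.
    """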
    notebook = new_notebook()
    with pdfplumber.open(file) as pdf:
        for p, page in enumerate(pdf.pages):
            # Crop away the page header and footer before extracting text
            width, height = page.width, page.height
            top_crop = tc  # height of the header to exclude, in points
            bottom_crop = bc  # height of the footer to exclude, in points

            crop_box = (0, top_crop, width, height - bottom_crop)
            cropped_page = page.within_bbox(crop_box)
            text = cropped_page.extract_text()

            # Classify the page text into alternating "Code"/"Text" blocks
            blocks = classify_page(text)
            for c, value in blocks:
                if c == "Code":
                    notebook.cells.append(new_code_cell(value))
                elif c == "Text":
                    # Double the newlines so Markdown renders paragraph breaks,
                    # and strip bracket artifacts left over from PDF extraction.
                    value = value.replace("\n", "\n\n")
                    notebook.cells.append(new_markdown_cell(value.replace('[[', '').replace(']', '')))
                    
            print(f"Page No.{p+1} completed")

    # Derive the output path from the input PDF name.
    file_path = file.split('.pdf')[0] + '.ipynb'

    # Write the notebook in UTF-8 encoding
    with open(file_path, 'w', encoding="utf-8") as f:
        nbformat.write(notebook, f)

    print(f'{file_path} notebook created successfully.')
    return file_path

    
with gr.Blocks() as app:
    gr.Markdown("""# PDF to IPython Notebook Converter App
    ## Upload a PDF document containing Python code and text, then press the Process File button to download the IPython notebook.
    ### Adjust the Top Crop and Bottom Crop values to control how much of each page's header and footer design content is discarded.""")

    file_input = gr.File(label="Upload a PDF file")
    tc = gr.Slider(label='Top Crop in Points', value=25)
    bc = gr.Slider(label='Bottom Crop in Points', value=25)
    
    download_button = gr.File(label="Download processed file")

    process_button = gr.Button("Process File")

    process_button.click(
        fn=create_notebook, 
        inputs=[file_input, tc, bc], 
        outputs=download_button
    )

# debug=True runs the server in the foreground and prints full tracebacks.
app.launch(debug=True)
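
# Usage sketch (assumes this file is saved as app.py, a hypothetical name,
# and that GEMINI_API_KEY is set in the environment):
#   GEMINI_API_KEY=<your key> python app.py
# then open the local URL that Gradio prints in a browser.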