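"""PDF to IPython Notebook converter.

Extracts the text of each page of an uploaded PDF with pdfplumber, asks a
Gemini model to split it into "Code" and "Text" blocks, and assembles the
blocks into a .ipynb file with nbformat. A small Gradio UI wraps the
pipeline.
"""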
import ast
import os

import gradio as gr
import pdfplumber
import google.generativeai as genai
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell


def classify_page(statement):
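    """Segment extracted page text into ("Code" | "Text", block) tuples.

    Sends a one-shot prompt to a Gemini model and parses its reply with
    ast.literal_eval, so the reply is expected to be a valid Python list
    of 2-tuples; a malformed reply will raise ValueError/SyntaxError.
    """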
    # Read the API key from the environment instead of hard-coding it in
    # source (the variable name GEMINI_API_KEY is a convention used here).
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])
    
    # Deterministic generation settings: temperature 0 keeps the
    # classification stable across runs.
    generation_config = {
        "temperature": 0,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    # Create the model
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash-002",
        generation_config=generation_config,
    )
    
    chat_session = model.start_chat(history=[])

    prompt = f"""
    Group the following "Input" strings as substring blocks of "Code" or "Text". 
    The response content shall be strictly just a sequence of Python touples where the first element of each touple  either "Code" or "Text" and the second elemnt is the coressponding grouped substring block.
    
    Input:
    # Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide. 

    The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.

    The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.

    # First, we start with the loading the required packages.
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    import requests

    Then we access the website link, read the web page content and do some pre-processing. 

    fig, ax = plt.subplots()
    ax.get_yaxis().get_major_formatter().set_scientific(False)

    # Create a twin Axes object that shares the x-axis
    ax2 = ax.twinx()

    # Plot the new cumulative cases time-series in green
    plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases, 
                "green", "Date" , "Cumulative no. confirmed of cases")

    # Plot the new cumulative deaths data in green
    plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths, 
    "orange", "Date" , "Cumulative no. of deaths")

    # Plot the new daily cases time-series in blue
    plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")

    response_content:
    [("Text", '''# Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide. 

    The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.

    The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.'''),
    ("Code", '''# First, we start with the loading the required packages.
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    import requests'''),
    ("Text", '''Then we access the website link, read the web page content and do some pre-processing.'''),
    ("Code", '''fig, ax = plt.subplots()
    ax.get_yaxis().get_major_formatter().set_scientific(False)

    # Create a twin Axes object that shares the x-axis
    ax2 = ax.twinx()

    # Plot the new cumulative cases time-series in green
    plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases, 
                "green", "Date" , "Cumulative no. confirmed of cases")

    # Plot the new cumulative deaths data in green
    plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths, 
    "orange", "Date" , "Cumulative no. of deaths")

    # Plot the new daily cases time-series in blue
    plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")''')]
    
    Now, classify this string:
    Input: {statement}
    """
    response = chat_session.send_message(prompt)
    # Strip any Markdown code fences the model may add, then parse the
    # remaining text as a Python literal list of tuples.
    return ast.literal_eval(response.text.replace("```python\n", "").replace("```", "").strip())

def create_notebook(file, tc, bc):
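    """Convert the uploaded PDF into a Jupyter notebook.

    Each page is cropped by `tc` points at the top and `bc` points at the
    bottom (to drop headers/footers), its text is extracted with pdfplumber
    and segmented by classify_page, and the resulting blocks are appended
    as code or Markdown cells. Returns the path of the written .ipynb file.
    """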
    notebook = new_notebook()
    with pdfplumber.open(file) as pdf:
        for p, page in enumerate(pdf.pages):
            # Crop away the page header and footer before extracting text
            width, height = page.width, page.height
            top_crop = tc  # height of the header to exclude, in points
            bottom_crop = bc  # height of the footer to exclude, in points

            crop_box = (0, top_crop, width, height - bottom_crop)
            cropped_page = page.within_bbox(crop_box)
            text = cropped_page.extract_text()

            # Classify the page text into alternating "Code"/"Text" blocks
            blocks = classify_page(text)
            for c, value in blocks:
                if c == "Code":
                    notebook.cells.append(new_code_cell(value))
                elif c == "Text":
                    # Double the newlines so Markdown renders paragraph breaks,
                    # and strip bracket artifacts left over from PDF extraction.
                    value = value.replace("\n", "\n\n")
                    notebook.cells.append(new_markdown_cell(value.replace('[[', '').replace(']', '')))
                    
            print(f"Page No.{p+1} completed")

    # Derive the output path from the input PDF name.
    file_path = file.split('.pdf')[0] + '.ipynb'

    # Write the notebook in UTF-8 encoding
    with open(file_path, 'w', encoding="utf-8") as f:
        nbformat.write(notebook, f)

    print(f'{file_path} notebook created successfully.')
    return file_path

    
with gr.Blocks() as app:
    gr.Markdown("""# PDF to IPython Notebook Converter App
    ## Upload a PDF document containing Python code and text, then press the Process File button to download the IPython notebook.
    ### Adjust the Top Crop and Bottom Crop values to control how much of each page's header and footer design content is discarded.""")

    file_input = gr.File(label="Upload a PDF file")
    tc = gr.Slider(label='Top Crop in Points', value=25)
    bc = gr.Slider(label='Bottom Crop in Points', value=25)
    
    download_button = gr.File(label="Download processed file")

    process_button = gr.Button("Process File")

    process_button.click(
        fn=create_notebook, 
        inputs=[file_input, tc, bc], 
        outputs=download_button
    )

# debug=True runs the server in the foreground and prints full tracebacks.
app.launch(debug=True)
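
# Usage sketch (assumes this file is saved as app.py, a hypothetical name,
# and that GEMINI_API_KEY is set in the environment):
#   GEMINI_API_KEY=<your key> python app.py
# then open the local URL that Gradio prints in a browser.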