Spaces:

vsrinivas
/

PDF_to_IPython_NoteBook

Sleeping

App Files Files Community

vsrinivas committed on Jan 3

Commit

326f756

verified ·

1 Parent(s): 9e35974

Create app

Browse files

Files changed (1) hide show

app +157 -0

app ADDED Viewed

	@@ -0,0 +1,157 @@

+import ast
+import os
+
+import google.generativeai as genai
+import gradio as gr
+import nbformat
+import pdfplumber
+from nbformat.v4 import (new_notebook, new_markdown_cell, new_code_cell)
+def classify_page(statement):
+    genai.configure(api_key='AIzaSyBjGNru-WJoLncbVrErEzJE184MgX1o_Kg')
+    # Create the model
+    generation_config = {
+      "temperature": 0,
+      "max_output_tokens": 8192,
+      "response_mime_type": "text/plain",
+    }
+    model = genai.GenerativeModel(
+      model_name="gemini-1.5-flash-002",
+      generation_config=generation_config,
+    )
+    chat_session = model.start_chat(
+      history=[
+      ]
+    )
+    prompt = f"""
+    Group the following "Input" strings as substring blocks of "Code" or "Text".
+    The response content shall be strictly just a sequence of Python touples where the first element of each touple  either "Code" or "Text" and the second elemnt is the coressponding grouped substring block.
+    Input:
+    # Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.
+    The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.
+    The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.
+    # First, we start with the loading the required packages.
+    import pandas as pd
+    import numpy as np
+    import matplotlib.pyplot as plt
+    import matplotlib.dates as mdates
+    import requests
+    Then we access the website link, read the web page content and do some pre-processing.
+    fig, ax = plt.subplots()
+    ax.get_yaxis().get_major_formatter().set_scientific(False)
+    # Create a twin Axes object that shares the x-axis
+    ax2 = ax.twinx()
+    # Plot the new cumulative cases time-series in green
+    plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
+                "green", "Date" , "Cumulative no. confirmed of cases")
+    # Plot the new cumulative deaths data in green
+    plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
+    "orange", "Date" , "Cumulative no. of deaths")
+    # Plot the new daily cases time-series in blue
+    plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")
+    response_content:
+    [("Text", # Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.
+    The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.
+    The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.),
+    ("Code", # First, we start with the loading the required packages.
+    import pandas as pd
+    import numpy as np
+    import matplotlib.pyplot as plt
+    import matplotlib.dates as mdates
+    import requests),
+    ("Text", Then we access the website link, read the web page content and do some pre-processing.),
+    ("Code", fig, ax = plt.subplots()
+    ax.get_yaxis().get_major_formatter().set_scientific(False)
+    # Create a twin Axes object that shares the x-axis
+    ax2 = ax.twinx()
+    # Plot the new cumulative cases time-series in green
+    plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
+                "green", "Date" , "Cumulative no. confirmed of cases")
+    # Plot the new cumulative deaths data in green
+    plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
+    "orange", "Date" , "Cumulative no. of deaths")
+    # Plot the new daily cases time-series in blue
+    plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")]
+    Now, classify this string:
+    Input: {statement}
+    """
+    response = chat_session.send_message(prompt)
+    print(response.text.replace("```python\n", "").replace("```", "").strip())
+    return ast.literal_eval(response.text.replace("```python\n", "").replace("```", "").strip())
+def create_notebook(file, tc, bc):
+    notebook = new_notebook()
+    with pdfplumber.open(file) as pdf:
+        for p, page in enumerate(pdf.pages):
+        # Extract the text from the PDF
+            width, height = page.width, page.height
+            top_crop = tc  # Height of the header to exclude
+            bottom_crop = bc  # Height of the footer to exclude
+            crop_box = (0, top_crop, width, height - bottom_crop)
+            # Crop the page
+            cropped_page = page.within_bbox(crop_box)
+            text = cropped_page.extract_text()
+            # Split the text into lines
+            # lines = text.split('\n')
+            blocks = classify_page(text)
+            # print(blocks)
+            for c, value in blocks:
+                print(c)
+                print(value)
+                if c == "Code":
+                    notebook.cells.append(new_code_cell(value))
+                elif c == "Text":
+                    value = value.replace("\n", "\n\n")
+                    # notebook.cells.append(new_markdown_cell(value))
+                    notebook.cells.append(new_markdown_cell(value.replace('[[','').replace(']','')))
+            print(f"Page No.{p+1} completed")
+    file_path = 'your_ipynb_nOtebook_file'
+    # Write the notebook in UTF-8 encoding
+    with open(file_path + '.ipynb', 'w', encoding="utf-8") as f:
+        nbformat.write(notebook, f)
+    print(f'{file_path}.ipynb notebook created successfully.')
+    return f'{file_path}.ipynb'
+with gr.Blocks() as app:
+    gr.Markdown("## File Processor App")
+    file_input = gr.File(label="Upload a PDF file")
+    tc = gr.Slider(label='Top Crop in Pixels', value=25)
+    bc = gr.Slider(label='Bottom Crop in pixels', value=25)
+    download_button = gr.File(label="Download processed file")
+    process_button = gr.Button("Process File")
+    process_button.click(
+        fn=create_notebook,
+        inputs=[file_input, tc, bc],
+        outputs=download_button
+    )
+app.launch(debug=True)