"""Gradio app: convert a PDF of mixed Python code and prose into a Jupyter notebook.

Each PDF page is cropped (to drop header/footer artwork), its text is extracted
with pdfplumber, and a Gemini model classifies contiguous text blocks as
"Code" or "Text"; those blocks become code / markdown cells in the output
``.ipynb`` written next to the source PDF.
"""

import ast
import os
import re

import google.generativeai as genai
import gradio as gr
import nbformat
import pdfplumber
from nbformat.v4 import new_code_cell, new_markdown_cell, new_notebook

# NOTE(security): the API key was hard-coded in this source file. Prefer the
# GEMINI_API_KEY environment variable; the literal fallback only preserves the
# original behavior — this key is exposed and should be revoked and removed.
_API_KEY = os.environ.get("GEMINI_API_KEY", "AIzaSyBjGNru-WJoLncbVrErEzJE184MgX1o_Kg")


def classify_page(statement):
    """Ask Gemini to split *statement* into ("Code" | "Text", block) tuples.

    Sends a one-shot prompt (a worked example followed by the page text) and
    parses the model's reply as a Python literal.

    Returns:
        list[tuple[str, str]]: classified blocks, or [] when the model reply
        cannot be parsed as a Python literal.
    """
    genai.configure(api_key=_API_KEY)

    generation_config = {
        "temperature": 0,  # deterministic output for reproducible classification
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash-002",
        generation_config=generation_config,
    )
    chat_session = model.start_chat(history=[])

    # One-shot prompt: instructions, a worked example (Input + expected tuple
    # sequence), then the page text to classify.
    prompt = f"""
Group the following "Input" strings as substring blocks of "Code" or "Text".
The response content shall be strictly just a sequence of Python tuples where
the first element of each tuple is either "Code" or "Text" and the second
element is the corresponding grouped substring block.

Input:
# Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.
The data was scraped from 'worldometer' web site at
https://www.worldometers.info/coronavirus/ and the analysis was carried out
using 'Python' programming language and various related libraries. The
worldometer web site provides the data more on cumulative basis and therefore,
this report and effort also include the process of gathering daily data.
# First, we start with the loading the required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import requests
Then we access the website link, read the web page content and do some pre-processing.
fig, ax = plt.subplots()
ax.get_yaxis().get_major_formatter().set_scientific(False)
# Create a twin Axes object that shares the x-axis
ax2 = ax.twinx()
# Plot the new cumulative cases time-series in green
plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases, "green", "Date" , "Cumulative no. confirmed of cases")
# Plot the new cumulative deaths data in green
plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths, "orange", "Date" , "Cumulative no. of deaths")
# Plot the new daily cases time-series in blue
plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")

response_content:
[("Text", # Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide. The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries. The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.),
("Code", # First, we start with the loading the required packages. import pandas as pd import numpy as np import matplotlib.pyplot as plt import matplotlib.dates as mdates import requests),
("Text", Then we access the website link, read the web page content and do some pre-processing.),
("Code", fig, ax = plt.subplots() ax.get_yaxis().get_major_formatter().set_scientific(False) # Create a twin Axes object that shares the x-axis ax2 = ax.twinx() # Plot the new cumulative cases time-series in green plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases, "green", "Date" , "Cumulative no. confirmed of cases") # Plot the new cumulative deaths data in green plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths, "orange", "Date" , "Cumulative no. of deaths") # Plot the new daily cases time-series in blue plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")]

Now, classify this string:
Input:
{statement}
"""
    response = chat_session.send_message(prompt)

    # Strip the model's markdown code fences, then drop any non-printable /
    # non-ASCII characters that would break literal parsing.
    cleaned = response.text.replace("```python\n", "").replace("```", "").strip()
    cleaned = re.sub(r"[^\x20-\x7E]", "", cleaned)
    print(cleaned)

    try:
        return ast.literal_eval(cleaned)
    except (SyntaxError, ValueError) as err:
        # Best-effort: skip a page whose reply is not a valid Python literal
        # instead of crashing the whole conversion.
        print(f"Could not parse model response: {err}")
        return []


def create_notebook(file, tc, bc):
    """Convert the PDF at path *file* into a Jupyter notebook next to it.

    Args:
        file: path to the uploaded PDF.
        tc: pixels cropped from the top of every page (removes headers).
        bc: pixels cropped from the bottom of every page (removes footers).

    Returns:
        str: path of the written ``.ipynb`` file.
    """
    notebook = new_notebook()
    with pdfplumber.open(file) as pdf:
        for p, page in enumerate(pdf.pages):
            # Crop away the header/footer bands before extracting text.
            width, height = page.width, page.height
            crop_box = (0, tc, width, height - bc)
            cropped_page = page.within_bbox(crop_box)
            text = cropped_page.extract_text()
            if not text:
                continue  # nothing left on this page after cropping

            for kind, value in classify_page(text):
                if kind == "Code":
                    notebook.cells.append(new_code_cell(value))
                elif kind == "Text":
                    # Double newlines so paragraphs render as markdown; strip
                    # stray '[[' / ']' artifacts left over from extraction.
                    value = value.replace("\n", "\n\n")
                    notebook.cells.append(
                        new_markdown_cell(value.replace('[[', '').replace(']', ''))
                    )
            print(f"Page No.{p+1} completed")

    file_path = file.split('.pdf')[0] + '.ipynb'
    # BUGFIX: the original opened `file_path + '.ipynb'`, writing the notebook
    # to "name.ipynb.ipynb" while returning "name.ipynb" — a path that never
    # existed, so the download component pointed at a missing file.
    with open(file_path, 'w', encoding="utf-8") as f:
        nbformat.write(notebook, f)
    print(f'{file_path} notebook created successfully.')
    return file_path


with gr.Blocks() as app:
    gr.Markdown("""# PDF to IPython Notebook Convertor App
## Upload your PDF document containing Python code and Text and press 'Process File' button to download the iPython Notebook.
### Adjust Top Crop and Bottom Crop values based on how much of top and bottom design content of your PDF document you want to eliminate.""")
    file_input = gr.File(label="Upload a PDF file")
    tc = gr.Slider(label='Top Crop in Pixels', value=25)
    bc = gr.Slider(label='Bottom Crop in pixels', value=25)
    download_button = gr.File(label="Download processed file")
    process_button = gr.Button("Process File")
    process_button.click(
        fn=create_notebook,
        inputs=[file_input, tc, bc],
        outputs=download_button,
    )

app.launch(debug=True)