# Hugging Face Spaces app: PDF to IPython Notebook converter
# (scraped page-status header removed; it was not valid Python)
import ast
import os
import re

import google.generativeai as genai
import gradio as gr
import nbformat
import pdfplumber
from nbformat.v4 import new_code_cell, new_markdown_cell, new_notebook
def classify_page(statement):
    """Classify one page of extracted PDF text into "Code"/"Text" blocks.

    Sends *statement* to the Gemini model with a one-shot example and asks
    for a sequence of ("Code"|"Text", substring) tuples.

    Parameters
    ----------
    statement : str
        Raw text extracted from a single PDF page.

    Returns
    -------
    list[tuple[str, str]]
        Pairs of (kind, block) where kind is "Code" or "Text". Falls back
        to ``[("Text", statement)]`` when the model reply cannot be parsed
        as a Python literal, so notebook generation can continue.

    Raises
    ------
    RuntimeError
        If the GOOGLE_API_KEY environment variable is not set.
    """
    # SECURITY FIX: the API key was hard-coded in source; credentials must
    # come from the environment instead.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError("Set the GOOGLE_API_KEY environment variable.")
    genai.configure(api_key=api_key)

    # temperature 0 so the classification is deterministic/reproducible.
    generation_config = {
        "temperature": 0,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash-002",
        generation_config=generation_config,
    )
    chat_session = model.start_chat(history=[])

    prompt = f"""
Group the following "Input" strings as substring blocks of "Code" or "Text".
The response content shall be strictly just a sequence of Python touples where the first element of each touple either "Code" or "Text" and the second elemnt is the coressponding grouped substring block.
Input:
# Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.
The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.
The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.
# First, we start with the loading the required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import requests
Then we access the website link, read the web page content and do some pre-processing.
fig, ax = plt.subplots()
ax.get_yaxis().get_major_formatter().set_scientific(False)
# Create a twin Axes object that shares the x-axis
ax2 = ax.twinx()
# Plot the new cumulative cases time-series in green
plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
"green", "Date" , "Cumulative no. confirmed of cases")
# Plot the new cumulative deaths data in green
plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
"orange", "Date" , "Cumulative no. of deaths")
# Plot the new daily cases time-series in blue
plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")
response_content:
[("Text", # Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.
The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.
The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.),
("Code", # First, we start with the loading the required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import requests),
("Text", Then we access the website link, read the web page content and do some pre-processing.),
("Code", fig, ax = plt.subplots()
ax.get_yaxis().get_major_formatter().set_scientific(False)
# Create a twin Axes object that shares the x-axis
ax2 = ax.twinx()
# Plot the new cumulative cases time-series in green
plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
"green", "Date" , "Cumulative no. confirmed of cases")
# Plot the new cumulative deaths data in green
plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
"orange", "Date" , "Cumulative no. of deaths")
# Plot the new daily cases time-series in blue
plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")]
Now, classify this string:
Input: {statement}
"""
    response = chat_session.send_message(prompt)
    # Strip Markdown code fences the model tends to wrap its answer in.
    raw = response.text.replace("```python\n", "").replace("```", "").strip()
    # Remove non-printable/non-ASCII characters that break literal_eval.
    raw = re.sub(r"[^\x20-\x7E]", "", raw)
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # ROBUSTNESS FIX: the model reply is not always a valid Python
        # literal; keep the whole page as Markdown rather than crashing
        # the entire conversion.
        return [("Text", statement)]
def create_notebook(file, tc, bc):
    """Convert a PDF containing mixed prose and Python code into a notebook.

    Parameters
    ----------
    file : str
        Path of the uploaded PDF file.
    tc : float
        Header height in pixels to crop from the top of each page.
    bc : float
        Footer height in pixels to crop from the bottom of each page.

    Returns
    -------
    str
        Path of the written ``.ipynb`` file.
    """
    notebook = new_notebook()
    with pdfplumber.open(file) as pdf:
        for p, page in enumerate(pdf.pages):
            width, height = page.width, page.height
            # Crop away header/footer decoration before extracting text.
            crop_box = (0, tc, width, height - bc)
            cropped_page = page.within_bbox(crop_box)
            text = cropped_page.extract_text()
            if not text:
                continue  # skip blank pages
            for kind, value in classify_page(text):
                if kind == "Code":
                    notebook.cells.append(new_code_cell(value))
                elif kind == "Text":
                    # Double the newlines so single line breaks become
                    # separate Markdown paragraphs.
                    value = value.replace("\n", "\n\n")
                    notebook.cells.append(
                        new_markdown_cell(value.replace('[[', '').replace(']', ''))
                    )
            print(f"Page No.{p+1} completed")
    file_path = file.split('.pdf')[0] + '.ipynb'
    # BUG FIX: the original opened `file_path + '.ipynb'`, writing the
    # notebook to "X.ipynb.ipynb" while returning "X.ipynb", so the path
    # handed back to Gradio never existed.
    with open(file_path, 'w', encoding="utf-8") as f:
        nbformat.write(notebook, f)
    print(f'{file_path} notebook created successfully.')
    return file_path
# --- Gradio UI: upload a PDF, tune crop margins, download the notebook ---
with gr.Blocks() as app:
    gr.Markdown(
        """# PDF to IPython Notebook Convertor App
## Upload your PDF document containing Python code and Text and press 'Process File' button to download the iPython Notebook.
### Adjust Top Crop and Bottom Crop values based on how much of top and bottom design content of your PDF document you want to eliminate."""
    )
    pdf_upload = gr.File(label="Upload a PDF file")
    top_crop_px = gr.Slider(label='Top Crop in Pixels', value=25)
    bottom_crop_px = gr.Slider(label='Bottom Crop in pixels', value=25)
    notebook_download = gr.File(label="Download processed file")
    run_button = gr.Button("Process File")
    # Wire the button to the conversion routine; its return value (the
    # notebook path) populates the download component.
    run_button.click(
        fn=create_notebook,
        inputs=[pdf_upload, top_crop_px, bottom_crop_px],
        outputs=notebook_download,
    )

app.launch(debug=True)