import gradio as gr
import ast
from nbformat.v4 import (new_notebook, new_markdown_cell, new_code_cell)
import pdfplumber
import google.generativeai as genai
import nbformat
import re
import os
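
# classify_page: ask Gemini to label a page's extracted text as a sequence of
# ("Code", ...) / ("Text", ...) blocks that can later be turned into notebook cells.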
def classify_page(statement):
    # Gemini configuration; the API key is read from a GEMINI_API_KEY environment
    # variable (assumed to be set) rather than being hard-coded in the source.
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])

    # Create the model
    generation_config = {
        "temperature": 0,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash-002",
        generation_config=generation_config,
    )
    chat_session = model.start_chat(history=[])
    prompt = f"""
Group the following "Input" string into substring blocks of "Code" or "Text".
The response content shall be strictly just a sequence of Python tuples where the first element of each tuple is either "Code" or "Text" and the second element is the corresponding grouped substring block.
Input:
# Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.
The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.
The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.
# First, we start with the loading the required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import requests
Then we access the website link, read the web page content and do some pre-processing.
fig, ax = plt.subplots()
ax.get_yaxis().get_major_formatter().set_scientific(False)
# Create a twin Axes object that shares the x-axis
ax2 = ax.twinx()
# Plot the new cumulative cases time-series in green
plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
"green", "Date" , "Cumulative no. confirmed of cases")
# Plot the new cumulative deaths data in green
plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
"orange", "Date" , "Cumulative no. of deaths")
# Plot the new daily cases time-series in blue
plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")
response_content:
[("Text", # Summary Report - Visual Exploration of Data on Covid-19 Outbreak, Worldwide.
The data was scraped from 'worldometer' web site at https://www.worldometers.info/coronavirus/ and the analysis was carried out using 'Python' programming language and various related libraries.
The worldometer web site provides the data more on cumulative basis and therefore, this report and effort also include the process of gathering daily data.),
("Code", # First, we start with the loading the required packages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import requests),
("Text", Then we access the website link, read the web page content and do some pre-processing.),
("Code", fig, ax = plt.subplots()
ax.get_yaxis().get_major_formatter().set_scientific(False)
# Create a twin Axes object that shares the x-axis
ax2 = ax.twinx()
# Plot the new cumulative cases time-series in green
plot_timeseries(ax, daily_data2.index, daily_data2['NewCases']+MissedOut_NewCases,
"green", "Date" , "Cumulative no. confirmed of cases")
# Plot the new cumulative deaths data in green
plot_timeseries(ax2, daily_data2.index, daily_data2['NewDeaths']+MissedOut_NewDeaths,
"orange", "Date" , "Cumulative no. of deaths")
# Plot the new daily cases time-series in blue
plot_timeseries(ax, daily_data1.index, daily_data1['NewCases'], "blue", "Date" , "Confirmed cases")]
Now, classify this string:
Input: {statement}
"""
    response = chat_session.send_message(prompt)
    print(response.text)
    # Strip any Markdown code-fence wrapping and non-printable characters,
    # then parse the model's output as a sequence of Python tuples.
    response = response.text.replace("```python\n", "").replace("```", "").strip()
    response = re.sub(r"[^\x20-\x7E]", "", response)
    print(response)
    return ast.literal_eval(response)
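
# create_notebook: walk the PDF page by page, classify each page's text, and
# build an .ipynb file with matching code and markdown cells.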
def create_notebook(file, tc, bc):
    notebook = new_notebook()
    with pdfplumber.open(file) as pdf:
        for p, page in enumerate(pdf.pages):
            # Crop off the header and footer, then extract the page text
            width, height = page.width, page.height
            top_crop = tc  # Height of the header to exclude
            bottom_crop = bc  # Height of the footer to exclude
            crop_box = (0, top_crop, width, height - bottom_crop)
            cropped_page = page.within_bbox(crop_box)
            text = cropped_page.extract_text()
            if not text:
                continue

            # Classify the page text into ("Code", ...) / ("Text", ...) blocks
            blocks = classify_page(text)
            for c, value in blocks:
                if c == "Code":
                    notebook.cells.append(new_code_cell(value))
                elif c == "Text":
                    value = value.replace("\n", "\n\n")
                    notebook.cells.append(new_markdown_cell(value.replace('[[', '').replace(']', '')))
            print(f"Page No.{p+1} completed")

    # Write the notebook in UTF-8 encoding and return its path for download
    file_path = file.split('.pdf')[0] + '.ipynb'
    with open(file_path, 'w', encoding="utf-8") as f:
        nbformat.write(notebook, f)
    print(f'{file_path} notebook created successfully.')
    return file_path
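
# Gradio UI: upload a PDF, set the crop margins, and download the generated notebook.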
with gr.Blocks() as app:
    gr.Markdown("""# PDF to IPython Notebook Converter App
## Upload a PDF document containing Python code and text, then press the 'Process File' button to download the IPython notebook.
### Adjust the Top Crop and Bottom Crop values based on how much of the top and bottom design content of your PDF document you want to eliminate.""")
    file_input = gr.File(label="Upload a PDF file")
    tc = gr.Slider(label='Top Crop in pixels', value=25)
    bc = gr.Slider(label='Bottom Crop in pixels', value=25)
    download_button = gr.File(label="Download processed file")
    process_button = gr.Button("Process File")
    process_button.click(
        fn=create_notebook,
        inputs=[file_input, tc, bc],
        outputs=download_button,
    )
app.launch(debug=True)
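
# A minimal local-run sketch (dependencies inferred from the imports above, not pinned here):
#   pip install gradio pdfplumber nbformat google-generativeai
#   export GEMINI_API_KEY=...   # assumed environment variable used by classify_page
#   python app.py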