Spaces:
Sleeping
Sleeping
6QpgfMkKTVwUug
committed on
Commit
·
a49df92
1
Parent(s):
fd48f46
Final Version
Browse files
app.py
CHANGED
@@ -1,18 +1,19 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import tabula
|
4 |
-
import
|
5 |
import re
|
6 |
import numpy as np
|
7 |
-
import tempfile
|
8 |
import os
|
|
|
|
|
9 |
|
10 |
-
def pdf_processing(
|
11 |
-
def extract_tables_from_pdf(
|
12 |
# Open the PDF file in read-binary mode
|
13 |
-
with open(
|
14 |
# Create a PDF reader object
|
15 |
-
pdf_reader =
|
16 |
|
17 |
# Initialize a list to store the extracted tables
|
18 |
tables = []
|
@@ -20,7 +21,7 @@ def pdf_processing(temp_file_path):
|
|
20 |
# Iterate over each page in the PDF
|
21 |
for page_number in range(len(pdf_reader.pages)):
|
22 |
# Extract the page as a DataFrame using tabula-py
|
23 |
-
df = tabula.read_pdf(
|
24 |
|
25 |
# Append the extracted DataFrame to the tables list
|
26 |
tables.append(df)
|
@@ -62,7 +63,7 @@ def pdf_processing(temp_file_path):
|
|
62 |
}
|
63 |
|
64 |
# Example usage
|
65 |
-
extracted_tables = extract_tables_from_pdf(
|
66 |
|
67 |
# Create a new DataFrame
|
68 |
new_df = pd.DataFrame(columns=['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung'])
|
@@ -133,7 +134,8 @@ def pdf_processing(temp_file_path):
|
|
133 |
return column_sum, np.round(wm, 2), final_df, csv_save_path
|
134 |
|
135 |
# Define the Gradio interface
|
136 |
-
inputs = gr.inputs.
|
|
|
137 |
|
138 |
outputs = [
|
139 |
gr.outputs.Textbox(label="Total ECTS"),
|
@@ -149,5 +151,5 @@ iface = gr.Interface(
|
|
149 |
)
|
150 |
|
151 |
# Launch the Gradio interface
|
152 |
-
iface.launch(share=
|
153 |
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import tabula
|
4 |
+
from PyPDF2 import PdfReader
|
5 |
import re
|
6 |
import numpy as np
|
|
|
7 |
import os
|
8 |
+
import tempfile
|
9 |
+
import shutil
|
10 |
|
11 |
+
def pdf_processing(pdf_file):
|
12 |
+
def extract_tables_from_pdf(pdf_file):
|
13 |
# Open the PDF file in read-binary mode
|
14 |
+
with open(pdf_file.name, 'rb') as file:
|
15 |
# Create a PDF reader object
|
16 |
+
pdf_reader = PdfReader(file)
|
17 |
|
18 |
# Initialize a list to store the extracted tables
|
19 |
tables = []
|
|
|
21 |
# Iterate over each page in the PDF
|
22 |
for page_number in range(len(pdf_reader.pages)):
|
23 |
# Extract the page as a DataFrame using tabula-py
|
24 |
+
df = tabula.read_pdf(pdf_file.name, pages=page_number+1, multiple_tables=True)
|
25 |
|
26 |
# Append the extracted DataFrame to the tables list
|
27 |
tables.append(df)
|
|
|
63 |
}
|
64 |
|
65 |
# Example usage
|
66 |
+
extracted_tables = extract_tables_from_pdf(pdf_file)
|
67 |
|
68 |
# Create a new DataFrame
|
69 |
new_df = pd.DataFrame(columns=['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung'])
|
|
|
134 |
return column_sum, np.round(wm, 2), final_df, csv_save_path
|
135 |
|
136 |
# Define the Gradio interface
|
137 |
+
inputs = gr.inputs.File(label="Select PDF file", type="file")
|
138 |
+
#inputs = gr.inputs.Textbox(label="Enter the PDF file path:")
|
139 |
|
140 |
outputs = [
|
141 |
gr.outputs.Textbox(label="Total ECTS"),
|
|
|
151 |
)
|
152 |
|
153 |
# Launch the Gradio interface
|
154 |
+
iface.launch(share=False)
|
155 |
|