Spaces:
Sleeping
Sleeping
6QpgfMkKTVwUug
committed on
Commit
·
fd48f46
1
Parent(s):
cbab223
Final Version
Browse files
app.py
CHANGED
@@ -1,21 +1,153 @@
|
|
1 |
import gradio as gr
|
2 |
-
import PyPDF2
|
3 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
-
def
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
#
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
iface.launch()
|
|
|
1 |
import gradio as gr
|
|
|
2 |
import pandas as pd
|
3 |
+
import tabula
|
4 |
+
import PyPDF2
|
5 |
+
import re
|
6 |
+
import numpy as np
|
7 |
+
import tempfile
|
8 |
+
import os
|
9 |
+
|
10 |
+
# Column labels assigned to every table taken from the PDF, in order.
_TABLE_COLUMNS = ['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']

# Column order of the resulting DataFrame / CSV file.
_RESULT_COLUMNS = _TABLE_COLUMNS + ['Prüfer*in', 'Studium', 'LVA-Nr.']

# Pattern used to locate the course number ("LVA Nr."): three digits, a dot,
# then three word characters.
_LVA_NR_RE = re.compile(r'(\d{3})\.[\w]{3}')

# Numeric value of each grade that takes part in the weighted mean.
# 'mit Erfolg teilgenommen' is deliberately absent: it carries no numeric
# grade, so such rows are left out of the mean instead of counting as 0.
_GRADE_VALUES = {
    'sehr gut': 1,
    'gut': 2,
    'befriedigend': 3,
    'genügend': 4,
    'nicht genügend': 5,
}


def _extract_tables_from_pdf(temp_file_path):
    """Return one list of tabula DataFrames per page of the PDF.

    PyPDF2 is only used to count the pages; tabula is then asked for the
    tables of each page separately so the result stays grouped by page.
    """
    # Hand the open handle to PyPDF2 so the file is closed deterministically.
    with open(temp_file_path, 'rb') as file:
        page_count = len(PyPDF2.PdfReader(file).pages)

    # tabula page numbers are 1-based.
    return [
        tabula.read_pdf(temp_file_path, pages=page_number, multiple_tables=True)
        for page_number in range(1, page_count + 1)
    ]


def _extract_information(input_string):
    """Split a combined 'LVA' cell into its four parts.

    The cell is split as: text before the first carriage return (LVA), text
    between that and the course number (Studium), the course number itself
    (LVA Nr.) and everything after it (Prüfer'in). All parts are stripped of
    surrounding whitespace.

    Returns a dict with the keys 'LVA', 'Studium', 'LVA Nr.' and "Prüfer'in".
    Non-string input (e.g. NaN from an empty cell) yields empty strings. When
    no course number is found, the text after the LVA part is reported as
    'Studium' and "Prüfer'in" is left empty.
    """
    if not isinstance(input_string, str):
        return {'LVA': '', 'Studium': '', 'LVA Nr.': '', "Prüfer'in": ''}

    lva_match = re.search(r'^(.*?)\r', input_string, re.DOTALL)
    if lva_match:
        lva = lva_match.group(1).strip()
        # Continue right after the carriage return; using the length of the
        # stripped title would be off whenever whitespace was removed.
        rest_start = lva_match.end()
    else:
        lva = ''
        rest_start = 0

    nr_match = _LVA_NR_RE.search(input_string)
    if nr_match:
        lva_nr = nr_match.group()
        studium = input_string[rest_start:nr_match.start()].strip()
        pruefer = input_string[nr_match.end():].strip()
    else:
        lva_nr = ''
        studium = input_string[rest_start:].strip()
        pruefer = ''

    return {
        'LVA': lva,
        'Studium': studium,
        'LVA Nr.': lva_nr,
        "Prüfer'in": pruefer,
    }


def _rows_from_table(table):
    """Turn one extracted table into rows with the result columns.

    The labels tabula assigned to the table are re-inserted as the first data
    row (the table is treated as having no real header), the columns are
    renamed to the fixed six labels, and the combined 'LVA' cell is split
    into LVA / Prüfer*in / Studium / LVA-Nr.

    Raises ValueError (from pandas) when the table does not have exactly six
    columns.
    """
    first_row = list(table.columns)
    body = table.copy()  # leave the DataFrame returned by tabula untouched
    body.columns = _TABLE_COLUMNS
    rows = pd.concat(
        [pd.DataFrame([first_row], columns=_TABLE_COLUMNS), body],
        ignore_index=True,
    )

    # Assign whole columns at once instead of chained per-cell assignment,
    # which is not guaranteed to write through to the DataFrame.
    parsed = [_extract_information(cell) for cell in rows['LVA']]
    rows['LVA'] = [info['LVA'] for info in parsed]
    rows['Prüfer*in'] = [info["Prüfer'in"] for info in parsed]
    rows['Studium'] = [info['Studium'] for info in parsed]
    rows['LVA-Nr.'] = [info['LVA Nr.'] for info in parsed]
    return rows[_RESULT_COLUMNS]


def pdf_processing(temp_file_path):
    """Extract the course tables from a PDF and compute ECTS statistics.

    For every page, the second table found by tabula is taken (pages with
    fewer than two tables are skipped), cleaned up and appended to one
    DataFrame, which is also written to ``sample.csv`` in the current working
    directory.

    Args:
        temp_file_path: Path of the PDF file to read.

    Returns:
        A tuple ``(total_ects, weighted_mean, final_df, csv_save_path)``:
        the sum of the 'ECTS' column, the ECTS-weighted mean grade rounded to
        two decimals (NaN when no row has a numeric grade), the processed
        DataFrame, and the path of the written CSV file.

    Raises:
        FileNotFoundError: If ``temp_file_path`` does not exist.
        ValueError: If a selected table does not have six columns or an
            'ECTS' value cannot be converted to float.
    """
    page_frames = []
    for page_tables in _extract_tables_from_pdf(temp_file_path):
        # Only the second table of a page is used; skip pages without one
        # instead of re-processing the previous page's data.
        if len(page_tables) < 2:
            continue
        page_frames.append(_rows_from_table(page_tables[1]))

    if page_frames:
        final_df = pd.concat(page_frames, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=_RESULT_COLUMNS)

    # Remove \r from the grade column (object dtype keeps .str usable even
    # when the column holds no strings at all).
    final_df['Beurteilung'] = (
        final_df['Beurteilung'].astype(object).str.replace('\r', ' ', regex=False)
    )

    # NOTE(review): assumes ECTS values use '.' as decimal separator - confirm.
    final_df['ECTS'] = final_df['ECTS'].astype(float)
    column_sum = final_df['ECTS'].sum()

    # ECTS-weighted mean over the rows that carry a numeric grade. Rows with
    # any other assessment text do not contribute to numerator or denominator.
    grades = final_df['Beurteilung'].str.strip().map(_GRADE_VALUES)
    graded = grades.notna()
    graded_ects = final_df.loc[graded, 'ECTS'].sum()
    if graded_ects > 0:
        wm = (final_df.loc[graded, 'ECTS'] * grades[graded]).sum() / graded_ects
    else:
        wm = float('nan')

    # Save the CSV file to the working directory
    csv_save_path = os.path.join(os.getcwd(), "sample.csv")
    final_df.to_csv(csv_save_path, index=False)

    return column_sum, np.round(wm, 2), final_df, csv_save_path
|
134 |
+
|
135 |
+
# Define the Gradio interface.
# The top-level component classes are used here: the gr.inputs / gr.outputs
# namespaces and the .style() method are deprecated in Gradio 3.x and no
# longer exist in Gradio 4.x.
inputs = gr.Textbox(label="Enter the PDF file path:")

# One output component per element of the tuple returned by pdf_processing.
outputs = [
    gr.Textbox(label="Total ECTS"),
    gr.Textbox(label="Weighted Mean"),
    gr.Dataframe(type="pandas", label="Processed DataFrame"),
    gr.File(),
]

iface = gr.Interface(
    fn=pdf_processing,
    inputs=inputs,
    outputs=outputs,
    title="KUSSS Buddy",
)

# Launch the Gradio interface
iface.launch(share=True)
|
153 |
|
|