Spaces:
Sleeping
Sleeping
File size: 5,249 Bytes
d223d6c cbab223 fd48f46 a49df92 fd48f46 a49df92 fd48f46 a49df92 fd48f46 a49df92 fd48f46 a49df92 fd48f46 a49df92 fd48f46 d223d6c fd48f46 cbab223 fd48f46 a49df92 fd48f46 a49df92 fd48f46 a49df92 d223d6c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
import gradio as gr
import pandas as pd
import tabula
from PyPDF2 import PdfReader
import re
import numpy as np
import os
import tempfile
import shutil
# Headers assigned to every extracted table, in the table's own column order.
_TABLE_COLUMNS = ['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']

# Columns derived from the combined 'LVA' cell; they follow the table columns
# in the final DataFrame.
_DERIVED_COLUMNS = ['Prüfer*in', 'Studium', 'LVA-Nr.']

# Numeric value used for each textual grade when computing the weighted mean.
# NOTE(review): 'mit Erfolg teilgenommen' maps to 0 but its ECTS still count
# in the divisor, which lowers the mean -- confirm this is intended.
_GRADE_VALUES = {
    'sehr gut': 1,
    'gut': 2,
    'befriedigend': 3,
    'genügend': 4,
    'nicht genügend': 5,
    'mit Erfolg teilgenommen': 0,
}

# Three digits, a dot, then three word characters (the course number).
_LVA_NR_PATTERN = re.compile(r'(\d{3})\.[\w]{3}')

# Everything before the first carriage return (the course title).
_LVA_PATTERN = re.compile(r'^(.*?)\r', re.DOTALL)


def _extract_tables_from_pdf(pdf_file):
    """Run tabula once per page and return the per-page results as a list.

    Each element is whatever ``tabula.read_pdf(..., multiple_tables=True)``
    returns for that page (indexed like a list of tables by the caller).
    """
    # PyPDF2 is only used to learn how many pages there are.
    with open(pdf_file.name, 'rb') as file:
        page_count = len(PdfReader(file).pages)
    # tabula page numbers are 1-based.
    return [
        tabula.read_pdf(pdf_file.name, pages=page_number, multiple_tables=True)
        for page_number in range(1, page_count + 1)
    ]


def _extract_substring(input_string):
    """Return the first course-number match in *input_string*, or ''."""
    match = _LVA_NR_PATTERN.search(input_string)
    return match.group() if match else ''


def _extract_information(input_string):
    """Split a combined 'LVA' cell into title, study, course number, examiner.

    The title is the text before the first ``\\r``; the course number is the
    first ``ddd.xxx`` match; the study is the text between the two; the
    examiner is everything after the course number.
    """
    title_match = _LVA_PATTERN.search(input_string)
    lva = title_match.group(1).strip() if title_match else ''

    lva_nr = _extract_substring(input_string)

    # str.index('') is 0, so a cell without a course number yields an empty
    # study and the whole cell as examiner instead of raising.
    nr_start = input_string.index(lva_nr)
    # [1:] drops the separator character that follows the title.
    studium = input_string[len(lva):nr_start][1:]
    pruefer = input_string[nr_start + len(lva_nr):]

    return {
        'LVA': lva,
        'Studium': studium,
        'LVA Nr.': lva_nr,
        'Prüfer\'in': pruefer,
    }


def _prepare_page_table(data):
    """Turn one raw table into a frame with proper headers and split LVA cell."""
    # Work on a copy so the raw table handed in is not mutated.
    data_df = pd.DataFrame(data).copy()

    # The parsed header line is really the first data row: push it back into
    # the body as row 0 before assigning the real headers.
    data_df.loc[-1] = data_df.columns
    data_df.index = data_df.index + 1
    data_df.sort_index(inplace=True)
    data_df.columns = _TABLE_COLUMNS

    # Whole-column assignment instead of chained ``df[col][i] = ...`` writes,
    # which do not reach the frame under pandas Copy-on-Write.
    parsed = [_extract_information(cell) for cell in data_df['LVA']]
    data_df['LVA'] = [info['LVA'] for info in parsed]
    data_df['Prüfer*in'] = [info['Prüfer\'in'] for info in parsed]
    data_df['Studium'] = [info['Studium'] for info in parsed]
    data_df['LVA-Nr.'] = [info['LVA Nr.'] for info in parsed]
    return data_df


def pdf_processing(pdf_file):
    """Extract the grade tables from a PDF and summarise them.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF
            (presumably the temp-file wrapper Gradio passes in -- confirm).

    Returns:
        A tuple ``(total_ects, weighted_mean, final_df, csv_path)``: the sum
        of the 'ECTS' column, the ECTS-weighted grade mean rounded to two
        decimals, the combined DataFrame, and the path of the CSV written to
        the current working directory.

    Raises:
        ValueError: If no page of the PDF contains a second table.
    """
    page_frames = []
    for page_tables in _extract_tables_from_pdf(pdf_file):
        try:
            # The table of interest is the second one on each page.
            data = page_tables[1]
        except IndexError:
            # Page has fewer than two tables: skip it rather than falling
            # through with an unbound or stale `data`.
            continue
        page_frames.append(_prepare_page_table(data))

    if not page_frames:
        raise ValueError('No grade table found in the PDF.')

    # Single concat instead of growing a frame inside the loop; the explicit
    # column list keeps table columns first, derived columns last.
    final_df = pd.concat(page_frames, ignore_index=True)
    final_df = final_df[_TABLE_COLUMNS + _DERIVED_COLUMNS]

    # Multi-line grades come with embedded carriage returns.
    final_df['Beurteilung'] = final_df['Beurteilung'].str.replace('\r', ' ')
    final_df['ECTS'] = final_df['ECTS'].astype(float)
    column_sum = final_df['ECTS'].sum()

    # Grades without a mapping are passed through unchanged.
    numerical_vector = [
        _GRADE_VALUES.get(grade, grade) for grade in final_df['Beurteilung']
    ]
    wm = np.sum(np.array(final_df['ECTS']) * numerical_vector) / column_sum

    # Save the CSV file to the working directory
    csv_save_path = os.path.join(os.getcwd(), "sample.csv")
    final_df.to_csv(csv_save_path, index=False)

    return column_sum, np.round(wm, 2), final_df, csv_save_path
# Gradio wiring: one PDF upload as input, four result components as output.
inputs = gr.inputs.File(type="file", label="Select PDF file")

# Output components, in the same order as the values pdf_processing returns.
total_ects_box = gr.outputs.Textbox(label="Total ECTS")
weighted_mean_box = gr.outputs.Textbox(label="Weighted Mean")
result_table = gr.outputs.Dataframe(label="Processed DataFrame", type="pandas").style(full_width=True)
csv_download = gr.outputs.File()
outputs = [total_ects_box, weighted_mean_box, result_table, csv_download]

iface = gr.Interface(
    title="KUSSS Buddy",
    fn=pdf_processing,
    inputs=inputs,
    outputs=outputs,
)

# Launch the interface with sharing disabled.
iface.launch(share=False)
|