# Hugging Face Space: marcelhuber/kusssbuddy (file size: 5,249 bytes)

import gradio as gr
import pandas as pd
import tabula
from PyPDF2 import PdfReader
import re
import numpy as np
import os
import tempfile
import shutil

# Names assigned to the six columns of every extracted table.
_TABLE_HEADER = ['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']

# Column order of the result frame: the six table columns followed by the
# three fields split out of the 'LVA' cell.  This is the order the previous
# incremental pd.concat produced, so the CSV layout stays the same.
_RESULT_COLUMNS = _TABLE_HEADER + ['Prüfer*in', 'Studium', 'LVA-Nr.']

# Course number: three digits, a dot, three word characters.
_LVA_NR_RE = re.compile(r'(\d{3})\.[\w]{3}')

# Numeric value of each graded result.  'mit Erfolg teilgenommen' is
# deliberately absent: it carries no numeric grade and must not enter the
# weighted mean (neither numerator nor denominator).
_GRADE_VALUES = {
    'sehr gut': 1,
    'gut': 2,
    'befriedigend': 3,
    'genügend': 4,
    'nicht genügend': 5,
}


def _read_tables_per_page(pdf_path):
    """Return one list of tabula tables per page of the PDF at *pdf_path*."""
    # PyPDF2 is only used to learn the page count; tabula does the parsing.
    with open(pdf_path, 'rb') as handle:
        page_count = len(PdfReader(handle).pages)
    return [
        tabula.read_pdf(pdf_path, pages=page_number, multiple_tables=True)
        for page_number in range(1, page_count + 1)
    ]


def _split_course_cell(cell):
    """Split a raw 'LVA' cell into (lva, studium, lva_nr, pruefer).

    The cell is expected to look like ``<name>\\r<studium>\\r<nr>\\r<examiner>``.
    Non-string cells (e.g. NaN) yield four empty strings.
    """
    text = cell if isinstance(cell, str) else ''

    # Course name: everything before the first carriage return ('' if none).
    first_break = text.find('\r')
    lva = text[:first_break].strip() if first_break != -1 else ''

    number = _LVA_NR_RE.search(text)
    if number is None:
        # No course number found: keep the historical fallback of putting
        # the whole cell text into the examiner field.
        return lva, '', '', text

    # Slice by the real match positions instead of the stripped name length,
    # so surrounding whitespace or a missing '\r' cannot shift the offsets.
    studium = text[first_break + 1:number.start()].strip()
    pruefer = text[number.end():].strip()
    return lva, studium, number.group(), pruefer


def _build_course_frame(table):
    """Turn one raw tabula table into a frame with ``_RESULT_COLUMNS``.

    tabula consumed the first table row as column labels, so those labels are
    put back as row 0 before the real header is applied.  Raises ValueError
    (from pandas) if the table does not have exactly six columns.
    """
    rows = [list(table.columns)] + table.values.tolist()
    frame = pd.DataFrame(rows, columns=_TABLE_HEADER)

    parts = [_split_course_cell(cell) for cell in frame['LVA']]
    # Whole-column assignment: element-wise chained indexing
    # (frame['LVA'][i] = ...) is not guaranteed to write back to the frame.
    frame['LVA'] = [part[0] for part in parts]
    frame['Studium'] = [part[1] for part in parts]
    frame['LVA-Nr.'] = [part[2] for part in parts]
    frame['Prüfer*in'] = [part[3] for part in parts]
    return frame[_RESULT_COLUMNS]


def pdf_processing(pdf_file):
    """Extract the course table from a PDF and compute ECTS statistics.

    On every page the *second* table found by tabula is taken as the course
    table; pages with fewer than two tables are skipped.

    Args:
        pdf_file: Uploaded file object exposing the path as ``.name``, or a
            plain path string.

    Returns:
        A 4-tuple ``(total_ects, weighted_mean, final_df, csv_path)``:
        the sum of the 'ECTS' column, the ECTS-weighted mean grade rounded to
        two decimals (NaN if no graded course exists), the combined
        DataFrame, and the path of the CSV written to the working directory.

    Raises:
        ValueError: If no file was provided, a table does not have six
            columns, or an 'ECTS' value cannot be converted to float.
    """
    if pdf_file is None:
        raise ValueError('No PDF file was provided.')
    pdf_path = getattr(pdf_file, 'name', pdf_file)

    frames = []
    for page_tables in _read_tables_per_page(pdf_path):
        # Skip pages without a second table instead of re-using the previous
        # page's data (which duplicated rows) or failing on the first page.
        if len(page_tables) < 2:
            continue
        frames.append(_build_course_frame(page_tables[1]))

    if frames:
        final_df = pd.concat(frames, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=_RESULT_COLUMNS)

    # Line breaks inside a grade ("nicht\rgenügend") become spaces.
    final_df['Beurteilung'] = final_df['Beurteilung'].str.replace(
        '\r', ' ', regex=False
    )
    final_df['ECTS'] = final_df['ECTS'].astype(float)

    # Total ECTS counts every row, graded or not.
    column_sum = final_df['ECTS'].sum()

    # Weighted mean over graded rows only: rows whose result has no numeric
    # value map to NaN and are left out of numerator and denominator.
    grades = final_df['Beurteilung'].str.strip().map(_GRADE_VALUES)
    graded = grades.notna()
    graded_ects = final_df.loc[graded, 'ECTS'].sum()
    if graded_ects > 0:
        weighted = (final_df.loc[graded, 'ECTS'] * grades[graded]).sum()
        wm = weighted / graded_ects
    else:
        wm = np.nan

    # NOTE(review): a fixed file name in the working directory is shared by
    # all requests; concurrent users would overwrite each other's CSV.
    csv_save_path = os.path.join(os.getcwd(), "sample.csv")
    final_df.to_csv(csv_save_path, index=False)

    return column_sum, np.round(wm, 2), final_df, csv_save_path

# Gradio UI wiring: a single PDF upload is passed to pdf_processing, whose
# four return values are rendered by the four output components below.
# Alternative input kept for reference:
#   gr.inputs.Textbox(label="Enter the PDF file path:")
inputs = gr.inputs.File(type="file", label="Select PDF file")

outputs = [
    # Sum of the ECTS column.
    gr.outputs.Textbox(label="Total ECTS"),
    # ECTS-weighted mean grade.
    gr.outputs.Textbox(label="Weighted Mean"),
    # The combined table, shown at full width.
    gr.outputs.Dataframe(label="Processed DataFrame", type="pandas").style(full_width=True),
    # Download link for the generated CSV.
    gr.outputs.File(),
]

iface = gr.Interface(title="KUSSS Buddy", fn=pdf_processing, inputs=inputs, outputs=outputs)

# Start the app without creating a public share link.
iface.launch(share=False)