Spaces:

marcelhuber
/

kusssbuddy

Sleeping

App Files Files Community

6QpgfMkKTVwUug committed on Jul 8, 2023

Commit

fd48f46

1 Parent(s): cbab223

Final Version

Browse files

Files changed (1) hide show

app.py +148 -16

app.py CHANGED Viewed

@@ -1,21 +1,153 @@
 import gradio as gr
-import PyPDF2
 import pandas as pd
-def process_file(file):
-    # Read the uploaded file
-    with open(file.name, "rb") as f:
-        # Process the file here (e.g., extract text or convert to DataFrame)
-        # Example:
-        pdf_reader = PyPDF2.PdfFileReader(f)
-        num_pages = pdf_reader.numPages
-        text = ""
-        for page_num in range(num_pages):
-            page = pdf_reader.getPage(page_num)
-            text += page.extractText()
-        # Return the processed output
-        return text
-iface = gr.Interface(fn=process_file, inputs="file", outputs="text")
-iface.launch()

 import gradio as gr
 import pandas as pd
+import tabula
+import PyPDF2
+import re
+import numpy as np
+import tempfile
+import os
+def pdf_processing(temp_file_path):
+    """Extract the grade tables from a PDF and summarise ECTS and grades.
+
+    For every page of the PDF, the second table that tabula detects on that
+    page is taken as the grade table.  The row tabula used as the header is
+    put back as a data row, the combined 'LVA' cell is split into course
+    title, study programme, course number and examiner, and the rows of all
+    pages are concatenated.  The result is written to ``sample.csv`` in the
+    current working directory.
+
+    Args:
+        temp_file_path: Path of the PDF file to read.
+
+    Returns:
+        A tuple ``(total_ects, weighted_mean, final_df, csv_save_path)``:
+        the sum of the 'ECTS' column, the ECTS-weighted mean grade rounded
+        to two decimals (NaN when the ECTS sum is zero), the processed
+        DataFrame, and the path of the CSV file that was written.
+
+    Raises:
+        ValueError: If a grade table does not have exactly six columns, or
+            if a 'Beurteilung' value is neither a known grade nor numeric.
+    """
+    base_columns = ['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']
+    grade_values = {
+        'sehr gut': 1,
+        'gut': 2,
+        'befriedigend': 3,
+        'genügend': 4,
+        'nicht genügend': 5,
+        'mit Erfolg teilgenommen': 0,
+    }
+    # Course number: three digits, a dot, three word characters.
+    lva_nr_regex = re.compile(r'(\d{3})\.[\w]{3}')
+    # Course title: everything before the first carriage return.
+    title_regex = re.compile(r'^(.*?)\r', re.DOTALL)
+
+    def extract_tables_from_pdf(temp_file_path):
+        """Return one list of tabula tables per page of the PDF."""
+        # The reader is only needed for the page count, so the handle is
+        # closed again before tabula opens the file itself.
+        with open(temp_file_path, 'rb') as file:
+            page_count = len(PyPDF2.PdfReader(file).pages)
+        return [
+            tabula.read_pdf(temp_file_path, pages=page_number, multiple_tables=True)
+            for page_number in range(1, page_count + 1)
+        ]
+
+    def extract_information(input_string):
+        """Split a combined 'LVA' cell into its four parts."""
+        title_match = title_regex.search(input_string)
+        lva = title_match.group(1).strip() if title_match else ''
+        # Continue right after the '\r' that ends the title (or at the
+        # start of the string when there is no title line).
+        rest_start = title_match.end() if title_match else 0
+        nr_match = lva_nr_regex.search(input_string)
+        if nr_match:
+            lva_nr = nr_match.group()
+            studium = input_string[rest_start:nr_match.start()]
+            pruefer = input_string[nr_match.end():]
+        else:
+            # Without a course number the cell cannot be split; keep the
+            # whole text in the examiner field, as before.
+            lva_nr = ''
+            studium = ''
+            pruefer = input_string
+        return {
+            'LVA': lva,
+            'Studium': studium.strip(),
+            'LVA Nr.': lva_nr,
+            'Prüfer\'in': pruefer.strip()
+        }
+
+    extracted_tables = extract_tables_from_pdf(temp_file_path)
+    # Start with an empty frame so the column order of the result is fixed
+    # by the base columns even when no page yields a table.
+    frames = [pd.DataFrame(columns=base_columns)]
+    for page_index, page_tables in enumerate(extracted_tables):
+        try:
+            data = page_tables[1]
+        except IndexError:
+            # This page has no second table.  Skip it instead of carrying
+            # on with missing or stale data from the previous page.
+            print(f'Page {page_index + 1}: no second table found, skipping.')
+            continue
+        data_df = pd.DataFrame(data).copy()
+        # tabula used the first row as column names; move it back into the
+        # data as row 0.
+        data_df.loc[-1] = data_df.columns
+        data_df.index = data_df.index + 1
+        data_df = data_df.sort_index()
+        data_df.columns = base_columns
+        # Parse every 'LVA' cell once and assign whole columns; element-wise
+        # chained assignment is not reliable in pandas.
+        cells = ['' if pd.isna(cell) else str(cell) for cell in data_df['LVA']]
+        parsed = [extract_information(cell) for cell in cells]
+        data_df['LVA'] = [info['LVA'] for info in parsed]
+        insert_index = 1
+        data_df.insert(insert_index, 'LVA-Nr.', [info['LVA Nr.'] for info in parsed])
+        data_df.insert(insert_index, 'Studium', [info['Studium'] for info in parsed])
+        data_df.insert(insert_index, 'Prüfer*in', [info['Prüfer\'in'] for info in parsed])
+        frames.append(data_df)
+    final_df = pd.concat(frames, ignore_index=True)
+    # Line breaks inside a grade would prevent the lookup below.
+    final_df['Beurteilung'] = final_df['Beurteilung'].str.replace('\r', ' ', regex=False)
+    final_df['ECTS'] = final_df['ECTS'].astype(float)
+    column_sum = final_df['ECTS'].sum()
+    # Known grade texts become numbers; anything else must already be
+    # numeric, otherwise the float conversion raises ValueError.
+    # NOTE(review): 'mit Erfolg teilgenommen' counts as 0 but its ECTS stay
+    # in the divisor, which lowers the mean -- confirm this is intended.
+    numerical_vector = np.array(
+        [grade_values.get(grade, grade) for grade in final_df['Beurteilung']],
+        dtype=float,
+    )
+    if column_sum:
+        wm = np.sum(final_df['ECTS'].to_numpy() * numerical_vector) / column_sum
+    else:
+        # No ECTS at all: the weighted mean is undefined.
+        wm = float('nan')
+    # Save the CSV file to the working directory
+    csv_save_path = os.path.join(os.getcwd(), "sample.csv")
+    final_df.to_csv(csv_save_path, index=False)
+    return column_sum, np.round(wm, 2), final_df, csv_save_path
+# Define the Gradio interface.
+# The components are taken from the top-level gradio namespace: the
+# gr.inputs / gr.outputs namespaces and the .style() method are deprecated.
+inputs = gr.Textbox(label="Enter the PDF file path:")
+outputs = [
+    gr.Textbox(label="Total ECTS"),
+    gr.Textbox(label="Weighted Mean"),
+    gr.Dataframe(type="pandas", label="Processed DataFrame"),
+    gr.File(),
+]
+# The four outputs correspond, in order, to the tuple returned by
+# pdf_processing.
+iface = gr.Interface(
+    fn=pdf_processing,
+    inputs=inputs,
+    outputs=outputs,
+    title="KUSSS Buddy",
+    )
+# Launch the Gradio interface
+iface.launch(share=True)