6QpgfMkKTVwUug committed on
Commit
fd48f46
·
1 Parent(s): cbab223

Final Version

Browse files
Files changed (1) hide show
  1. app.py +148 -16
app.py CHANGED
@@ -1,21 +1,153 @@
1
  import gradio as gr
2
- import PyPDF2
3
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
def process_file(file):
    """Return the text of every page of an uploaded PDF, concatenated.

    Args:
        file: Upload object whose ``name`` attribute is the path of the PDF
            on disk.

    Returns:
        The extracted text of all pages, joined in page order.
    """
    with open(file.name, "rb") as f:
        # PdfReader / pages / extract_text are the current PyPDF2 names; the
        # older PdfFileReader / numPages / getPage / extractText spellings
        # were removed in PyPDF2 3.0.
        pdf_reader = PyPDF2.PdfReader(f)
        # Extraction has to finish while the file is still open, and join()
        # avoids rebuilding the string once per page.
        return "".join(page.extract_text() for page in pdf_reader.pages)
 
 
 
 
 
 
 
 
18
 
19
# Expose process_file through a Gradio interface: one file upload in,
# one text field out.
iface = gr.Interface(
    fn=process_file,
    inputs="file",
    outputs="text",
)

# Start serving the interface.
iface.launch()
 
1
  import gradio as gr
 
2
  import pandas as pd
3
+ import tabula
4
+ import PyPDF2
5
+ import re
6
+ import numpy as np
7
+ import tempfile
8
+ import os
9
+
10
# Column labels assigned to the table read from each page.
_TRANSCRIPT_COLUMNS = ['LVA', 'Typ', 'SSt.', 'ECTS', 'Datum', 'Beurteilung']

# Column order of the returned frame and of the exported CSV.
_OUTPUT_COLUMNS = _TRANSCRIPT_COLUMNS + ['Prüfer*in', 'Studium', 'LVA-Nr.']

# Three digits, a dot and three word characters, e.g. "123.456".
_LVA_NUMBER_RE = re.compile(r'(\d{3})\.[\w]{3}')

# Numeric value of each grade that enters the weighted mean.  Any other
# assessment text (for example 'mit Erfolg teilgenommen') carries no grade
# and is left out of the mean altogether.
_GRADE_VALUES = {
    'sehr gut': 1,
    'gut': 2,
    'befriedigend': 3,
    'genügend': 4,
    'nicht genügend': 5,
}

# Position, among the tables tabula finds on a page, of the table that is
# processed.  NOTE(review): presumably the first table on each page is a
# page header and the second one holds the courses - confirm against a PDF.
_COURSE_TABLE_INDEX = 1


def _read_page_tables(pdf_path):
    """Return, for every page of the PDF, the list of tables tabula finds on it."""
    with open(pdf_path, 'rb') as handle:
        # PyPDF2 is only needed for the page count; tabula reads the tables.
        page_count = len(PyPDF2.PdfReader(handle).pages)
    return [
        tabula.read_pdf(pdf_path, pages=page_number, multiple_tables=True)
        for page_number in range(1, page_count + 1)
    ]


def _split_lva_cell(cell):
    """Split one raw 'LVA' cell into title, study programme, number and examiner."""
    # Non-string cells (e.g. NaN for an empty cell) yield four empty fields.
    text = cell if isinstance(cell, str) else ''

    # The title is whatever precedes the first line break ('\r'); without
    # a line break there is no title.
    first_break = text.find('\r')
    title = text[:first_break].strip() if first_break >= 0 else ''
    after_title = first_break + 1  # 0 when the cell has no line break

    number_match = _LVA_NUMBER_RE.search(text)
    if number_match is None:
        # Without a course number the rest cannot be split any further.
        return {
            'LVA': title,
            'Studium': text[after_title:].strip(),
            'LVA Nr.': '',
            'Prüfer\'in': '',
        }

    return {
        'LVA': title,
        # Empty when the first line break only comes after the number.
        'Studium': text[after_title:number_match.start()].strip(),
        'LVA Nr.': number_match.group(),
        'Prüfer\'in': text[number_match.end():].strip(),
    }


def _normalise_page_table(table):
    """Return the page table with its labels restored as row 0 and the 'LVA' cell split."""
    # The incoming column labels are treated as data: they become the first
    # row, and the fixed transcript labels take their place.
    first_row = pd.DataFrame([list(table.columns)], columns=_TRANSCRIPT_COLUMNS)
    remaining = table.copy()
    remaining.columns = _TRANSCRIPT_COLUMNS
    frame = pd.concat([first_row, remaining], ignore_index=True)

    # Whole-column assignment instead of element-wise chained indexing, which
    # pandas does not guarantee to write through to the frame.
    parts = [_split_lva_cell(cell) for cell in frame['LVA']]
    frame['LVA'] = [part['LVA'] for part in parts]
    # Each insert goes to position 1, so the last one inserted ends up first.
    frame.insert(1, 'LVA-Nr.', [part['LVA Nr.'] for part in parts])
    frame.insert(1, 'Studium', [part['Studium'] for part in parts])
    frame.insert(1, 'Prüfer*in', [part['Prüfer\'in'] for part in parts])
    return frame


def _weighted_grade_mean(ects, grades):
    """Return the ECTS-weighted mean of the graded rows, or NaN if no row is graded."""
    values = grades.str.strip().map(_GRADE_VALUES)
    graded = values.notna()
    graded_ects = ects[graded].sum()
    if not graded_ects:
        return float('nan')
    return (ects[graded] * values[graded]).sum() / graded_ects


def pdf_processing(temp_file_path):
    """Extract the course tables of a PDF and summarise ECTS and grades.

    On every page the table at position ``_COURSE_TABLE_INDEX`` is read, its
    'LVA' cell is split into title, study programme, course number and
    examiner, and the rows of all pages are combined.  The combined table is
    also written to ``sample.csv`` in the current working directory.

    Args:
        temp_file_path: Path of the PDF file to read.

    Returns:
        A 4-tuple ``(total_ects, weighted_mean, table, csv_path)``:
        the sum of the 'ECTS' column over all rows; the ECTS-weighted mean
        of the grades rounded to two decimals (rows whose assessment is not
        one of the five grades are excluded from both the weights and the
        mean; NaN when no row is graded); the combined ``DataFrame``; and
        the path of the written CSV file.

    Raises:
        FileNotFoundError: If ``temp_file_path`` does not exist.
        ValueError: If a processed table does not have exactly six columns
            or an 'ECTS' value cannot be converted to ``float``.
    """
    frames = []
    for page_tables in _read_page_tables(temp_file_path):
        if len(page_tables) <= _COURSE_TABLE_INDEX:
            # Skip pages without a course table instead of re-processing the
            # previous page's table (or failing on the very first page).
            print('Process finished!')
            continue
        frames.append(_normalise_page_table(page_tables[_COURSE_TABLE_INDEX]))

    if frames:
        combined = pd.concat(frames, ignore_index=True)
        final_df = combined.reindex(columns=_OUTPUT_COLUMNS)
    else:
        final_df = pd.DataFrame(columns=_OUTPUT_COLUMNS)

    # Line breaks inside a grade ("nicht\rgenügend") become spaces so the
    # text matches the grade names.
    final_df['Beurteilung'] = final_df['Beurteilung'].str.replace(
        '\r', ' ', regex=False
    )
    final_df['ECTS'] = final_df['ECTS'].astype(float)

    column_sum = final_df['ECTS'].sum()
    wm = _weighted_grade_mean(final_df['ECTS'], final_df['Beurteilung'])

    # Save the CSV file to the working directory.
    csv_save_path = os.path.join(os.getcwd(), "sample.csv")
    final_df.to_csv(csv_save_path, index=False)

    return column_sum, np.round(wm, 2), final_df, csv_save_path
134
+
135
# Define the Gradio interface.
# Components come from the top-level ``gr`` namespace; the ``gr.inputs`` and
# ``gr.outputs`` modules are deprecated aliases of the same components.
inputs = gr.Textbox(label="Enter the PDF file path:")

# One output component per value returned by pdf_processing, in return order:
# total ECTS, weighted mean, the processed table, and the CSV file path.
outputs = [
    gr.Textbox(label="Total ECTS"),
    gr.Textbox(label="Weighted Mean"),
    gr.Dataframe(type="pandas", label="Processed DataFrame").style(full_width=True),
    gr.File(),
]

iface = gr.Interface(
    fn=pdf_processing,
    inputs=inputs,
    outputs=outputs,
    title="KUSSS Buddy",
)

# Launch the Gradio interface.
iface.launch(share=True)
153