Vineedhar committed on
Commit
f64d424
·
verified ·
1 Parent(s): ccc10b2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import re
import tempfile

import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import streamlit as st
7
+
8
def extract_pdf_text(pdf_path):
    """Return the concatenated text of every page of a PDF.

    The file at ``pdf_path`` is opened with PyMuPDF (``fitz``) inside a
    context manager, so the document is closed before returning. Pages are
    read in document order and a newline is appended after each page's text.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() + "\n" for page in document]
    return "".join(page_texts)
16
+
17
# Streamlit Application
#
# Flow: upload a PDF -> extract its text -> pull the per-rater score blocks and
# the all-raters scores out of the text with regexes -> show everything as one
# table together with a derived "Self_score" column.

# Static description of the competence dimensions, one entry per row of the
# resulting table (13 personal, 5 business and 4 management dimensions).
original_data = {
    'Title': (
        ["Personal Competence"] * 13
        + ["Business Competence"] * 5
        + ["Management Competence"] * 4
    ),
    'Code': [
        "P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10", "P11",
        "P12", "P13", "B1", "B2", "B3", "B4", "B5", "M1", "M2", "M3", "M4",
    ],
    'Dimensions': [
        "Integrity/ Reliability",
        "Appearance",
        "Enthusiasm/Passion",
        "Learning Motivation/ Self-Development",
        "Ability to Adapt/Flexibility",
        "Communication/Information",
        "Cooperation/ Team spirit",
        "Handling of Complex Situations",
        "Coolness/Handling of Unclear Situations",
        "Self-reliance/Initiative",
        "Conflict Management",
        "Ability to Assert Oneself/ Negotiation Skills",
        "Tact and Sensitivity",
        "Quality Orientation",
        "Client Orientation",
        "Specialized Knowledge",
        "Methodology/ Didactics/ Language",
        "Creativity/ Conceptional Skills",
        "Project Management",
        "Result Orientation",
        "Leadership Skills",
        "Coach and Advisor",
    ],
}

# Columns filled from the rater blocks, in the order the blocks are matched.
score_columns = ['Boss_score', 'Colleague_score', 'Colleague_other_score', 'Report_score', 'Customer_score']

# Same columns, in the order they are summed when deriving the self score.
feature_cols = ['Boss_score', 'Colleague_score', 'Report_score', 'Customer_score', 'Colleague_other_score']


def calculate_self_score(row):
    """Derive the self score for one table row.

    With ``n`` non-missing rater scores the result equals
    ``All_raters_Score * (n + 1) - sum(rater scores)``.
    NOTE(review): this presumes ``All_raters_Score`` is the plain mean of the
    ``n`` rater scores plus the self score -- confirm against the report.

    Returns ``numpy.nan`` when fewer than two rater scores are present.
    """
    valid_features = [val for val in row[feature_cols] if pd.notna(val)]
    num_features = len(valid_features)
    if num_features > 1:
        # Kept as two steps so the floating-point result matches the
        # historical output exactly.
        sum_features = sum(valid_features) - row['All_raters_Score']
        return (row['All_raters_Score'] * num_features) - sum_features
    return np.nan


st.title("PDF Data Extractor")

uploaded_file = st.file_uploader("Upload a PDF File", type="pdf")

if uploaded_file is not None:
    # Store the upload in a private temporary directory rather than a fixed
    # "temp.pdf" in the working directory: concurrent sessions would overwrite
    # each other's file, and the file was never cleaned up.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "upload.pdf")
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        pdf_text = extract_pdf_text(pdf_path)

    n_dimensions = len(original_data['Code'])

    # Step 2: Extract relevant information from the text using regex
    pattern = r"2\s*3\s*4\s*5\s*\n-1,5\s*0([\s\S]*?)\n\nTrainer & Berater-Feedback"
    matches = re.findall(pattern, pdf_text)

    rater_blocks = []
    for match in matches:
        # Decimal commas become decimal points; blank lines are dropped.
        values = [value.strip() for value in match.replace(",", ".").split("\n") if value.strip()]
        if len(values) == n_dimensions:
            rater_blocks.append(values)
        else:
            # Any other count keeps every second entry, starting with the
            # second. NOTE(review): presumably such blocks interleave another
            # value with each score -- confirm against a sample PDF.
            rater_blocks.append(values[1::2])

    if not rater_blocks:
        st.error("No rater score blocks were found in this PDF.")
        st.stop()

    if len(rater_blocks) < len(score_columns):
        st.warning(
            f"Only {len(rater_blocks)} of {len(score_columns)} rater score blocks were found; "
            "blocks are assigned to columns by position and the remaining columns are left empty."
        )

    df = pd.DataFrame(original_data)

    # Add extracted scores to the DataFrame
    for idx, col in enumerate(score_columns):
        # Missing blocks yield an empty column instead of an IndexError.
        current = rater_blocks[idx] if idx < len(rater_blocks) else []
        if len(current) > n_dimensions:
            st.warning(f"{col}: found {len(current)} values, keeping the first {n_dimensions}.")
            current = current[:n_dimensions]
        df[col] = current + [None] * (n_dimensions - len(current))

    score_pattern = r"\d{1,2},\d{2}"
    code_pattern = r"[A-Z]\.[0-9]{1,2}"

    all_scores = re.findall(score_pattern, pdf_text)
    all_codes = re.findall(code_pattern, pdf_text)

    scores = [float(score.replace(",", ".")) for score in all_scores]
    codes = [code.strip() for code in all_codes]

    # NOTE(review): when enough codes are present the second run of codes is
    # paired with the first run of scores -- presumably the first run of codes
    # belongs to another section of the report; confirm.
    if len(codes) >= 2 * n_dimensions:
        codes = codes[n_dimensions:2 * n_dimensions]
    if len(scores) >= n_dimensions:
        scores = scores[0:n_dimensions]

    if len(codes) != len(scores):
        # Pairing lists of different length would either crash or misalign
        # scores with codes, so stop with a readable message instead.
        st.error(
            f"Could not pair all-raters scores with dimension codes "
            f"({len(codes)} codes, {len(scores)} scores)."
        )
        st.stop()

    df1 = pd.DataFrame({'Code': [code.replace('.', '') for code in codes], 'All_raters_Score': scores})
    df_combined = pd.merge(df, df1, on="Code", how="inner")

    # Padded ``None`` entries become NaN here and are skipped by
    # calculate_self_score.
    df_combined[feature_cols] = df_combined[feature_cols].astype(float)

    df_combined['Self_score'] = df_combined.apply(calculate_self_score, axis=1)

    # Display the resultant DataFrame
    st.write("### Extracted Dataset")
    st.dataframe(df_combined)