File size: 11,826 Bytes
f64d424
 
 
 
 
 
 
152ef41
 
 
 
 
 
 
 
 
 
 
 
 
cbc5835
f64d424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d210806
ce17064
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d02524
ce17064
 
 
 
 
 
6d02524
ce17064
 
 
 
 
 
 
 
 
 
bbc90c5
 
 
 
 
 
 
 
 
b5cddd1
6d02524
4d9396f
 
 
 
6d02524
 
 
 
 
 
 
 
 
88164ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d02524
6046236
88164ab
990d712
f64d424
bbf7f54
7fb0758
6d02524
 
88164ab
 
990d712
c963632
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
import streamlit as st
import pandas as pd
import numpy as np
import fitz  # PyMuPDF
import re
import json

# LICENSE.numpy.BSD-3 			- 	Copyright (c) 2005-2024, NumPy Developers (https://github.com/numpy/numpy/blob/main/LICENSE.txt)

# LICENSE.streamlit.Apachev2 	- 	Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022-2024) (https://github.com/streamlit/streamlit/blob/develop/LICENSE)

# LICENSE.pandas.BSD-3	 		- 	Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team (https://github.com/pandas-dev/pandas/blob/main/LICENSE)

# LICENSE.re.CNRI				-	Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. (https://www.handle.net/python_licenses/python1.6_9-5-00.html)


# LICENSE.json.PSF				-	The json module is part of the Python standard library, distributed under the Python Software Foundation License (https://docs.python.org/3/license.html)

# LICENSE.pymupdf.AGPL			- 	Copyright (C) 2023 Artifex Software, Inc. (https://github.com/pymupdf/PyMuPDF/blob/main/COPYING)


def extract_pdf_text(pdf_path):
    """Extract the plain text of every page of a PDF file.

    Parameters
    ----------
    pdf_path : str
        Filesystem path of the PDF to read.

    Returns
    -------
    str
        Text of all pages concatenated in page order, with a newline
        appended after each page (same layout as the original loop).
    """
    with fitz.open(pdf_path) as pdf_document:
        # Iterate pages directly instead of load_page(range(len(...))),
        # and build the result with one join instead of quadratic +=.
        return "".join(page.get_text() + "\n" for page in pdf_document)

# Streamlit Application
# Page title rendered at the top of the app.
st.title("PDF Data Extractor")

# Returns an UploadedFile once the user selects a PDF; None until then.
uploaded_file = st.file_uploader("Upload a PDF File", type="pdf")

if uploaded_file is not None:
    # Persist the upload to disk because extract_pdf_text/fitz works from a path.
    # NOTE(review): the fixed name "temp.pdf" means concurrent sessions would
    # overwrite each other's file — consider tempfile.NamedTemporaryFile.
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_file.getbuffer())

    pdf_text = extract_pdf_text("temp.pdf")

    # Step 2: Extract relevant information from the text using regex
    # Captures everything between the rating-scale header ("2 3 4 5" / "-1,5 0")
    # and the "Trainer & Berater-Feedback" heading — presumably one block of
    # per-dimension scores per rater group in the (German) report; confirm
    # against a sample PDF.
    pattern = r"2\s*3\s*4\s*5\s*\n-1,5\s*0([\s\S]*?)\n\nTrainer & Berater-Feedback"
    matches = re.findall(pattern, pdf_text)

    json_chunks = []
    for match in matches:
        # Decimal comma -> decimal point so values can later be parsed as floats.
        match = match.replace(",", ".")
        values = [value.strip() for value in match.split("\n") if value.strip()]
        if len(values) == 22:
            # Exactly one value per competence dimension (22 in total).
            json_chunks.append({"current": values})
        else:
            # Otherwise the lines appear to alternate label/value, so keep
            # every second entry — assumption to verify with a sample PDF.
            current = values[1::2]
            json_chunks.append({"current": current})

    # Round-trip through a JSON string; json_data ends up equal to json_chunks.
    json_output = json.dumps(json_chunks, indent=2)
    json_data = json.loads(json_output)

    # Define the original data structure
    # Static catalogue of the 22 feedback dimensions: competence cluster,
    # short code (P = Personal, B = Business, M = Management) and name.
    original_data = {
        'Title': [
            "Personal Competence", "Personal Competence", "Personal Competence", "Personal Competence", "Personal Competence", "Personal Competence",
            "Personal Competence", "Personal Competence", "Personal Competence", "Personal Competence", "Personal Competence",
            "Personal Competence", "Personal Competence", "Business Competence", "Business Competence", "Business Competence", "Business Competence",
            "Business Competence", "Management Competence", "Management Competence", "Management Competence", "Management Competence",
        ],
        'Code': ["P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10", "P11", "P12",
                 "P13", "B1", "B2", "B3", "B4", "B5", "M1", "M2", "M3", "M4"],
        'Dimensions': [
            "Integrity/ Reliability", "Appearance", "Enthusiasm/Passion", "Learning Motivation/ Self-Development", "Ability to Adapt/Flexibility",
            "Communication/Information", "Cooperation/ Team spirit", "Handling of Complex Situations", "Coolness/Handling of Unclear Situations",
            "Self-reliance/Initiative", "Conflict Management", "Ability to Assert Oneself/ Negotiation Skills", "Tact and Sensitivity",
            "Quality Orientation", "Client Orientation", "Specialized Knowledge", "Methodology/ Didactics/ Language", "Creativity/ Conceptional Skills",
            "Project Management", "Result Orientation", "Leadership Skills", "Coach and Advisor"
        ]
    }

    df = pd.DataFrame(original_data)

    # Add extracted scores to the DataFrame
    # One extracted chunk per rater group, assumed to appear in this fixed
    # order in the PDF; shorter chunks are right-padded with None so every
    # column matches the 22-row frame.
    # NOTE(review): raises IndexError if fewer than 5 chunks were extracted.
    score_columns = ['Boss_score', 'Colleague_score', 'Colleague_other_score', 'Report_score', 'Customer_score']
    for idx, col in enumerate(score_columns):
        df[col] = json_data[idx]['current'] + [None] * (len(df) - len(json_data[idx]['current']))

    # Scores appear in the PDF as decimal-comma numbers ("4,35"); dimension
    # codes as letter-dot-number ("P.1" … "M.4").
    score_pattern = r"\d{1,2},\d{2}"
    code_pattern = r"[A-Z]\.[0-9]{1,2}"

    all_scores = re.findall(score_pattern, pdf_text)
    all_codes = re.findall(code_pattern, pdf_text)

    scores = [float(score.replace(",", ".")) for score in all_scores]
    codes = [code.strip() for code in all_codes]

    # Positional heuristics: presumably the second run of 22 codes and the
    # first 22 scores form the "all raters" aggregate — TODO confirm on
    # additional sample PDFs.
    if len(codes) >= 44:
        codes = codes[22:44]
    if len(scores) >= 22:
        scores = scores[0:22]

    # Strip the dot ("P.1" -> "P1") so codes line up with df['Code'], then
    # attach the aggregate score; the inner merge silently drops any codes
    # that do not match the catalogue.
    df1 = pd.DataFrame({'Code': [code.replace('.', '') for code in codes], 'All_raters_Score': scores})
    df_combined = pd.merge(df, df1, on="Code", how="inner")

    feature_cols = ['Boss_score', 'Colleague_score', 'Report_score', 'Customer_score', 'Colleague_other_score']
    # Extracted values are strings (None where padded) — cast to float/NaN.
    df_combined[feature_cols] = df_combined[feature_cols].astype(float)

    def calculate_self_score(row, cols=('Boss_score', 'Colleague_score',
                                        'Report_score', 'Customer_score',
                                        'Colleague_other_score')):
        """Back out the self-assessment score for one dimension row.

        Assumes 'All_raters_Score' is the mean over the external rater
        scores plus the self score, so:
            self = All * (n_external + 1) - sum(external)
        which is exactly what the arithmetic below computes
        (All*n - (sum - All) == All*(n+1) - sum).

        Parameters
        ----------
        row : pd.Series
            Row holding the rater score columns and 'All_raters_Score'.
        cols : sequence of str, optional
            Names of the external rater score columns to consider.
            Defaults to the five rater columns used in this report
            (previously captured from the enclosing scope).

        Returns
        -------
        float
            Derived self score, or NaN when fewer than two external
            scores are present (the formula is not meaningful then).
        """
        valid_features = [val for val in row[list(cols)] if pd.notna(val)]
        num_features = len(valid_features)
        if num_features > 1:
            # Sum of external scores minus the overall mean ...
            sum_features = sum(valid_features) - row['All_raters_Score']
            # ... rearranged into All*(n+1) - sum(external).
            return (row['All_raters_Score'] * num_features) - sum_features
        return np.nan

    # Derive the self-assessment for every dimension row.
    df_combined['Self_score'] = df_combined.apply(calculate_self_score, axis=1)
    # NOTE(review): the benchmark is a random placeholder (non-deterministic
    # per run), not a value extracted from the PDF.
    df_combined['Benchmark_score'] = np.random.uniform(4.8, 5.9, size=len(df_combined)).round(1)
    # Step 7: Picking strengths and weaknesses
    # List of keywords/phrases to capture — mirrors original_data['Dimensions'].
    keywords = [
        'Integrity/ Reliability', 'Appearance', 'Enthusiasm/Passion',
        'Learning Motivation/ Self-Development', 'Ability to Adapt/Flexibility',
        'Communication/Information', 'Cooperation/ Team spirit',
        'Handling of Complex Situations', 'Coolness/Handling of Unclear Situations', 'Self-reliance/Initiative',
        'Conflict Management', 'Ability to Assert Oneself/ Negotiation Skills',
        'Tact and Sensitivity', 'Quality Orientation', 'Client Orientation',
        'Specialized Knowledge', 'Methodology/ Didactics/ Language',
        'Creativity/ Conceptional Skills', 'Project Management',
        'Result Orientation', 'Leadership Skills', 'Coach and Advisor'
    ]

    # Extract phrases between "Topics I would like to discuss..." and
    # "Schedule for the follow-up meeting" — one span per rater section.
    phrases_pattern = r"Please use the form at the end of the section to finalize your development planning\.\s*(.*?)\s*Schedule for the follow-up meeting"
    phrases_matches = re.findall(phrases_pattern, pdf_text, re.DOTALL)

    # Extract the word after "The biggest strengths and room for improvements
    # perceived by:" — used as the rater label for the matching span.
    label_pattern = r"The biggest strengths and room for improvements perceived by:\s*(\w+)"
    labels = re.findall(label_pattern, pdf_text)

    # Process each match and keep only the known dimension keywords.
    # Labels are paired with spans by position; a synthetic "Phrases_N"
    # label is used when there are fewer labels than spans.
    json_output = []
    for i, phrases_text in enumerate(phrases_matches):
        extracted_phrases = [
            phrase for phrase in keywords if phrase in phrases_text
        ]
        if extracted_phrases:
            label = labels[i] if i < len(labels) else f"Phrases_{i+1}"
            json_output.append({label: extracted_phrases})

    # Convert to JSON format (string form is currently unused).
    json_output_str = json.dumps(json_output, indent=2)

    # Print the JSON result
    #print(json_output_str)

    # NOTE(review): this overwrites the earlier json_data and is never read
    # again below — candidate for removal.
    json_data = df.to_json(orient='records')

    # Flatten {label: [dimensions]} records into (Rater, Dimensions) rows.
    data = []
    for item in json_output:
        for label, phrases in item.items():
            for phrase in phrases:
                data.append({'Rater': label, 'Dimensions': phrase})

    df4 = pd.DataFrame(data)

    # Step 9: Converting strengths and weaknesses with scores into JSON

    # Filter dataframes based on 'Rater' value.
    # NOTE(review): 'Colleague (o' looks like a truncated extraction artifact
    # for "Colleague (other)" — verify against the label regex output.
    boss, direct, colleague, other_colleague = [df4[df4['Rater'] == label].copy() for label in ['Boss', 'Direct', 'Colleagues', 'Colleague (o']]

    # Create Dimensions -> score lookup dictionaries from df_combined.
    mappings = {
        'Boss_score': df_combined.set_index('Dimensions')['Boss_score'].to_dict(),
        'Report_score': df_combined.set_index('Dimensions')['Report_score'].to_dict(),
        'Colleague_score': df_combined.set_index('Dimensions')['Colleague_score'].to_dict(),
        'Other_colleague_score': df_combined.set_index('Dimensions')['Colleague_other_score'].to_dict()
    }

    # Map each rater group's dimensions to that group's own score column.
    boss['Score'] = boss['Dimensions'].map(mappings['Boss_score'])
    direct['Score'] = direct['Dimensions'].map(mappings['Report_score'])
    colleague['Score'] = colleague['Dimensions'].map(mappings['Colleague_score'])
    other_colleague['Score'] = other_colleague['Dimensions'].map(mappings['Other_colleague_score'])

    # Sort best-first and re-index from 0 so that positions 0-2 / 3-5 can be
    # labelled strengths / weaknesses below.
    boss = boss.sort_values(by = 'Score', ascending = False).reset_index(drop = True)
    direct = direct.sort_values(by = 'Score', ascending = False).reset_index(drop = True)
    colleague = colleague.sort_values(by = 'Score', ascending = False).reset_index(drop = True)
    other_colleague = other_colleague.sort_values(by = 'Score', ascending = False).reset_index(drop = True)

    def assign_strength_weakness(df):
        """Mark the top three scored rows 'S' (strength) and the next three
        'W' (weakness); all other rows keep NaN.

        Assumes df is sorted by 'Score' descending and re-indexed from 0,
        so positions 0-2 are the best and 3-5 the next best. Rows without a
        score are never labelled. Mutates df in place and returns it.
        """
        df['Strength/Weakness'] = np.nan
        has_score = df['Score'].notna()
        df.loc[has_score & df.index.isin(range(0, 3)), 'Strength/Weakness'] = 'S'
        df.loc[has_score & df.index.isin(range(3, 6)), 'Strength/Weakness'] = 'W'
        return df

    # Apply the labelling to each rater group's DataFrame.
    boss = assign_strength_weakness(boss)
    direct = assign_strength_weakness(direct)
    colleague = assign_strength_weakness(colleague)
    other_colleague = assign_strength_weakness(other_colleague)

    # Stack all groups and keep only labelled rows with a score
    # (dropna removes rows with NaN in any column).
    df5 = pd.concat([boss, direct, colleague, other_colleague], axis = 0)
    df5 = df5.dropna()

    # Questionnaire sections whose free-text comments are extracted below.
    sections = [
    "Continue doing the following",
    "Start doing the following",
    "Reasons why I think that your behavior has worsened concerning the dimensions marked in the \"Perception & Change Section\" of the questionnaire",
    "Further tips for your work in our organisation"
    ]

    # Per-rater comment patterns: text between a marker ("VG", "Ke", "KU" —
    # presumably German rater abbreviations; confirm) and the bracketed
    # rater name.
    patterns = {
        "Boss": r"VG\n(.*?)(?=\(Boss\))",
        "Colleagues": r"Ke\n(.*?)(?=\(Colleagues\))",
        "Customers": r"KU\n(.*?)(?=\(Internal/external customers\))"
    }
    
    # Function to extract comments for each section
    def extract_comments(data, section):
        """Collect free-text comments for one questionnaire section.

        Locates the section body between "Kom <section>:" and either the
        "IX. Open Comments" heading or the end of the text, then pulls the
        per-rater comments using the module-level `patterns` regexes.

        Returns a list of {"Section", "Rater", "Comment"} dicts; empty when
        the section heading is not present in `data`.
        """
        heading_re = rf"Kom\s+{re.escape(section)}:\n(.*?)(?=(?:IX\. Open Comments|$))"
        found = re.search(heading_re, data, re.DOTALL)
        if found is None:
            return []

        body = found.group(1)
        return [
            {
                "Section": section,
                "Rater": rater,
                "Comment": hit.strip()
            }
            for rater, rater_re in patterns.items()
            for hit in re.findall(rater_re, body, re.DOTALL)
        ]
    
    # Create dataframes for each section
    # Gather the comments of all four feedback sections into one frame.
    all_comments = []
    for section in sections:
        all_comments.extend(extract_comments(pdf_text, section))

    df6 = pd.DataFrame(all_comments)

    # Render the three result tables on the Streamlit page.
    st.write("## Output:")

    st.write("### 1. Dataset: Compentency Cluster, Code, Dimensions, Raters and Score")
    st.dataframe(df_combined)
    st.write("#### Note: The Self Score is calculated as: (All Raters × Number of Raters) − (Sum of Rater Scores)")

    st.write("### 2. Extracted list of Strengths and Weaknesses rated by each Rater")
    st.write(df5)

    st.write("### 3. Extracted list of Open Comments by each Rater")
    st.write(df6)
    st.write("#### Note: This extraction is not 100% able to extract each Rater comments / feedback. This is will be improved")