Spaces:
Running
Running
File size: 11,826 Bytes
f64d424 152ef41 cbc5835 f64d424 d210806 ce17064 6d02524 ce17064 6d02524 ce17064 bbc90c5 b5cddd1 6d02524 4d9396f 6d02524 88164ab 6d02524 6046236 88164ab 990d712 f64d424 bbf7f54 7fb0758 6d02524 88164ab 990d712 c963632 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 |
import json
import os
import re
import tempfile

import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import streamlit as st
# LICENSE.numpy.BSD-3 - Copyright (c) 2005-2024, NumPy Developers (https://github.com/numpy/numpy/blob/main/LICENSE.txt)
# LICENSE.streamlit.Apachev2 - Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022-2024) (https://github.com/streamlit/streamlit/blob/develop/LICENSE)
# LICENSE.pandas.BSD-3 - Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team (https://github.com/pandas-dev/pandas/blob/main/LICENSE)
# LICENSE.re.CNRI - Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. (https://www.handle.net/python_licenses/python1.6_9-5-00.html)
# LICENSE.json.LGPL - Copyright: (c) 2017-2019 by Brad Jasper (c) 2012-2017 by Trevor Lohrbeer (https://github.com/bradjasper/ImportJSON/blob/master/LICENSE)
# LICENSE.pymupdf.AGPL - Copyright (C) 2023 Artifex Software, Inc. (https://github.com/pymupdf/PyMuPDF/blob/main/COPYING)
def extract_pdf_text(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The text of all pages in page order, each page followed by a
        newline. An empty document yields an empty string.
    """
    with fitz.open(pdf_path) as pdf_document:
        # Build the result in one pass instead of growing a string page by
        # page; the generator is fully consumed while the document is open.
        return "".join(page.get_text() + "\n" for page in pdf_document)
# Streamlit Application
st.title("PDF Data Extractor")
uploaded_file = st.file_uploader("Upload a PDF File", type="pdf")

# Nothing can be extracted before a PDF has been uploaded: end this script run
# here so the remainder of the script does not have to be nested in an `if`.
if uploaded_file is None:
    st.stop()

# extract_pdf_text() opens the PDF from a path, so the upload is spooled to
# disk first. A unique temporary file (rather than a fixed "temp.pdf") keeps
# concurrent sessions from overwriting each other's upload, and the file is
# removed again once its text has been read.
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
    temp_pdf.write(uploaded_file.getbuffer())
    temp_pdf_path = temp_pdf.name
try:
    pdf_text = extract_pdf_text(temp_pdf_path)
finally:
    os.remove(temp_pdf_path)

# Step 2: Extract relevant information from the text using regex.
# Each match is the text between the chart axis labels and the
# "Trainer & Berater-Feedback" footer.
pattern = r"2\s*3\s*4\s*5\s*\n-1,5\s*0([\s\S]*?)\n\nTrainer & Berater-Feedback"
matches = re.findall(pattern, pdf_text)
json_chunks = []
for match in matches:
    # Decimal commas become decimal points; blank lines are dropped.
    values = [value.strip() for value in match.replace(",", ".").split("\n") if value.strip()]
    # A chunk with exactly 22 entries is used as-is; otherwise every second
    # entry is kept (presumably the values alternate with other text in that
    # layout - verify against a sample PDF).
    json_chunks.append({"current": values if len(values) == 22 else values[1::2]})

# Define the original data structure: 13 personal, 5 business and
# 4 management dimensions (22 rows in total).
original_data = {
    'Title': (
        ["Personal Competence"] * 13
        + ["Business Competence"] * 5
        + ["Management Competence"] * 4
    ),
    'Code': ["P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10", "P11", "P12",
             "P13", "B1", "B2", "B3", "B4", "B5", "M1", "M2", "M3", "M4"],
    'Dimensions': [
        "Integrity/ Reliability", "Appearance", "Enthusiasm/Passion", "Learning Motivation/ Self-Development", "Ability to Adapt/Flexibility",
        "Communication/Information", "Cooperation/ Team spirit", "Handling of Complex Situations", "Coolness/Handling of Unclear Situations",
        "Self-reliance/Initiative", "Conflict Management", "Ability to Assert Oneself/ Negotiation Skills", "Tact and Sensitivity",
        "Quality Orientation", "Client Orientation", "Specialized Knowledge", "Methodology/ Didactics/ Language", "Creativity/ Conceptional Skills",
        "Project Management", "Result Orientation", "Leadership Skills", "Coach and Advisor",
    ],
}
df = pd.DataFrame(original_data)

# Add extracted scores to the DataFrame: the n-th matched chunk fills the n-th
# score column. A missing chunk yields an all-None column, a short chunk is
# padded with None and an over-long chunk is truncated, so the assignment can
# never fail on a length mismatch.
score_columns = ['Boss_score', 'Colleague_score', 'Colleague_other_score', 'Report_score', 'Customer_score']
row_count = len(df)
for idx, col in enumerate(score_columns):
    current = json_chunks[idx]['current'][:row_count] if idx < len(json_chunks) else []
    df[col] = current + [None] * (row_count - len(current))

# Overall ("all raters") scores: every "d,dd" number and every "X.n" code in
# the document text.
score_pattern = r"\d{1,2},\d{2}"
code_pattern = r"[A-Z]\.[0-9]{1,2}"
all_scores = re.findall(score_pattern, pdf_text)
all_codes = re.findall(code_pattern, pdf_text)
scores = [float(score.replace(",", ".")) for score in all_scores]
codes = [code.strip() for code in all_codes]
# With at least 44 codes the second run of 22 is used; with at least 22
# scores the first 22 are used.
if len(codes) >= 44:
    codes = codes[22:44]
if len(scores) >= 22:
    scores = scores[0:22]
# The DataFrame constructor rejects columns of unequal length, so pair codes
# and scores only as far as both lists reach and tell the user about it.
if len(codes) != len(scores):
    st.warning(
        "The number of dimension codes and overall scores found in the PDF "
        "differ; only the leading pairs are used."
    )
    paired_count = min(len(codes), len(scores))
    codes, scores = codes[:paired_count], scores[:paired_count]
df1 = pd.DataFrame({'Code': [code.replace('.', '') for code in codes], 'All_raters_Score': scores})
df_combined = pd.merge(df, df1, on="Code", how="inner")
feature_cols = ['Boss_score', 'Colleague_score', 'Report_score', 'Customer_score', 'Colleague_other_score']
# The per-rater values are still strings (or None) at this point.
df_combined[feature_cols] = df_combined[feature_cols].astype(float)
def calculate_self_score(row):
    """Derive the self-rating of one dimension from the other scores in its row.

    Only the rater columns listed in ``feature_cols`` that hold a value are
    used. With ``n`` such scores the result is
    ``All_raters_Score * n - (sum_of_scores - All_raters_Score)``.

    Args:
        row: One DataFrame row containing 'All_raters_Score' and the
            ``feature_cols`` columns.

    Returns:
        The computed score, or NaN when fewer than two rater scores are
        present.
    """
    rater_scores = [score for score in row[feature_cols] if not pd.isna(score)]
    rater_count = len(rater_scores)
    if rater_count <= 1:
        return np.nan
    overall = row['All_raters_Score']
    return (overall * rater_count) - (sum(rater_scores) - overall)
df_combined['Self_score'] = df_combined.apply(calculate_self_score, axis=1)
# NOTE(review): the benchmark is random placeholder data drawn anew on every
# script run - replace it with real benchmark values before relying on it.
df_combined['Benchmark_score'] = np.random.uniform(4.8, 5.9, size=len(df_combined)).round(1)

# Step 7: Picking strengths and weaknesses
# Dimension names to look for in each rater's section
keywords = [
    'Integrity/ Reliability', 'Appearance', 'Enthusiasm/Passion',
    'Learning Motivation/ Self-Development', 'Ability to Adapt/Flexibility',
    'Communication/Information', 'Cooperation/ Team spirit',
    'Handling of Complex Situations', 'Coolness/Handling of Unclear Situations', 'Self-reliance/Initiative',
    'Conflict Management', 'Ability to Assert Oneself/ Negotiation Skills',
    'Tact and Sensitivity', 'Quality Orientation', 'Client Orientation',
    'Specialized Knowledge', 'Methodology/ Didactics/ Language',
    'Creativity/ Conceptional Skills', 'Project Management',
    'Result Orientation', 'Leadership Skills', 'Coach and Advisor',
]
# Text between the "Please use the form at the end of the section ..."
# sentence and "Schedule for the follow-up meeting"
phrases_pattern = r"Please use the form at the end of the section to finalize your development planning\.\s*(.*?)\s*Schedule for the follow-up meeting"
phrases_matches = re.findall(phrases_pattern, pdf_text, re.DOTALL)
# The word after "The biggest strengths and room for improvements perceived by:"
label_pattern = r"The biggest strengths and room for improvements perceived by:\s*(\w+)"
labels = re.findall(label_pattern, pdf_text)

# One row per (rater, dimension) for every dimension name that occurs in the
# rater's section. The n-th section is paired with the n-th label; sections
# without a label get a generic "Phrases_<n>" name.
data = []
for i, phrases_text in enumerate(phrases_matches):
    label = labels[i] if i < len(labels) else f"Phrases_{i+1}"
    data.extend(
        {'Rater': label, 'Dimensions': phrase}
        for phrase in keywords
        if phrase in phrases_text
    )
# Explicit columns keep the 'Rater' lookups below valid when nothing matched.
df4 = pd.DataFrame(data, columns=['Rater', 'Dimensions'])

# Step 9: Attach scores to the strengths and weaknesses
# Split the rows by rater label.
# NOTE(review): label_pattern captures a single \w+ word, so no label can ever
# equal 'Colleague (o' and other_colleague is always empty as written - confirm
# the intended label for this group.
boss, direct, colleague, other_colleague = [
    df4[df4['Rater'] == label].copy()
    for label in ['Boss', 'Direct', 'Colleagues', 'Colleague (o']
]

# Look up each rater's score per dimension in df_combined and sort every
# rater's rows from highest to lowest score with a fresh 0..n-1 index.
scores_by_dimension = df_combined.set_index('Dimensions')
ranked_frames = []
for frame, score_column in (
    (boss, 'Boss_score'),
    (direct, 'Report_score'),
    (colleague, 'Colleague_score'),
    (other_colleague, 'Colleague_other_score'),
):
    frame['Score'] = frame['Dimensions'].map(scores_by_dimension[score_column].to_dict())
    ranked_frames.append(frame.sort_values(by='Score', ascending=False).reset_index(drop=True))
boss, direct, colleague, other_colleague = ranked_frames
def assign_strength_weakness(df):
    """Label the first three rows as strengths and the next three as weaknesses.

    Rows are selected by index label: labels 0-2 get 'S' and labels 3-5 get
    'W', in both cases only where 'Score' is present. Every other row keeps
    NaN. The labels are only meaningful when the frame is already ordered
    best-first with a 0..n-1 index.

    Args:
        df: DataFrame with a 'Score' column. It is modified in place.

    Returns:
        The same DataFrame, with the added 'Strength/Weakness' column.
    """
    # Create the column with object dtype: writing 'S'/'W' into a float NaN
    # column is an incompatible-dtype assignment that pandas has deprecated.
    df['Strength/Weakness'] = np.full(len(df), np.nan, dtype=object)
    has_score = df['Score'].notna()
    df.loc[df.index.isin([0, 1, 2]) & has_score, 'Strength/Weakness'] = 'S'
    df.loc[df.index.isin([3, 4, 5]) & has_score, 'Strength/Weakness'] = 'W'
    return df
# Label every rater's ranked rows, then stack them into one table and keep
# only the rows without missing values.
boss, direct, colleague, other_colleague = map(
    assign_strength_weakness,
    (boss, direct, colleague, other_colleague),
)
df5 = pd.concat([boss, direct, colleague, other_colleague], axis=0).dropna()

# Headings of the open-comment sections to look for
sections = [
    "Continue doing the following",
    "Start doing the following",
    'Reasons why I think that your behavior has worsened concerning the dimensions marked in the "Perception & Change Section" of the questionnaire',
    "Further tips for your work in our organisation",
]

# Per rater group: regex capturing the text between the group's marker line
# and the group's name in parentheses
patterns = {
    "Boss": r"VG\n(.*?)(?=\(Boss\))",
    "Colleagues": r"Ke\n(.*?)(?=\(Colleagues\))",
    "Customers": r"KU\n(.*?)(?=\(Internal/external customers\))",
}
# Function to extract comments for each section
def extract_comments(data, section):
    """Collect the open comments of one section, grouped by rater.

    The section body is the text after ``Kom <section>:`` up to the next
    "IX. Open Comments" heading or the end of ``data``. Every regex in the
    file-level ``patterns`` mapping is applied to that body.

    Args:
        data: Full text to search.
        section: Section heading, matched literally.

    Returns:
        A list of ``{"Section", "Rater", "Comment"}`` dicts ordered by rater
        as listed in ``patterns``; empty when the section is not found.
    """
    section_regex = rf"Kom\s+{re.escape(section)}:\n(.*?)(?=(?:IX\. Open Comments|$))"
    found = re.search(section_regex, data, re.DOTALL)
    if found is None:
        return []
    body = found.group(1)
    return [
        {"Section": section, "Rater": rater, "Comment": text.strip()}
        for rater, rater_regex in patterns.items()
        for text in re.findall(rater_regex, body, re.DOTALL)
    ]
# Gather the open comments of every section into a single DataFrame
all_comments = []
for section in sections:
    all_comments.extend(extract_comments(pdf_text, section))
df6 = pd.DataFrame(all_comments)

# Render the three result tables
st.write("## Output:")
st.write("### 1. Dataset: Competency Cluster, Code, Dimensions, Raters and Score")
st.dataframe(df_combined)
# The note shows the formula that calculate_self_score() actually applies:
# All * n - (sum - All), which equals All * (n + 1) - sum.
st.write("#### Note: The Self Score is calculated as: (All Raters × (Number of Raters + 1)) − (Sum of Rater Scores)")
st.write("### 2. Extracted list of Strengths and Weaknesses rated by each Rater")
st.write(df5)
st.write("### 3. Extracted list of Open Comments by each Rater")
st.write(df6)
st.write("#### Note: This extraction is not 100% able to extract each Rater comments / feedback. This will be improved")