"""Streamlit app that extracts rater scores and strengths/weaknesses from a PDF.

The user uploads a PDF; the script pulls per-rater scores and the overall
("all raters") score for 22 fixed dimensions out of the extracted text, derives
a self score, and lists the dimensions each rater group named, ranked by that
group's score.
"""

import json
import os
import re
import tempfile

import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import streamlit as st

# Competence cluster, short code and name of each of the 22 rated dimensions.
TITLES = (
    ["Personal Competence"] * 13
    + ["Business Competence"] * 5
    + ["Management Competence"] * 4
)
CODES = (
    [f"P{i}" for i in range(1, 14)]
    + [f"B{i}" for i in range(1, 6)]
    + [f"M{i}" for i in range(1, 5)]
)
DIMENSIONS = [
    "Integrity/ Reliability",
    "Appearance",
    "Enthusiasm/Passion",
    "Learning Motivation/ Self-Development",
    "Ability to Adapt/Flexibility",
    "Communication/Information",
    "Cooperation/ Team spirit",
    "Handling of Complex Situations",
    "Coolness/Handling of Unclear Situations",
    "Self-reliance/Initiative",
    "Conflict Management",
    "Ability to Assert Oneself/ Negotiation Skills",
    "Tact and Sensitivity",
    "Quality Orientation",
    "Client Orientation",
    "Specialized Knowledge",
    "Methodology/ Didactics/ Language",
    "Creativity/ Conceptional Skills",
    "Project Management",
    "Result Orientation",
    "Leadership Skills",
    "Coach and Advisor",
]

# Order in which the rater sections matched in the PDF text are assigned to columns.
score_columns = [
    'Boss_score',
    'Colleague_score',
    'Colleague_other_score',
    'Report_score',
    'Customer_score',
]
# Rater columns that feed the self-score calculation.
feature_cols = [
    'Boss_score',
    'Colleague_score',
    'Report_score',
    'Customer_score',
    'Colleague_other_score',
]

# Score column -> rater labels (as captured by the label regex) ranked with it.
# NOTE(review): the label regex captures a single word (\w+), so the original
# filter value 'Colleague (o' could never match; a bare 'Colleague' label is
# assumed to be the "other colleague" group -- confirm against a real PDF.
RATER_GROUPS = {
    'Boss_score': ('Boss',),
    'Report_score': ('Direct',),
    'Colleague_score': ('Colleagues',),
    'Colleague_other_score': ('Colleague (o', 'Colleague'),
}


def extract_pdf_text(pdf_path):
    """Extract text from a PDF file.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.

    Returns:
        The text of every page, each page followed by a newline.
    """
    with fitz.open(pdf_path) as pdf_document:
        return "".join(page.get_text() + "\n" for page in pdf_document)


def extract_rater_scores(pdf_text):
    """Return one list of score strings per rater section found in ``pdf_text``.

    A section is the text between the "2 3 4 5 / -1,5 0" header and the
    "Trainer & Berater-Feedback" footer. Decimal commas become dots.
    """
    pattern = r"2\s*3\s*4\s*5\s*\n-1,5\s*0([\s\S]*?)\n\nTrainer & Berater-Feedback"
    chunks = []
    for match in re.findall(pattern, pdf_text):
        values = [
            value.strip()
            for value in match.replace(",", ".").split("\n")
            if value.strip()
        ]
        # Exactly 22 values are taken as-is; otherwise every second value is
        # kept (presumably the section interleaves two series -- TODO confirm).
        chunks.append(values if len(values) == 22 else values[1::2])
    return chunks


def _fit_to_length(values, length):
    """Truncate or pad ``values`` with ``None`` so it has exactly ``length`` items."""
    fitted = list(values[:length])
    return fitted + [None] * (length - len(fitted))


def build_score_frame(rater_chunks):
    """Build the 22-row dimension table with one score column per rater group.

    Missing rater sections yield an all-``None`` column instead of an
    ``IndexError``; over-long sections are truncated to the table length.
    """
    frame = pd.DataFrame({'Title': TITLES, 'Code': CODES, 'Dimensions': DIMENSIONS})
    for idx, col in enumerate(score_columns):
        current = rater_chunks[idx] if idx < len(rater_chunks) else []
        frame[col] = _fit_to_length(current, len(frame))
    return frame


def extract_all_raters_scores(pdf_text):
    """Return a frame of dimension ``Code`` and ``All_raters_Score``.

    Uses the first 22 "d,dd"-style numbers in the text as scores and, when at
    least 44 "X.n" codes are present, the 23rd-44th of them as codes. Codes and
    scores are paired positionally; any unpaired tail is dropped so that the
    two columns always have equal length.
    """
    all_scores = re.findall(r"\d{1,2},\d{2}", pdf_text)
    all_codes = re.findall(r"[A-Z]\.[0-9]{1,2}", pdf_text)
    scores = [float(score.replace(",", ".")) for score in all_scores]
    codes = list(all_codes)
    if len(codes) >= 44:
        codes = codes[22:44]
    if len(scores) >= 22:
        scores = scores[0:22]
    paired = min(len(codes), len(scores))
    return pd.DataFrame({
        'Code': [code.replace('.', '') for code in codes[:paired]],
        'All_raters_Score': scores[:paired],
    })


def calculate_self_score(row):
    """Derive the self score for one dimension row.

    With ``n`` non-missing rater scores the result is
    ``All_raters_Score * n - (sum(rater scores) - All_raters_Score)``, i.e.
    ``All_raters_Score * (n + 1) - sum(rater scores)``. Returns NaN when fewer
    than two rater scores are available.
    """
    valid_features = [val for val in row[feature_cols] if pd.notna(val)]
    num_features = len(valid_features)
    if num_features > 1:
        sum_features = sum(valid_features) - row['All_raters_Score']
        return (row['All_raters_Score'] * num_features) - sum_features
    return np.nan


def extract_rater_phrases(pdf_text):
    """Return a ``Rater``/``Dimensions`` frame of dimensions named per rater section.

    Each section of text between the development-planning sentence and
    "Schedule for the follow-up meeting" is scanned for the known dimension
    names. The i-th section is labelled with the i-th "perceived by:" word, or
    ``Phrases_<i+1>`` when there are fewer labels than sections.
    """
    phrases_pattern = (
        r"Please use the form at the end of the section to finalize your "
        r"development planning\.\s*(.*?)\s*Schedule for the follow-up meeting"
    )
    label_pattern = r"The biggest strengths and room for improvements perceived by:\s*(\w+)"
    sections = re.findall(phrases_pattern, pdf_text, re.DOTALL)
    labels = re.findall(label_pattern, pdf_text)

    rows = []
    for i, section_text in enumerate(sections):
        label = labels[i] if i < len(labels) else f"Phrases_{i+1}"
        rows.extend(
            {'Rater': label, 'Dimensions': phrase}
            for phrase in DIMENSIONS
            if phrase in section_text
        )
    # Explicit columns keep the frame filterable even when nothing matched.
    return pd.DataFrame(rows, columns=['Rater', 'Dimensions'])


def assign_strength_weakness(df):
    """Mark the first three scored rows 'S' and the next three 'W'.

    Expects ``df`` to have a ``Score`` column and a 0..n-1 index already
    sorted by descending score. Rows without a score, or beyond position 5,
    keep a missing ``Strength/Weakness`` value. Mutates and returns ``df``.
    """
    df['Strength/Weakness'] = None  # object column, so 'S'/'W' fit without upcasting
    has_score = df['Score'].notna()
    df.loc[df.index.isin([0, 1, 2]) & has_score, 'Strength/Weakness'] = 'S'
    df.loc[df.index.isin([3, 4, 5]) & has_score, 'Strength/Weakness'] = 'W'
    return df


def rank_strengths_weaknesses(phrases_df, scores_df):
    """Attach each rater group's score to its named dimensions and label S/W.

    All groups share a single ``Score`` column so that the concatenated result
    can be cleaned with ``dropna()`` without discarding every row.
    """
    frames = []
    for score_col, rater_labels in RATER_GROUPS.items():
        score_by_dimension = scores_df.set_index('Dimensions')[score_col].to_dict()
        rated = phrases_df[phrases_df['Rater'].isin(rater_labels)].copy()
        rated['Score'] = rated['Dimensions'].map(score_by_dimension)
        rated = rated.sort_values(by='Score', ascending=False).reset_index(drop=True)
        rated = assign_strength_weakness(rated)
        if not rated.empty:
            frames.append(rated)
    if not frames:
        return pd.DataFrame(columns=['Rater', 'Dimensions', 'Score', 'Strength/Weakness'])
    return pd.concat(frames, axis=0).dropna()


# Streamlit Application
st.title("PDF Data Extractor")

uploaded_file = st.file_uploader("Upload a PDF File", type="pdf")

if uploaded_file is not None:
    # A unique temp file avoids clashes between concurrent sessions; it is
    # closed before PyMuPDF opens it and removed afterwards.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        with tmp:
            tmp.write(uploaded_file.getbuffer())
        pdf_text = extract_pdf_text(tmp.name)
    finally:
        os.remove(tmp.name)

    # Per-rater scores for the 22 dimensions.
    rater_chunks = extract_rater_scores(pdf_text)
    if len(rater_chunks) < len(score_columns):
        st.warning(
            f"Found {len(rater_chunks)} of {len(score_columns)} expected rater "
            "score sections; missing scores are left empty."
        )
    df = build_score_frame(rater_chunks)

    # Overall scores, joined on the dimension code.
    df1 = extract_all_raters_scores(pdf_text)
    df_combined = pd.merge(df, df1, on="Code", how="inner")
    # Unparseable entries become NaN instead of aborting the whole run.
    df_combined[feature_cols] = (
        df_combined[feature_cols].apply(pd.to_numeric, errors="coerce").astype(float)
    )
    # Row-wise list keeps this well-defined even when the merge is empty.
    df_combined['Self_score'] = pd.Series(
        [calculate_self_score(row) for _, row in df_combined.iterrows()],
        index=df_combined.index,
        dtype=float,
    )

    # Strengths and weaknesses named by each rater group.
    df4 = extract_rater_phrases(pdf_text)
    df5 = rank_strengths_weaknesses(df4, df_combined)

    st.write("## Output:")
    st.write("### 1. Extracted dataset: Dimensions, Compentency Cluster, Raters and Scores by Raters")
    st.dataframe(df_combined)
    st.write("### 2. Extracted list of Strengths and Weaknesses rated by each Rater")
    st.write(df5)