# Spaces: Sleeping
import json
import os
import re
import tempfile

import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import streamlit as st
def extract_pdf_text(pdf_path: str) -> str:
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages in document order, with a newline appended
        after each page's text. An empty string if the document has no
        pages.
    """
    with fitz.open(pdf_path) as pdf_document:
        # Iterating the document yields its pages in order. Joining once
        # avoids the repeated string concatenation of an index loop; the
        # join is fully evaluated before the document is closed.
        return "".join(page.get_text() + "\n" for page in pdf_document)
# Streamlit Application

# Number of rated dimensions (P1-P13, B1-B5, M1-M4) expected per block.
EXPECTED_DIMENSIONS = 22

# Captures the text between the "2 3 4 5 / -1,5 0" marker and the
# "Trainer & Berater-Feedback" heading; each capture is treated as the
# score block of one rater group.
RATER_BLOCK_PATTERN = r"2\s*3\s*4\s*5\s*\n-1,5\s*0([\s\S]*?)\n\nTrainer & Berater-Feedback"
SCORE_PATTERN = r"\d{1,2},\d{2}"
CODE_PATTERN = r"[A-Z]\.[0-9]{1,2}"

# Columns filled from the rater blocks, in the order the blocks are matched.
score_columns = ['Boss_score', 'Colleague_score', 'Colleague_other_score', 'Report_score', 'Customer_score']
# Columns that enter the self-score computation (summed in this order).
feature_cols = ['Boss_score', 'Colleague_score', 'Report_score', 'Customer_score', 'Colleague_other_score']


def _parse_rater_blocks(text):
    """Return one list of score strings per rater block found in *text*."""
    blocks = []
    for raw_block in re.findall(RATER_BLOCK_PATTERN, text):
        # Decimal commas become decimal points so the values parse as floats.
        normalized = raw_block.replace(",", ".")
        values = [line.strip() for line in normalized.split("\n") if line.strip()]
        if len(values) == EXPECTED_DIMENSIONS:
            blocks.append(values)
        else:
            # Otherwise only every second non-empty line (starting with the
            # second one) is kept as a score.
            blocks.append(values[1::2])
    return blocks


def _fit_to_length(values, length):
    """Truncate or right-pad *values* with None to exactly *length* items."""
    fitted = list(values[:length])
    return fitted + [None] * (length - len(fitted))


def calculate_self_score(row):
    """Derive the self-assessment score for one DataFrame row.

    With ``n`` non-missing rater scores and ``avg = All_raters_Score`` the
    result is ``avg * n - (sum(rater scores) - avg)``, which equals
    ``avg * (n + 1) - sum(rater scores)``.
    NOTE(review): this presumably assumes All_raters_Score is the mean over
    the self score plus the n rater groups -- confirm against the report.

    Returns NaN when fewer than two rater scores are available.
    """
    valid_features = [val for val in row[feature_cols] if pd.notna(val)]
    num_features = len(valid_features)
    if num_features > 1:
        sum_features = sum(valid_features) - row['All_raters_Score']
        return (row['All_raters_Score'] * num_features) - sum_features
    return np.nan


st.title("PDF Data Extractor")
uploaded_file = st.file_uploader("Upload a PDF File", type="pdf")

if uploaded_file is not None:
    # Step 1: read the PDF text. A unique temporary file is used (instead of
    # a fixed "temp.pdf") so concurrent sessions cannot overwrite each
    # other's upload, and the file is removed once the text is extracted.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(uploaded_file.getbuffer())
        tmp_path = tmp_file.name
    try:
        pdf_text = extract_pdf_text(tmp_path)
    finally:
        os.remove(tmp_path)

    # Step 2: Extract relevant information from the text using regex
    rater_blocks = _parse_rater_blocks(pdf_text)

    # Define the original data structure: one row per rated dimension.
    original_data = {
        'Title': (
            ["Personal Competence"] * 13
            + ["Business Competence"] * 5
            + ["Management Competence"] * 4
        ),
        'Code': ["P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10", "P11", "P12",
                 "P13", "B1", "B2", "B3", "B4", "B5", "M1", "M2", "M3", "M4"],
        'Dimensions': [
            "Integrity/ Reliability", "Appearance", "Enthusiasm/Passion", "Learning Motivation/ Self-Development", "Ability to Adapt/Flexibility",
            "Communication/Information", "Cooperation/ Team spirit", "Handling of Complex Situations", "Coolness/Handling of Unclear Situations",
            "Self-reliance/Initiative", "Conflict Management", "Ability to Assert Oneself/ Negotiation Skills", "Tact and Sensitivity",
            "Quality Orientation", "Client Orientation", "Specialized Knowledge", "Methodology/ Didactics/ Language", "Creativity/ Conceptional Skills",
            "Project Management", "Result Orientation", "Leadership Skills", "Coach and Advisor",
        ],
    }
    df = pd.DataFrame(original_data)

    # Add extracted scores to the DataFrame. Missing blocks become empty
    # columns and over-long blocks are truncated, so a PDF with an
    # unexpected layout no longer raises IndexError / ValueError here.
    if len(rater_blocks) < len(score_columns):
        st.warning(
            f"Found {len(rater_blocks)} of {len(score_columns)} expected rater "
            "score blocks; the missing columns are left empty."
        )
    for idx, col in enumerate(score_columns):
        block_values = rater_blocks[idx] if idx < len(rater_blocks) else []
        df[col] = _fit_to_length(block_values, len(df))

    # Overall ("all raters") scores and the dimension codes they belong to.
    all_scores = re.findall(SCORE_PATTERN, pdf_text)
    all_codes = re.findall(CODE_PATTERN, pdf_text)
    scores = [float(score.replace(",", ".")) for score in all_scores]
    codes = [code.strip() for code in all_codes]
    # When at least 44 codes are present, the 23rd-44th occurrences are used.
    # NOTE(review): presumably the first 22 belong to another section of the
    # report -- confirm against a sample PDF.
    if len(codes) >= 2 * EXPECTED_DIMENSIONS:
        codes = codes[EXPECTED_DIMENSIONS:2 * EXPECTED_DIMENSIONS]
    scores = scores[:EXPECTED_DIMENSIONS]

    # Codes and scores are paired by position; unequal lengths would make
    # pd.DataFrame raise, so report the problem and end this run instead.
    if len(codes) != len(scores):
        st.error(
            "Could not pair dimension codes with overall scores: found "
            f"{len(codes)} codes and {len(scores)} scores."
        )
        st.stop()

    df1 = pd.DataFrame({'Code': [code.replace('.', '') for code in codes], 'All_raters_Score': scores})
    df_combined = pd.merge(df, df1, on="Code", how="inner")

    # Non-numeric cells become NaN rather than aborting the conversion;
    # calculate_self_score already skips missing values.
    df_combined[feature_cols] = (
        df_combined[feature_cols].apply(pd.to_numeric, errors="coerce").astype(float)
    )
    df_combined['Self_score'] = df_combined.apply(calculate_self_score, axis=1)

    # Display the resultant DataFrame
    st.write("### Extracted Dataset")
    st.dataframe(df_combined)