import ast
import base64
import os

import pandas as pd
import PyPDF2
import streamlit as st
from google.oauth2 import service_account
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload

def get_image_as_base64(image_path):
    # Read an image file and return its contents as a base64-encoded string
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode()

# Service account credentials are assembled from environment variables
SERVICE_ACCOUNT_INFO = {
    "type": "service_account",
    "project_id": os.environ.get("project_id"),
    "private_key_id": os.environ.get("private_key_id"),
    "private_key": os.environ.get("private_key"),
    "client_email": os.environ.get("client_email"),
    "client_id": os.environ.get("client_id"),
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": os.environ.get("token_uri"),
    "auth_provider_x509_cert_url": os.environ.get("auth_provider_x509_cert_url"),
    "client_x509_cert_url": os.environ.get("client_x509_cert_url"),
    "universe_domain": "googleapis.com"
}
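# Expected environment variables (one per field read above): project_id,
# private_key_id, private_key, client_email, client_id, token_uri,
# auth_provider_x509_cert_url, client_x509_cert_url.
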
SCOPES = ['https://www.googleapis.com/auth/drive']


def authenticate():
    # Authenticate using the service account credentials defined above
    creds = service_account.Credentials.from_service_account_info(
        SERVICE_ACCOUNT_INFO, scopes=SCOPES)
    return creds

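# Note: a service account has its own Drive storage, so the Drive helpers
# below only see files the account owns or that have been shared with it.
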
# Legacy credential flows, kept for reference (the app now uses the service
# account defined above):
# creds = service_account.Credentials.from_service_account_file(
#     SERVICE_ACCOUNT_FILE, scopes=['https://www.googleapis.com/auth/drive'])
#
# Check if a cached user token exists:
# if os.path.exists('token.json'):
#     creds = Credentials.from_authorized_user_file('token.json')
# If no valid credentials are available, ask the user to log in:
# if not creds or not creds.valid:
#     if creds and creds.expired and creds.refresh_token:
#         creds.refresh(Request())
#     else:
#         # flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
#         flow = InstalledAppFlow.from_client_config(config, SCOPES)
#         creds = flow.run_local_server(port=0)
#     # Save the credentials for the next run
#     with open('token.json', 'w') as token:
#         token.write(creds.to_json())

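# Name of the master mapping workbook that is kept on Google Drive and
# reused across runs.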
MAPPING_FILENAME = "Data Mapping with ItemCode.xlsx"

def convert_pdf_to_excel(pdf_file):
    inputpdf = PyPDF2.PdfReader(pdf_file)
    pages_no = len(inputpdf.pages)
    whole_data = []
    for i in range(pages_no):
        pageObj = inputpdf.pages[i]
        page_content = pageObj.extract_text()
        # A single page can contain several "Delivery Schedule Sheet" tables
        for each_table in [t for t in page_content.split('Delivery Schedule Sheet') if t]:
            data = each_table.split('\n')
            each_table_data = []
            date_qty = []
            row_start_index = 0
            row_stop_index = 0
            year = ""
            for index in range(len(data)):
                if data[index].strip() == 'Part No.':
                    each_table_data.append(data[index + 1].replace('Part Color Code', ""))
                    if 'Part Name' not in data[index + 2]:
                        each_table_data.append(data[index + 2].replace('Part Color Code', ""))
                    else:
                        each_table_data.append("")
                if data[index].strip() == 'MORIROKU TECHNOLOGY':
                    try:
                        year = data[index + 1].split(' ')[0].split('/')[1]
                    except Exception as e:
                        print(e)
                        year = ""
                if 'Part Name' in data[index].strip():
                    each_table_data.append(data[index + 1].replace("Sched", ""))
                if 'Inventory Category' in data[index].strip():
                    each_table_data.append(data[index + 1].replace('Receive Type', ""))
                if data[index].strip() == 'ADJ':
                    row_start_index = index + 1
                if data[index].strip() == 'Total':
                    row_stop_index = index
            # Collect the date/quantity rows that sit between 'ADJ' and 'Total'
            if row_start_index > 0 and row_stop_index > 0:
                for row in range(row_start_index, row_stop_index):
                    if '/' in data[row].strip():
                        date_qty.append([data[row].strip()[-5:].strip() + "/" + year,
                                         data[row + 1].strip()])
            if not date_qty:
                date_qty = [["", ""]]
            each_table_data.append(date_qty)
            whole_data.append(each_table_data)
    whole_data = pd.DataFrame(whole_data)
    whole_data.columns = ["Part No.", "Part Color Code", "Part Name", 'Inventory Category', 'Date Qty']
    extracted_file = "Data Extracted.xlsx"
    data_for_mapping = "Data Mapping.xlsx"
    # Deduplicated part list that the user later maps to Item Codes
    extracted_data_for_mapping = whole_data.drop(['Inventory Category', 'Date Qty'], axis=1)
    extracted_data_for_mapping = extracted_data_for_mapping.drop_duplicates(subset=["Part No.", "Part Color Code", "Part Name"])
    extracted_data_for_mapping.columns = ['Customer Part no as per pdf', 'Customer Part color as per pdf', 'Customer Part name as per pdf']
    extracted_data_for_mapping['Item Code'] = ""
    whole_data.to_excel(extracted_file, index=False)
    extracted_data_for_mapping.to_excel(data_for_mapping, index=False)
    return extracted_file, data_for_mapping

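# Illustrative shape of one extracted row (values hypothetical) before it is
# written to "Data Extracted.xlsx":
#   ["<Part No.>", "<Part Color Code>", "<Part Name>", "<Inventory Category>",
#    [["<MM/DD>/<year>", "<qty>"], ...]]
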
def map_data_to_template(excel_file, mapping_file):
    # Load the extracted data and the Item Code mapping file
    extracted_data = pd.read_excel(excel_file)
    mapping_data = pd.read_excel(mapping_file)
    mapping_data = mapping_data.drop_duplicates(subset=['Customer Part no as per pdf', 'Customer Part name as per pdf', 'Customer Part color as per pdf', 'Item Code'])
    # Persist the deduplicated mapping locally and on Google Drive for reuse
    mapping_data.to_excel(MAPPING_FILENAME, index=False)
    save_mapping_file_to_drive()
    mapping_data = mapping_data.rename(columns={'Customer Part no as per pdf': 'Part No.', 'Customer Part name as per pdf': 'Part Name', 'Customer Part color as per pdf': 'Part Color Code'})
    # Expand the stored list of [date, qty] pairs into one row per schedule date
    extracted_data['Date Qty'] = extracted_data['Date Qty'].apply(lambda x: ast.literal_eval(x))
    extracted_data = extracted_data.explode('Date Qty')
    extracted_data[['SchDate', 'Qty']] = pd.DataFrame(extracted_data['Date Qty'].to_list(), index=extracted_data.index)
    extracted_data = extracted_data.drop('Date Qty', axis=1)
    extracted_data = extracted_data[~extracted_data['SchDate'].isna()]
    # Attach the Item Code to every schedule row
    mapped_data = extracted_data.merge(mapping_data, on=["Part No.", "Part Name", "Part Color Code"], how='outer')[['Item Code', 'SchDate', 'Qty', 'Inventory Category']]
    mapped_data = mapped_data[~mapped_data["SchDate"].isna()]
    mapped_data = mapped_data[~mapped_data["SchDate"].str.strip().isin(["", None])]
    mapped_data['SOType'] = "R"
    mapped_data['SchDate'] = mapped_data['SchDate'].astype("str")
    return mapped_data[["SchDate", "SOType", "Item Code", "Qty", "Inventory Category"]]

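# The three helpers below keep MAPPING_FILENAME in sync with Google Drive:
# save_mapping_file_to_drive() replaces the copy on Drive with the local file,
# pull_mapping_file_from_drive() downloads it (returning 1 if found, else 0),
# and delete_master_file() removes it from Drive.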
def save_mapping_file_to_drive():
    creds = authenticate()
    service = build('drive', 'v3', credentials=creds)
    # Look for existing copies of the mapping workbook on Drive
    results = service.files().list(
        q="mimeType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'",
        fields="files(id, name)").execute()
    files = results.get('files', [])
    files = [f for f in files if f.get('name') == MAPPING_FILENAME]
    if not files:
        print('No Excel mapping file found on Drive.')
    else:
        # Delete every existing copy before uploading the fresh one
        for file in files:
            service.files().delete(fileId=file['id']).execute()
    # Upload the local mapping workbook
    file_metadata = {'name': MAPPING_FILENAME}
    media = MediaFileUpload(MAPPING_FILENAME, mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
    service.files().create(body=file_metadata, media_body=media, fields='id').execute()

def pull_mapping_file_from_drive():
    creds = authenticate()
    service = build('drive', 'v3', credentials=creds)
    results = service.files().list(
        q="mimeType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'",
        fields="files(id, name)").execute()
    files = results.get('files', [])
    files = [f for f in files if f.get('name') == MAPPING_FILENAME]
    if files:
        file_id = files[0]['id']
        file_name = files[0]['name']
        # Download the workbook into the local working directory
        request = service.files().get_media(fileId=file_id)
        with open(file_name, 'wb') as fh:
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while not done:
                status, done = downloader.next_chunk()
        return 1
    return 0

def delete_master_file():
    creds = authenticate()
    service = build('drive', 'v3', credentials=creds)
    # Target Drive folder (currently unused; the query below is not folder-scoped)
    folder_id = "1HBRUZePST0D0buyU9MxeYg2vQyEL4wLF"
    # List all spreadsheet files visible to the service account
    results = service.files().list(
        q="mimeType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'",
        fields="files(id, name)").execute()
    files = results.get('files', [])
    files = [f for f in files if f.get('name') == MAPPING_FILENAME]
    if not files:
        print('No Excel mapping file found on Drive.')
    else:
        # Delete every copy of the mapping workbook
        for file in files:
            service.files().delete(fileId=file['id']).execute()
        print("Deleted master file")

def main():
    # Render the logo centred at the top of the page
    logo_path = "logo.jpeg"
    logo_base64 = get_image_as_base64(logo_path)
    logo_html = f"""
    <div style="display: flex; justify-content: center; align-items: center; height: 100px;">
        <img src="data:image/jpeg;base64,{logo_base64}" style="width: 100px; height: 100px;">
    </div>
    """
    st.markdown(logo_html, unsafe_allow_html=True)
    st.markdown("<h1 style='text-align: center;'>PDF to Excel Converter</h1>", unsafe_allow_html=True)

    # STEP 1: upload the PDF and extract its tables
    st.markdown("### STEP 1")
    st.markdown("#### Upload a PDF File")
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is not None:
        st.write("Uploaded PDF file:", uploaded_file.name)
        # Convert the PDF to Excel
        extracted_file, data_for_mapping = convert_pdf_to_excel(uploaded_file)
        file_present = pull_mapping_file_from_drive()
        if file_present:
            try:
                mapping_data_from_drive = pd.read_excel(MAPPING_FILENAME)
                extracted_data_for_mapping = pd.read_excel(data_for_mapping)
                extracted_data_for_mapping.columns = [c.strip() for c in extracted_data_for_mapping.columns]
                # Drop any stray 'Inventory Category' column and normalise the headers
                mapping_data_from_drive = mapping_data_from_drive[
                    [c for c in mapping_data_from_drive.columns if "inventory category" not in c.lower()]]
                mapping_data_from_drive.columns = [c.strip() for c in mapping_data_from_drive.columns]
                mapping_data_from_drive = mapping_data_from_drive.drop_duplicates(
                    subset=['Customer Part no as per pdf', 'Customer Part name as per pdf', 'Customer Part color as per pdf', 'Item Code'])
                # Carry known Item Codes over into the freshly extracted mapping sheet
                extracted_data_for_mapping = extracted_data_for_mapping[
                    ['Customer Part no as per pdf', 'Customer Part name as per pdf', 'Customer Part color as per pdf']].merge(
                    mapping_data_from_drive,
                    on=['Customer Part no as per pdf', 'Customer Part name as per pdf', 'Customer Part color as per pdf'],
                    how='outer')
                extracted_data_for_mapping.to_excel(data_for_mapping, index=False)
            except Exception as e:
                st.error("Error in the Mapping Master file on Cloud. " + str(e))
                st.error("Please reupload the Data Master file with Item Code mapping")
                delete_master_file()
                file_present = None
        # Offer the extracted mapping sheet for download
        if os.path.exists(data_for_mapping):
            with open(data_for_mapping, "rb") as f:
                excel_bytes = f.read()
            st.download_button(
                label="Download Excel file",
                data=excel_bytes,
                file_name=data_for_mapping,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        else:
            st.error("Error: Converted Excel file not found")
        st.markdown("##### Click the button below if you want to upload a new mapping file")
        if st.button("Delete Mapping file in Cloud", key="delete"):
            delete_master_file()
            file_present = pull_mapping_file_from_drive()

        # STEP 2: resolve the Item Code mapping
        st.markdown("### STEP 2")
        mapping_uploaded_file = None
        if not file_present:
            st.markdown("#### Upload the Data Master file with Item Code mapping")
            mapping_uploaded_file = st.file_uploader("Upload the Data Master file with Item Code mapping", type=["xlsx", "ods"])
        else:
            mapping_data = pd.read_excel(MAPPING_FILENAME)
            data_for_mapping = "Data Mapping.xlsx"
            extracted_data_for_mapping = pd.read_excel(data_for_mapping)
            if 'Item Code' not in extracted_data_for_mapping.columns:
                extracted_data_for_mapping['Item Code'] = ""
            # Count the part numbers that still have no Item Code
            extracted_data_for_mapping = extracted_data_for_mapping[extracted_data_for_mapping['Item Code'].isna()]
            unmapped_part_no = extracted_data_for_mapping['Customer Part no as per pdf'].nunique()
            if unmapped_part_no > 0:
                st.markdown("There are {} Part No. with no Item Code present. Upload a new file after mapping them.".format(unmapped_part_no))
                st.markdown("Do you want to skip this or upload a new Mapping File?")
                if 'button_pressed' not in st.session_state:
                    st.session_state.button_pressed = None
                if st.session_state.button_pressed is None:
                    if st.button("Skip"):
                        st.session_state.button_pressed = "Skip"
                    if st.button("Upload a new Master Mapping"):
                        st.session_state.button_pressed = "Upload a new Master Mapping"
                if st.session_state.button_pressed is not None:
                    if st.session_state.button_pressed == "Skip":
                        # Keep using the mapping file pulled from Google Drive
                        mapping_uploaded_file = MAPPING_FILENAME
                    elif st.session_state.button_pressed == "Upload a new Master Mapping":
                        mapping_uploaded_file = st.file_uploader("Upload the Data Master file with Item Code mapping", type=["xlsx", "ods"])
            else:
                st.markdown("All Part No. are mapped with an Item Code, so the mapping file available on Google Drive is used.")
                mapping_uploaded_file = MAPPING_FILENAME

        if mapping_uploaded_file is not None:
            # Perform the data mapping and offer the final file for download
            mapped_data = map_data_to_template(extracted_file, mapping_uploaded_file)
            st.markdown("### FINAL DOWNLOAD")
            st.markdown("Final Excel File After Mapping")
            final_excel_file = 'Final Data.xlsx'
            mapped_data.to_excel(final_excel_file, index=False, engine='openpyxl')
            if os.path.exists(final_excel_file):
                with open(final_excel_file, "rb") as f:
                    excel_bytes = f.read()
                st.download_button(
                    label="Download Excel file",
                    data=excel_bytes,
                    file_name=final_excel_file,
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )
            else:
                st.error("Error: Converted Excel file not found")
            st.session_state.button_pressed = None


if __name__ == "__main__":
    main()
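
# To run the app locally (assuming this script is saved as app.py):
#     streamlit run app.py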