"""Streamlit app that converts a delivery-schedule PDF into Excel files.

The extracted part data is joined with an item-code mapping workbook that is
stored in (and synchronised with) a Google Drive folder.
"""
import ast
import os

import pandas as pd
import PyPDF2
import streamlit as st
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload

# OAuth client configuration, read from environment variables at import time.
config = {
    'installed': {
        'client_id': os.environ.get("client_id"),
        'project_id': os.environ.get("project_id"),
        'auth_uri': os.environ.get("auth_uri"),
        'token_uri': os.environ.get("token_uri"),
        'auth_provider_x509_cert_url': os.environ.get("auth_provider_x509_cert_url"),
        'client_secret': os.environ.get("client_secret"),
        'redirect_uris': ['http://localhost'],
    }
}
SCOPES = ['https://www.googleapis.com/auth/drive']

# Name of the mapping workbook, both on local disk and in Google Drive.
MAPPING_FILENAME = "Data Mapping with ItemCode.xlsx"

# MIME type of .xlsx files (used for Drive queries, uploads and downloads).
XLSX_MIME_TYPE = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'

# Column headers of the mapping template; also the merge keys used in main().
# NOTE(review): convert_pdf_to_excel() produces its columns in the order
# part no / colour code / part name, so the "name" and "color" headers below
# look swapped relative to the data. They are kept as-is because the mapping
# workbook already stored in Drive uses them - confirm before changing.
MAPPING_KEY_COLUMNS = [
    'Customer Part no as per pdf',
    'Customer Part name as per pdf',
    'Customer Part color as per pdf',
]


def authenticate():
    """Return Google OAuth credentials for the Drive API.

    Credentials cached in ``token.json`` are reused when present. Expired
    credentials that carry a refresh token are refreshed; otherwise the
    interactive local-server login flow is run using ``config``. Whenever the
    cached credentials were missing or invalid, the resulting credentials are
    written back to ``token.json``.
    """
    creds = None
    # Check if token file exists
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json')
    # If no valid credentials available, refresh them or ask the user to login
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_config(config, SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for next run
        with open('token.json', 'w') as token:
            token.write(creds.to_json())
    return creds


def _parse_delivery_table(table_text):
    """Parse the text of one delivery-schedule table into a row of values.

    The text is scanned line by line for marker lines; the values of interest
    are read from the lines following each marker. The returned list holds
    the values in the order the markers were encountered, followed by a list
    of ``[date, quantity]`` pairs (``[["", ""]]`` when none were found).

    Raises:
        IndexError: if a 'Part No.' or 'Part Name' marker is too close to the
            end of the text for the following line(s) to exist.
    """
    lines = table_text.split('\n')
    row = []
    year = ""
    row_start_index = 0
    row_stop_index = 0

    for index, raw_line in enumerate(lines):
        line = raw_line.strip()
        if line == 'Part No.':
            # Next line is the part number; the one after is the colour code
            # unless the 'Part Name' label already follows.
            row.append(lines[index + 1].replace('Part Color Code', ""))
            if 'Part Name' not in lines[index + 2]:
                row.append(lines[index + 2].replace('Part Color Code', ""))
            else:
                row.append("")
        if line == 'MORIROKU TECHNOLOGY':
            # The year is the second '/'-separated piece of the first
            # space-separated token on the following line.
            try:
                year = lines[index + 1].split(' ')[0].split('/')[1]
            except IndexError as e:
                print(e)
                year = ""
        if 'Part Name' in line:
            row.append(lines[index + 1])
        if line == 'ADJ':
            row_start_index = index + 1
        if line == 'Total':
            row_stop_index = index

    # Schedule rows live between the 'ADJ' header and the 'Total' line.
    date_qty = []
    if row_start_index > 0 and row_stop_index > 0:
        for index in range(row_start_index, row_stop_index):
            cell = lines[index].strip()
            if '/' in cell:
                # Date = last five characters of the line plus the year;
                # the quantity is on the next line.
                date_qty.append([cell[-5:].strip() + "/" + year, lines[index + 1].strip()])
    if not date_qty:
        date_qty = [["", ""]]
    row.append(date_qty)
    return row


def convert_pdf_to_excel(pdf_file):
    """Extract delivery-schedule tables from a PDF into two Excel workbooks.

    Args:
        pdf_file: path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        A tuple ``(extracted_file, data_for_mapping)`` of the file names
        written: "Data Extracted.xlsx" with one row per table (including the
        'Date Qty' list) and "Data Mapping.xlsx" with the de-duplicated part
        identification columns.

    Raises:
        ValueError: if no table text could be extracted from the PDF.
    """
    # The reader is created once; it does not need rebuilding for every page.
    reader = PyPDF2.PdfReader(pdf_file)
    whole_data = []
    for page in reader.pages:
        page_content = page.extract_text() or ""
        for each_table in page_content.split('Delivery Schedule Sheet'):
            if each_table:
                whole_data.append(_parse_delivery_table(each_table))

    if not whole_data:
        # Previously surfaced as an opaque pandas "Length mismatch" ValueError.
        raise ValueError("No delivery schedule data could be extracted from the PDF")

    whole_data = pd.DataFrame(whole_data)
    whole_data.columns = ["Part No.", "Part Color Code", "Part Name", 'Date Qty']

    extracted_file = "Data Extracted.xlsx"
    data_for_mapping = "Data Mapping.xlsx"

    extracted_data_for_mapping = whole_data.drop('Date Qty', axis=1)
    extracted_data_for_mapping = extracted_data_for_mapping.drop_duplicates(
        subset=["Part No.", "Part Color Code", "Part Name"])
    extracted_data_for_mapping.columns = list(MAPPING_KEY_COLUMNS)

    whole_data.to_excel(extracted_file, index=False)
    extracted_data_for_mapping.to_excel(data_for_mapping, index=False)
    return extracted_file, data_for_mapping


def map_data_to_template(excel_file, mapping_file):
    """Join the extracted schedule with the item-code mapping.

    The mapping workbook is also re-saved locally as ``MAPPING_FILENAME`` and
    uploaded to Google Drive.

    Args:
        excel_file: workbook written by ``convert_pdf_to_excel`` (contains a
            'Date Qty' column holding the text form of a list of pairs).
        mapping_file: mapping workbook (path or file-like) with at least the
            'Customer Part no as per pdf' and 'Item Code' columns.

    Returns:
        DataFrame with columns 'Item Code', 'SchDate', 'Qty' and 'SOType'
        (always "R"), one row per scheduled date.
    """
    # Load Excel file and mapping file
    extracted_data = pd.read_excel(excel_file)
    mapping_data = pd.read_excel(mapping_file)

    # Persist the mapping in use so the next run can pull it from Drive.
    mapping_data.to_excel(MAPPING_FILENAME, index=False)
    save_mapping_file_to_drive()

    mapping_data = mapping_data.rename(columns={'Customer Part no as per pdf': 'Part No.'})

    # 'Date Qty' round-trips through Excel as text; literal_eval restores the
    # list of [date, qty] pairs, which is then expanded to one row per pair.
    extracted_data['Date Qty'] = extracted_data['Date Qty'].apply(ast.literal_eval)
    extracted_data = extracted_data.explode('Date Qty')
    extracted_data[['SchDate', 'Qty']] = pd.DataFrame(
        extracted_data['Date Qty'].to_list(), index=extracted_data.index)
    extracted_data = extracted_data.drop('Date Qty', axis=1)
    extracted_data = extracted_data[~extracted_data['SchDate'].isna()]

    mapped_data = extracted_data.merge(
        mapping_data, on=['Part No.'], how='outer')[['Item Code', 'SchDate', 'Qty']]
    mapped_data['SOType'] = "R"
    # Drop mapping rows that matched no extracted part (they have no date).
    mapped_data = mapped_data[~mapped_data["SchDate"].isna()]
    return mapped_data


def _build_drive_service():
    """Authenticate and return a Google Drive v3 API client."""
    creds = authenticate()
    return build('drive', 'v3', credentials=creds)


def _find_mapping_files(service, query):
    """Return Drive file entries matching ``query`` named ``MAPPING_FILENAME``."""
    results = service.files().list(q=query, fields="files(id, name)").execute()
    files = results.get('files', [])
    return [entry for entry in files if entry.get('name') == MAPPING_FILENAME]


def save_mapping_file_to_drive():
    """Upload the local mapping workbook to the Drive folder.

    Any existing file named ``MAPPING_FILENAME`` in the folder is deleted
    first, then the local ``MAPPING_FILENAME`` is uploaded in its place.
    """
    service = _build_drive_service()
    folder_id = "1HBRUZePST0D0buyU9MxeYg2vQyEL4wLF"

    # List all Excel files in the folder that carry the mapping file name
    files = _find_mapping_files(
        service, f"'{folder_id}' in parents and mimeType='{XLSX_MIME_TYPE}'")
    if not files:
        print('No Excel Mapping files found in the folder.')
    else:
        for file in files:
            # Delete the existing file
            service.files().delete(fileId=file['id']).execute()

    file_metadata = {'name': MAPPING_FILENAME, 'parents': [folder_id]}
    media = MediaFileUpload(MAPPING_FILENAME, mimetype=XLSX_MIME_TYPE)
    service.files().create(body=file_metadata, media_body=media, fields='id').execute()


def pull_mapping_file_from_drive():
    """Download the mapping workbook from Google Drive, if one exists.

    Returns:
        1 when a file named ``MAPPING_FILENAME`` was found and downloaded into
        the working directory, otherwise 0.
    """
    service = _build_drive_service()
    # NOTE(review): unlike save_mapping_file_to_drive(), this query is not
    # restricted to the upload folder - confirm that is intended.
    files = _find_mapping_files(service, f"mimeType='{XLSX_MIME_TYPE}'")
    if files:
        file_id = files[0]['id']
        file_name = files[0]['name']
        request = service.files().get_media(fileId=file_id)
        # 'with' guarantees the handle is closed even if a chunk download fails.
        with open(file_name, 'wb') as fh:
            downloader = MediaIoBaseDownload(fh, request)
            # Execute the download
            done = False
            while not done:
                status, done = downloader.next_chunk()
        return 1
    print('No Excel files found.')
    return 0


def _offer_excel_download(path):
    """Show a download button for the workbook at ``path``, or an error."""
    if os.path.exists(path):
        with open(path, "rb") as f:
            excel_bytes = f.read()
        st.download_button(
            label="Download Excel file",
            data=excel_bytes,
            file_name=path,
            mime=XLSX_MIME_TYPE,
        )
    else:
        st.error("Error: Converted Excel file not found")


def _count_unmapped_parts(data_for_mapping):
    """Count distinct part numbers in ``data_for_mapping`` absent from the Drive mapping."""
    mapping_data = pd.read_excel(MAPPING_FILENAME)
    extracted_data_for_mapping = pd.read_excel(data_for_mapping)
    unmapped = extracted_data_for_mapping[
        ~extracted_data_for_mapping['Customer Part no as per pdf'].isin(
            mapping_data['Customer Part no as per pdf'])]
    return unmapped['Customer Part no as per pdf'].nunique()


def main():
    """Run the Streamlit UI: upload a PDF, extract it, map it, offer downloads."""
    st.title("PDF to Excel Converter")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    st.write("Uploaded PDF file:", uploaded_file.name)

    # Convert PDF to Excel
    extracted_file, data_for_mapping = convert_pdf_to_excel(uploaded_file)

    file_present = pull_mapping_file_from_drive()
    if file_present:
        # Enrich the mapping template with what Drive already knows.
        mapping_data_from_drive = pd.read_excel(MAPPING_FILENAME)
        extracted_data_for_mapping = pd.read_excel(data_for_mapping)
        extracted_data_for_mapping = extracted_data_for_mapping.merge(
            mapping_data_from_drive, on=list(MAPPING_KEY_COLUMNS), how='outer')
        extracted_data_for_mapping.to_excel(data_for_mapping, index=False)

    # Download button for the mapping template
    _offer_excel_download(data_for_mapping)

    if not file_present:
        st.markdown("## Upload the Data Master file with Item Code mapping")
        mapping_uploaded_file = st.file_uploader(
            "Upload the Data Master file with Item Code mapping", type=["xlsx", "ods"])
    else:
        try:
            unmapped_part_no = _count_unmapped_parts(data_for_mapping)
        except Exception as e:
            # Best effort: if the check itself fails, fall back to the Drive
            # mapping. 'Exception' (not a bare except) so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            print(e)
            unmapped_part_no = 0
        if unmapped_part_no > 0:
            st.markdown(
                "#### There are {} Part No. with No ItemCode present. "
                "Upload a new file after mapping them".format(unmapped_part_no))
            mapping_uploaded_file = st.file_uploader(
                "Upload the Data Master file with Item Code mapping", type=["xlsx", "ods"])
        else:
            st.markdown("#### Using the Mapping file available in Google Drive")
            mapping_uploaded_file = MAPPING_FILENAME

    if mapping_uploaded_file is not None:
        # Perform data mapping
        mapped_data = map_data_to_template(extracted_file, mapping_uploaded_file)

        # Provide a button to download the final Excel file after mapping
        st.markdown("### Final Excel File After Mapping")
        final_excel_file = 'Final Data.xlsx'
        mapped_data.to_excel(final_excel_file, index=False)
        _offer_excel_download(final_excel_file)


if __name__ == "__main__":
    main()