Spaces:

hardik27
/

dataextraction

Running

App Files Files Community

hardik27 committed on Apr 16, 2024

Commit

9865c91

verified ·

1 Parent(s): 9d22702

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -27

app.py CHANGED Viewed

@@ -6,18 +6,43 @@ import streamlit as st
 import pandas as pd
 import os
 from google.oauth2.credentials import Credentials
 from googleapiclient.discovery import build
 from googleapiclient.http import MediaIoBaseDownload,MediaFileUpload
 # Load credentials from environment variables
-credentials_dict = {
-    "token": os.environ.get("token"),
-    "refresh_token": os.environ.get("refresh_token"),
-    "token_uri": os.environ.get("token_uri"),
-    "client_id": os.environ.get("client_id"),
-    "client_secret": os.environ.get("client_secret"),
-    "scopes": [os.environ.get("scopes")]
-}
 MAPPING_FILENAME = "Data Mapping with ItemCode.xlsx"
@@ -75,8 +100,11 @@ def convert_pdf_to_excel(pdf_file):
     whole_data.columns = ["Part No.","Part Color Code","Part Name",'Date Qty']
     extracted_file = "Data Extracted.xlsx"
     data_for_mapping = "Data Mapping.xlsx"
     extracted_data_for_mapping = whole_data.drop('Date Qty',axis=1)
     extracted_data_for_mapping = extracted_data_for_mapping.drop_duplicates(subset=["Part No.","Part Color Code","Part Name"])
     whole_data.to_excel(extracted_file, index=False)
     extracted_data_for_mapping.to_excel(data_for_mapping, index=False)
     return extracted_file,data_for_mapping
@@ -85,7 +113,7 @@ def map_data_to_template(excel_file, mapping_file):
     # Load Excel file and mapping file
     extracted_data = pd.read_excel(excel_file)
     mapping_data = pd.read_excel(mapping_file)
-    mapping_data.to_excel(MAPPING_FILENAME)
     save_mapping_file_to_drive()
     mapping_data = mapping_data.rename(columns = {'Customer Part no as per pdf':'Part No.'})
@@ -102,7 +130,9 @@ def map_data_to_template(excel_file, mapping_file):
     return mapped_data
 def save_mapping_file_to_drive():
-    creds = Credentials.from_authorized_user_info(credentials_dict)
     # Authenticate with Google Drive API
     service = build('drive', 'v3', credentials=creds)
     folder_id = "1HBRUZePST0D0buyU9MxeYg2vQyEL4wLF"
@@ -112,7 +142,7 @@ def save_mapping_file_to_drive():
         q=f"'{folder_id}' in parents and mimeType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'",
         fields="files(id, name)").execute()
     files = results.get('files', [])
-    files = [i for i in files if i.get('name')=='Data Mapping with ItemCode.xlsx']
     if not files:
         print('No Excel Mapping files found in the folder.')
@@ -130,7 +160,9 @@ def save_mapping_file_to_drive():
     service.files().create(body=file_metadata, media_body=media, fields='id').execute()
 def pull_mapping_file_from_drive():
-    creds = Credentials.from_authorized_user_info(credentials_dict)
     # Authenticate with Google Drive API
     service = build('drive', 'v3', credentials=creds)
@@ -138,7 +170,7 @@ def pull_mapping_file_from_drive():
         q="mimeType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'",
         fields="files(id, name)").execute()
     files = results.get('files', [])
-    files = [i for i in files if i.get('name')=='Data Mapping with ItemCode.xlsx']
     if files:
         file_id = files[0]['id']
         file_name = files[0]['name']
@@ -153,8 +185,9 @@ def pull_mapping_file_from_drive():
         fh.close()
         return 1
-    return 0
 def main():
     st.title("PDF to Excel Converter")
@@ -168,6 +201,14 @@ def main():
         # Convert PDF to Excel
         extracted_file,data_for_mapping = convert_pdf_to_excel(uploaded_file)
         # Download link for the Excel file
         # st.markdown(f"Download the extracted data in Excel file [here](/{excel_file})")
@@ -183,23 +224,26 @@ def main():
         else:
             st.error("Error: Converted Excel file not found")
-        file_present = pull_mapping_file_from_drive()
-        if not os.path.exists("Data Mapping with ItemCode.xlsx"):
             st.markdown("## Upload the Data Master file with Item Code mapping")
             mapping_uploaded_file = st.file_uploader("Upload the Data Master file with Item Code mapping", type=["xlsx","ods"])
         else:
-            mapping_data = pd.read_excel("Data Mapping with ItemCode.xlsx")
-            mapping_data = mapping_data.rename(columns = {'Customer Part no as per pdf':'Part No.'})
-            data_for_mapping = "Data Mapping.xlsx"
-            extracted_data_for_mapping = pd.read_excel(data_for_mapping)
-            extracted_data_for_mapping = extracted_data_for_mapping[~extracted_data_for_mapping['Part No.'].isin(mapping_data['Part No.'])]
-            unmapped_part_no = extracted_data_for_mapping['Part No.'].nunique()
-            if unmapped_part_no>0:
-                st.markdown("#### There are {} Part No. with No ItemCode present. Upload a new file after mapping them".format(unmapped_part_no))
-                mapping_uploaded_file = st.file_uploader("Upload the Data Master file with Item Code mapping", type=["xlsx","ods"])
-            else:
                 st.markdown("#### Using the Mapping file available in Google Drive")
-                mapping_uploaded_file =  "Data Mapping with ItemCode.xlsx"
         if mapping_uploaded_file is not None:
             # st.write("Uploaded Mapping Excel file:", mapping_uploaded_file.name)

 import pandas as pd
 import os
 from google.oauth2.credentials import Credentials
+from google.auth.transport.requests import Request
+from google_auth_oauthlib.flow import InstalledAppFlow
 from googleapiclient.discovery import build
 from googleapiclient.http import MediaIoBaseDownload,MediaFileUpload
 # Load credentials from environment variables
+config = {'installed': {'client_id': os.environ.get("client_id"),
+  'project_id': os.environ.get("project_id"),
+  'auth_uri': os.environ.get("auth_uri"),
+  'token_uri': os.environ.get("token_uri"),
+  'auth_provider_x509_cert_url': os.environ.get("auth_provider_x509_cert_url"),
+  'client_secret': os.environ.get("client_secret"),
+  'redirect_uris': ['http://localhost']}}
+SCOPES = ['https://www.googleapis.com/auth/drive']
+def authenticate():
+    """Return Google Drive credentials, reusing the cached token when possible.
+
+    Loads previously saved credentials from ``token.json`` if that file
+    exists. Expired credentials that carry a refresh token are refreshed;
+    otherwise an interactive OAuth flow built from ``config`` is run via a
+    local server. Credentials that were refreshed or newly obtained are
+    written back to ``token.json``.
+
+    Returns:
+        The credentials object to pass to ``build(..., credentials=...)``.
+    """
+    token_path = 'token.json'
+    creds = None
+    if os.path.exists(token_path):
+        # Pass SCOPES so the cached token is bound to the same scopes that
+        # the interactive flow below requests.
+        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
+    # Cached credentials are still usable: nothing to refresh or persist.
+    if creds and creds.valid:
+        return creds
+    if creds and creds.expired and creds.refresh_token:
+        creds.refresh(Request())
+    else:
+        # NOTE(review): run_local_server expects an interactive browser
+        # login on this machine - confirm this works where the app runs.
+        flow = InstalledAppFlow.from_client_config(config, SCOPES)
+        creds = flow.run_local_server(port=0)
+    # Save the credentials for the next run.
+    with open(token_path, 'w') as token_file:
+        token_file.write(creds.to_json())
+    return creds
 MAPPING_FILENAME = "Data Mapping with ItemCode.xlsx"
     whole_data.columns = ["Part No.","Part Color Code","Part Name",'Date Qty']
     extracted_file = "Data Extracted.xlsx"
     data_for_mapping = "Data Mapping.xlsx"
     extracted_data_for_mapping = whole_data.drop('Date Qty',axis=1)
     extracted_data_for_mapping = extracted_data_for_mapping.drop_duplicates(subset=["Part No.","Part Color Code","Part Name"])
+    extracted_data_for_mapping.columns = ['Customer Part no as per pdf','Customer Part name as per pdf','Customer Part color  as per pdf']
     whole_data.to_excel(extracted_file, index=False)
     extracted_data_for_mapping.to_excel(data_for_mapping, index=False)
     return extracted_file,data_for_mapping
     # Load Excel file and mapping file
     extracted_data = pd.read_excel(excel_file)
     mapping_data = pd.read_excel(mapping_file)
+    mapping_data.to_excel(MAPPING_FILENAME,index=False)
     save_mapping_file_to_drive()
     mapping_data = mapping_data.rename(columns = {'Customer Part no as per pdf':'Part No.'})
     return mapped_data
 def save_mapping_file_to_drive():
+    # creds = Credentials.from_authorized_user_info(credentials_dict)
+    creds = authenticate()
+    service = build('drive', 'v3', credentials=creds)
     # Authenticate with Google Drive API
     service = build('drive', 'v3', credentials=creds)
     folder_id = "1HBRUZePST0D0buyU9MxeYg2vQyEL4wLF"
         q=f"'{folder_id}' in parents and mimeType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'",
         fields="files(id, name)").execute()
     files = results.get('files', [])
+    files = [i for i in files if i.get('name')==MAPPING_FILENAME]
     if not files:
         print('No Excel Mapping files found in the folder.')
     service.files().create(body=file_metadata, media_body=media, fields='id').execute()
 def pull_mapping_file_from_drive():
+    creds = authenticate()
+    service = build('drive', 'v3', credentials=creds)
+    # creds = Credentials.from_authorized_user_info(credentials_dict)
     # Authenticate with Google Drive API
     service = build('drive', 'v3', credentials=creds)
         q="mimeType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'",
         fields="files(id, name)").execute()
     files = results.get('files', [])
+    files = [i for i in files if i.get('name')==MAPPING_FILENAME]
     if files:
         file_id = files[0]['id']
         file_name = files[0]['name']
         fh.close()
         return 1
+    print('No Excel files found.')
+    return 0
 def main():
     st.title("PDF to Excel Converter")
         # Convert PDF to Excel
         extracted_file,data_for_mapping = convert_pdf_to_excel(uploaded_file)
+        file_present = pull_mapping_file_from_drive()
+        if file_present:
+            mapping_data_from_drive = pd.read_excel(MAPPING_FILENAME)
+            extracted_data_for_mapping = pd.read_excel(data_for_mapping)
+            extracted_data_for_mapping = extracted_data_for_mapping.merge(mapping_data_from_drive, on = ['Customer Part no as per pdf','Customer Part name as per pdf','Customer Part color  as per pdf'], how='outer')
+            extracted_data_for_mapping.to_excel(data_for_mapping,index=False)
         # Download link for the Excel file
         # st.markdown(f"Download the extracted data in Excel file [here](/{excel_file})")
         else:
             st.error("Error: Converted Excel file not found")
+        if not file_present:
             st.markdown("## Upload the Data Master file with Item Code mapping")
             mapping_uploaded_file = st.file_uploader("Upload the Data Master file with Item Code mapping", type=["xlsx","ods"])
         else:
+            try:
+                mapping_data = pd.read_excel(MAPPING_FILENAME)
+                # mapping_data = mapping_data.rename(columns = {'Customer Part no as per pdf':'Part No.'})
+                data_for_mapping = "Data Mapping.xlsx"
+                extracted_data_for_mapping = pd.read_excel(data_for_mapping)
+                extracted_data_for_mapping = extracted_data_for_mapping[~extracted_data_for_mapping['Customer Part no as per pdf'].isin(mapping_data['Customer Part no as per pdf'])]
+                unmapped_part_no = extracted_data_for_mapping['Customer Part no as per pdf'].nunique()
+                if unmapped_part_no>0:
+                    st.markdown("#### There are {} Part No. with No ItemCode present. Upload a new file after mapping them".format(unmapped_part_no))
+                    mapping_uploaded_file = st.file_uploader("Upload the Data Master file with Item Code mapping", type=["xlsx","ods"])
+                else:
+                    st.markdown("#### Using the Mapping file available in Google Drive")
+                    mapping_uploaded_file =  MAPPING_FILENAME
+            except:
                 st.markdown("#### Using the Mapping file available in Google Drive")
+                mapping_uploaded_file =  MAPPING_FILENAME
         if mapping_uploaded_file is not None:
             # st.write("Uploaded Mapping Excel file:", mapping_uploaded_file.name)