Spaces:

hardik27
/

dataextraction

Running

App Files Files Community

hardik27 committed on Apr 5, 2024

Commit

3ad62d6

verified ·

1 Parent(s): 27a08f1

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -11

app.py CHANGED Viewed

@@ -1,12 +1,11 @@
 import PyPDF2
 import pandas as pd
 import os
 import streamlit as st
 import pandas as pd
 def convert_pdf_to_excel(pdf_file):
-    # Use tabula to extract tables from PDF
     inputpdf = PyPDF2.PdfReader(pdf_file)
     pages_no = len(inputpdf.pages)
     whole_data = []
@@ -19,6 +18,9 @@ def convert_pdf_to_excel(pdf_file):
         for each_table in [i for i in page_content.split('Delivery Schedule Sheet') if i]:
             data = each_table.split('\n')
             each_table_data = []
             for index in range(len(data)):
                 if data[index].strip() == 'Part No.':
                     each_table_data.append(data[index+1].replace('Part Color Code',""))
@@ -29,14 +31,46 @@ def convert_pdf_to_excel(pdf_file):
                 if 'Part Name' in data[index].strip():
                     each_table_data.append(data[index+1])
             whole_data.append(each_table_data)
     whole_data = pd.DataFrame(whole_data)
-    whole_data.columns = ["Part No.","Part Color Code","Part Name"]
-    excel_file = pdf_file.name.replace('.pdf', '.xlsx')
-    whole_data.to_excel(excel_file, index=False)
-    return excel_file
 def main():
     st.title("PDF to Excel Converter")
@@ -48,22 +82,54 @@ def main():
         st.write("Uploaded PDF file:", uploaded_file.name)
         # Convert PDF to Excel
-        excel_file = convert_pdf_to_excel(uploaded_file)
         # Download link for the Excel file
         # st.markdown(f"Download the extracted data in Excel file [here](/{excel_file})")
-        if os.path.exists(excel_file):
-            with open(excel_file, "rb") as f:
                 excel_bytes = f.read()
             st.download_button(
                 label="Download Excel file",
                 data=excel_bytes,
-                file_name=excel_file,
                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             )
         else:
             st.error("Error: Converted Excel file not found")
 if __name__ == "__main__":
     main()

 import PyPDF2
 import pandas as pd
 import os
+import ast
 import streamlit as st
 import pandas as pd
 def convert_pdf_to_excel(pdf_file):
     inputpdf = PyPDF2.PdfReader(pdf_file)
     pages_no = len(inputpdf.pages)
     whole_data = []
         for each_table in [i for i in page_content.split('Delivery Schedule Sheet') if i]:
             data = each_table.split('\n')
             each_table_data = []
+            date_qty = []
+            row_start_index = 0
+            row_stop_index = 0
             for index in range(len(data)):
                 if data[index].strip() == 'Part No.':
                     each_table_data.append(data[index+1].replace('Part Color Code',""))
                 if 'Part Name' in data[index].strip():
                     each_table_data.append(data[index+1])
+                if data[index].strip() == 'ADJ':
+                    row_start_index = index + 1
+                if data[index].strip() == 'Total':
+                    row_stop_index = index
+            if row_start_index>0 and row_stop_index>0:
+                for index in range(row_start_index,row_stop_index):
+                    if '/' in data[index].strip():
+                        date_qty.append([data[index].strip()[-5:].strip(),data[index+1].strip()])
+            if not date_qty:
+                date_qty = [["",""]]
+            each_table_data.append(date_qty)
             whole_data.append(each_table_data)
     whole_data = pd.DataFrame(whole_data)
+    whole_data.columns = ["Part No.","Part Color Code","Part Name",'Date Qty']
+    extracted_file = "Data Extracted.xlsx"
+    data_for_mapping = "Data Mapping.xlsx"
+    extracted_data_for_mapping = whole_data.drop('Date Qty',axis=1)
+    extracted_data_for_mapping = extracted_data_for_mapping.drop_duplicates(subset=["Part No.","Part Color Code","Part Name"])
+    whole_data.to_excel(extracted_file, index=False)
+    extracted_data_for_mapping.to_excel(data_for_mapping, index=False)
+    return extracted_file,data_for_mapping
+def map_data_to_template(excel_file, mapping_file):
+    # Load Excel file and mapping file
+    extracted_data = pd.read_excel(excel_file)
+    mapping_data = pd.read_excel(mapping_file)
+    mapping_data = mapping_data.rename(columns = {'Customer Part no as per pdf':'Part No.'})
+    # Perform mapping
+    extracted_data['Date Qty'] = extracted_data['Date Qty'].apply(lambda x: ast.literal_eval(x))
+    extracted_data = extracted_data.explode('Date Qty')
+    extracted_data[['SchDate','Qty']]= pd.DataFrame(extracted_data['Date Qty'].to_list(), index= extracted_data.index)
+    extracted_data = extracted_data.drop('Date Qty',axis=1)
+    mapped_data = extracted_data.merge(mapping_data, on =['Part No.'])[['Item Code','SchDate','Qty']]
+    return mapped_data
 def main():
     st.title("PDF to Excel Converter")
         st.write("Uploaded PDF file:", uploaded_file.name)
         # Convert PDF to Excel
+        extracted_file,data_for_mapping = convert_pdf_to_excel(uploaded_file)
         # Download link for the Excel file
         # st.markdown(f"Download the extracted data in Excel file [here](/{excel_file})")
+        if os.path.exists(data_for_mapping):
+            with open(data_for_mapping, "rb") as f:
                 excel_bytes = f.read()
             st.download_button(
                 label="Download Excel file",
                 data=excel_bytes,
+                file_name=data_for_mapping,
                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
             )
         else:
             st.error("Error: Converted Excel file not found")
+        st.markdown("## Upload the Data Master file with Item Code mapping")
+        mapping_uploaded_file = st.file_uploader("Upload the Data Master file with Item Code mapping", type=["xlsx","ods"])
+        if mapping_uploaded_file is not None:
+            st.write("Uploaded Mapping Excel file:", mapping_uploaded_file.name)
+            # Perform data mapping
+            mapped_data = map_data_to_template(extracted_file, mapping_uploaded_file)
+            # Provide a link to download the final Excel file after mapping
+            st.markdown("### Final Excel File After Mapping")
+            final_excel_file = 'Final Data.xlsx'
+            mapped_data.to_excel(final_excel_file, index=False)
+            if os.path.exists(final_excel_file):
+                with open(final_excel_file, "rb") as f:
+                    excel_bytes = f.read()
+                st.download_button(
+                    label="Download Excel file",
+                    data=excel_bytes,
+                    file_name=final_excel_file,
+                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                )
+            else:
+                st.error("Error: Converted Excel file not found")
 if __name__ == "__main__":
     main()