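# NOTE: "Spaces: Running / Running" appears to be page-status text that was
# captured together with the code; it is not part of the program.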
import PyPDF2 | |
import pandas as pd | |
import os | |
import ast | |
import streamlit as st | |
import pandas as pd | |
def _line_at(lines, position):
    """Return ``lines[position]``, or ``""`` when *position* is past the end."""
    return lines[position] if position < len(lines) else ""


def _parse_schedule_table(table_text):
    """Parse one 'Delivery Schedule Sheet' segment of page text into a row.

    Returns ``[part_no, color_code, part_name, date_qty]``, where *date_qty*
    is a list of ``[date, qty]`` string pairs (``[["", ""]]`` when no schedule
    rows were found).  Returns ``None`` when the segment contains no
    'Part No.' label, so stray text around the tables never becomes a row.
    """
    lines = table_text.split('\n')
    part_no = color_code = part_name = None
    rows_start = rows_stop = 0

    for index, raw_line in enumerate(lines):
        text = raw_line.strip()
        # Only the first occurrence of each label is used so that every row
        # has exactly one value per column, whatever order the labels come in.
        if text == 'Part No.' and part_no is None:
            part_no = _line_at(lines, index + 1).replace('Part Color Code', "")
            candidate = _line_at(lines, index + 2)
            # When the line two below is already the 'Part Name' label there
            # is no colour code for this part.
            if 'Part Name' in candidate:
                color_code = ""
            else:
                color_code = candidate.replace('Part Color Code', "")
        if 'Part Name' in text and part_name is None:
            part_name = _line_at(lines, index + 1)
        # The schedule rows sit between the 'ADJ' line and the 'Total' line;
        # the last occurrence of each marker wins.
        if text == 'ADJ':
            rows_start = index + 1
        if text == 'Total':
            rows_stop = index

    if part_no is None:
        return None

    date_qty = []
    if rows_start > 0 and rows_stop > 0:
        for index in range(rows_start, rows_stop):
            text = lines[index].strip()
            if '/' in text:
                # The date is the last five characters of the line that
                # contains a '/'; the quantity is the line right after it.
                date_qty.append([text[-5:].strip(), lines[index + 1].strip()])
    if not date_qty:
        date_qty = [["", ""]]

    return [part_no, color_code, "" if part_name is None else part_name, date_qty]


def convert_pdf_to_excel(pdf_file):
    """Extract the delivery-schedule tables of a PDF into two Excel workbooks.

    The text of every page is split on the 'Delivery Schedule Sheet' heading
    and each resulting segment is parsed for its part number, colour code,
    part name and list of ``[date, qty]`` pairs.

    Two files are written to the current working directory:

    * ``Data Extracted.xlsx`` - one row per table, columns 'Part No.',
      'Part Color Code', 'Part Name' and 'Date Qty'.
    * ``Data Mapping.xlsx`` - the distinct part number / colour code /
      part name combinations, without the 'Date Qty' column.

    When no table is found both workbooks contain only the header row.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        A tuple ``(extracted_file, data_for_mapping)`` with the two file names.
    """
    # Parse the PDF once; the page objects are reused for every page.
    reader = PyPDF2.PdfReader(pdf_file)

    rows = []
    for page in reader.pages:
        page_text = page.extract_text() or ""
        for segment in page_text.split('Delivery Schedule Sheet'):
            row = _parse_schedule_table(segment)
            if row is not None:
                rows.append(row)

    key_columns = ["Part No.", "Part Color Code", "Part Name"]
    # Passing the column names up front keeps this working for an empty result.
    whole_data = pd.DataFrame(rows, columns=key_columns + ['Date Qty'])

    extracted_file = "Data Extracted.xlsx"
    data_for_mapping = "Data Mapping.xlsx"

    extracted_data_for_mapping = whole_data[key_columns].drop_duplicates()

    # NOTE: the 'Date Qty' cells hold nested lists; the mapping step reads
    # them back from the workbook as text and parses them again.
    whole_data.to_excel(extracted_file, index=False)
    extracted_data_for_mapping.to_excel(data_for_mapping, index=False)
    return extracted_file, data_for_mapping
def _parse_date_qty(cell):
    """Turn a stored 'Date Qty' cell back into a list of ``[date, qty]`` pairs.

    Text cells are parsed with ``ast.literal_eval``.  Blank cells and empty
    lists become the ``[["", ""]]`` placeholder so that every part keeps at
    least one output row.
    """
    if isinstance(cell, str) and cell.strip():
        cell = ast.literal_eval(cell)
    if isinstance(cell, list) and cell:
        return cell
    return [["", ""]]


def _strip_text(value):
    """Strip surrounding whitespace from strings; leave other values as-is."""
    return value.strip() if isinstance(value, str) else value


def map_data_to_template(excel_file, mapping_file):
    """Attach item codes to the extracted delivery schedule.

    Args:
        excel_file: Workbook with the columns 'Part No.' and 'Date Qty',
            where 'Date Qty' holds the text form of a list of
            ``[date, qty]`` pairs.
        mapping_file: Workbook with an 'Item Code' column and the part number
            in either 'Customer Part no as per pdf' or 'Part No.'.

    Returns:
        A DataFrame with the columns 'Item Code', 'SchDate' and 'Qty', one
        row per schedule entry whose part number occurs in the mapping file.

    Raises:
        KeyError: If the mapping file lacks the part-number or 'Item Code'
            column.
    """
    pdf_key = 'Customer Part no as per pdf'

    # Read the join keys as text so a purely numeric part-number column in
    # the master file still matches the text keys extracted from the PDF.
    extracted_data = pd.read_excel(excel_file, dtype={'Part No.': str})
    mapping_data = pd.read_excel(mapping_file, dtype={pdf_key: str, 'Part No.': str})
    mapping_data = mapping_data.rename(columns={pdf_key: 'Part No.'})

    missing = [name for name in ('Part No.', 'Item Code') if name not in mapping_data.columns]
    if missing:
        raise KeyError(f"Mapping file is missing required column(s): {missing}")

    # Surrounding whitespace must not decide whether a part number matches.
    extracted_data['Part No.'] = extracted_data['Part No.'].map(_strip_text)
    mapping_data['Part No.'] = mapping_data['Part No.'].map(_strip_text)

    # One row per [date, qty] pair.  The index is reset because explode()
    # repeats the original row labels.
    extracted_data['Date Qty'] = extracted_data['Date Qty'].map(_parse_date_qty)
    extracted_data = extracted_data.explode('Date Qty').reset_index(drop=True)

    pairs = extracted_data['Date Qty'].tolist()
    extracted_data['SchDate'] = [pair[0] for pair in pairs]
    extracted_data['Qty'] = [pair[1] for pair in pairs]
    extracted_data = extracted_data.drop(columns='Date Qty')

    # Only the key and the item code take part in the merge; identical
    # mapping rows are dropped so they cannot duplicate schedule rows.
    # NOTE(review): the join is on 'Part No.' alone, so a part number listed
    # with several different item codes still yields one row per item code.
    item_codes = mapping_data[['Part No.', 'Item Code']].drop_duplicates()
    mapped_data = extracted_data.merge(item_codes, on='Part No.')
    return mapped_data[['Item Code', 'SchDate', 'Qty']]
def _offer_excel_download(path):
    """Show a download button for the workbook at *path*, or an error if it is absent."""
    if not os.path.exists(path):
        st.error("Error: Converted Excel file not found")
        return
    with open(path, "rb") as handle:
        payload = handle.read()
    st.download_button(
        label="Download Excel file",
        data=payload,
        file_name=path,
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )


def main():
    """Run the Streamlit page: PDF upload, extraction, then item-code mapping.

    After a PDF is uploaded it is converted and the part list workbook is
    offered for download.  Once a master file with item codes is uploaded as
    well, the mapped schedule is written to 'Final Data.xlsx' and offered for
    download.
    """
    st.title("PDF to Excel Converter")

    pdf_upload = st.file_uploader("Upload a PDF file", type=["pdf"])
    if pdf_upload is None:
        return

    st.write("Uploaded PDF file:", pdf_upload.name)

    # Step 1: extract the tables and offer the part list for download.
    extracted_path, mapping_template_path = convert_pdf_to_excel(pdf_upload)
    _offer_excel_download(mapping_template_path)

    # Step 2: ask for the master file that carries the item codes.
    st.markdown("## Upload the Data Master file with Item Code mapping")
    master_upload = st.file_uploader(
        "Upload the Data Master file with Item Code mapping",
        type=["xlsx", "ods"],
    )
    if master_upload is None:
        return

    st.write("Uploaded Mapping Excel file:", master_upload.name)

    # Step 3: map the extracted schedule and offer the result for download.
    final_frame = map_data_to_template(extracted_path, master_upload)
    st.markdown("### Final Excel File After Mapping")
    final_path = 'Final Data.xlsx'
    final_frame.to_excel(final_path, index=False)
    _offer_excel_download(final_path)
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()