Spaces:

hardik27
/

dataextraction

Running

App Files Files Community

dataextraction / app.py

hardik27

Update app.py

3ad62d6 verified about 1 year ago

raw

history blame

5.56 kB

	import PyPDF2
	import pandas as pd
	import os
	import ast
	import streamlit as st
	import pandas as pd

	def _parse_table(lines):
	    """Build one [part_no, color_code, part_name, date_qty] row from a table's lines.

	    Args:
	        lines: The text lines of one chunk that followed a
	            'Delivery Schedule Sheet' marker.

	    Returns:
	        A four-item list, or None when the chunk has no 'Part No.' label
	        (such a chunk cannot produce a usable row).
	    """
	    count = len(lines)

	    def line_at(pos):
	        # A label on the last line(s) of a chunk has no value line after it.
	        return lines[pos] if pos < count else ""

	    part_no = color_code = part_name = None
	    first_row = last_row = 0
	    for pos, raw in enumerate(lines):
	        label = raw.strip()
	        if label == 'Part No.' and part_no is None:
	            part_no = line_at(pos + 1).replace('Part Color Code', "")
	            after = line_at(pos + 2)
	            # If the line two below already carries the 'Part Name' label,
	            # there is no color-code line and the code is left blank.
	            if 'Part Name' in after:
	                color_code = ""
	            else:
	                color_code = after.replace('Part Color Code', "")
	        if 'Part Name' in label and part_name is None:
	            part_name = line_at(pos + 1)
	        # Schedule rows run from the line after 'ADJ' up to the 'Total' line;
	        # the last occurrence of each label wins.
	        if label == 'ADJ':
	            first_row = pos + 1
	        if label == 'Total':
	            last_row = pos

	    if part_no is None:
	        return None

	    date_qty = []
	    if first_row > 0 and last_row > 0:
	        for pos in range(first_row, last_row):
	            entry = lines[pos].strip()
	            if '/' in entry:
	                # Last five characters of the '/' line are kept as the date,
	                # the following line as the quantity.
	                date_qty.append([entry[-5:].strip(), lines[pos + 1].strip()])

	    # Fields are placed by name so a missing label can never shift the
	    # 'Date Qty' list into another column.
	    return [part_no, color_code, part_name or "", date_qty or [["", ""]]]


	def convert_pdf_to_excel(pdf_file):
	    """Extract part and schedule rows from a PDF and save them as two workbooks.

	    The text of every page is split on the marker 'Delivery Schedule Sheet';
	    each non-empty chunk that contains a 'Part No.' label becomes one row.

	    Args:
	        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; ``main`` passes the
	            uploaded file object.

	    Returns:
	        Tuple ``(extracted_file, data_for_mapping)`` with the names of the
	        two files written to the current working directory:
	        'Data Extracted.xlsx' (all rows, including 'Date Qty') and
	        'Data Mapping.xlsx' (unique Part No. / Part Color Code / Part Name).
	    """
	    # Parse the PDF once; the reader gives access to every page.
	    reader = PyPDF2.PdfReader(pdf_file)
	    rows = []
	    for page in reader.pages:
	        page_text = page.extract_text() or ""
	        for chunk in page_text.split('Delivery Schedule Sheet'):
	            if not chunk:
	                continue
	            row = _parse_table(chunk.split('\n'))
	            if row is not None:
	                rows.append(row)

	    key_columns = ["Part No.", "Part Color Code", "Part Name"]
	    # Naming the columns in the constructor also works when no row was found.
	    whole_data = pd.DataFrame(rows, columns=key_columns + ['Date Qty'])

	    extracted_file = "Data Extracted.xlsx"
	    data_for_mapping = "Data Mapping.xlsx"
	    extracted_data_for_mapping = whole_data.drop('Date Qty', axis=1)
	    extracted_data_for_mapping = extracted_data_for_mapping.drop_duplicates(subset=key_columns)
	    whole_data.to_excel(extracted_file, index=False)
	    extracted_data_for_mapping.to_excel(data_for_mapping, index=False)
	    return extracted_file, data_for_mapping

	def map_data_to_template(excel_file, mapping_file):
	    """Join the extracted schedule rows with the item-code master.

	    Args:
	        excel_file: Workbook holding the columns 'Part No.' and 'Date Qty'.
	        mapping_file: Workbook holding 'Item Code' and a part-number column
	            named 'Customer Part no as per pdf' (renamed to 'Part No.').

	    Returns:
	        DataFrame with the columns 'Item Code', 'SchDate' and 'Qty', one row
	        per schedule entry whose 'Part No.' appears in both workbooks.
	    """
	    schedule = pd.read_excel(excel_file)
	    item_master = pd.read_excel(mapping_file).rename(
	        columns={'Customer Part no as per pdf': 'Part No.'}
	    )

	    # 'Date Qty' cells are parsed from text back into Python lists,
	    # then every list element gets a row of its own.
	    schedule['Date Qty'] = schedule['Date Qty'].apply(ast.literal_eval)
	    schedule = schedule.explode('Date Qty')

	    # Split each element into its two parts, kept aligned on the row index.
	    pairs = pd.DataFrame(schedule['Date Qty'].to_list(), index=schedule.index)
	    schedule[['SchDate', 'Qty']] = pairs
	    schedule = schedule.drop(columns='Date Qty')

	    joined = schedule.merge(item_master, on=['Part No.'])
	    return joined[['Item Code', 'SchDate', 'Qty']]

	def main():
	    """Render the Streamlit page for the PDF-to-Excel workflow.

	    Steps shown to the user:
	      1. Upload a PDF; it is converted with ``convert_pdf_to_excel`` and the
	         mapping workbook is offered for download.
	      2. Upload the item-code master; it is joined with the extracted data
	         by ``map_data_to_template`` and the result is offered for download.

	    Each step returns early until its file has been uploaded.
	    """
	    import io  # local import: only this function needs an in-memory buffer

	    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

	    st.title("PDF to Excel Converter")

	    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
	    if uploaded_file is None:
	        return

	    st.write("Uploaded PDF file:", uploaded_file.name)

	    # Convert PDF to Excel; both results are file names on disk.
	    extracted_file, data_for_mapping = convert_pdf_to_excel(uploaded_file)

	    if os.path.exists(data_for_mapping):
	        with open(data_for_mapping, "rb") as f:
	            excel_bytes = f.read()
	        st.download_button(
	            label="Download Excel file",
	            data=excel_bytes,
	            file_name=data_for_mapping,
	            mime=xlsx_mime,
	        )
	    else:
	        st.error("Error: Converted Excel file not found")

	    st.markdown("## Upload the Data Master file with Item Code mapping")
	    mapping_uploaded_file = st.file_uploader(
	        "Upload the Data Master file with Item Code mapping",
	        type=["xlsx", "ods"],
	    )
	    if mapping_uploaded_file is None:
	        return

	    st.write("Uploaded Mapping Excel file:", mapping_uploaded_file.name)

	    # Perform data mapping
	    mapped_data = map_data_to_template(extracted_file, mapping_uploaded_file)

	    st.markdown("### Final Excel File After Mapping")

	    # Serialise the result in memory rather than to a fixed file name in the
	    # working directory, which every session of the app would share.
	    final_excel_file = 'Final Data.xlsx'
	    final_buffer = io.BytesIO()
	    mapped_data.to_excel(final_buffer, index=False)
	    st.download_button(
	        label="Download Excel file",
	        data=final_buffer.getvalue(),
	        file_name=final_excel_file,
	        mime=xlsx_mime,
	    )





	# Start the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()