Spaces:

hardik27
/

dataextraction

Running

File size: 3,184 Bytes

f20a244

import PyPDF2
import pandas as pd
import os

import streamlit as st
import pandas as pd
import tabula

def _extract_table_row(lines):
    """Collect the part fields found in the text lines of one table.

    Values are appended in the order their labels are encountered:
    after a line that is exactly 'Part No.' come the part number (next
    line) and the colour code (the line after that, or "" when that line
    already carries the 'Part Name' label); after any line containing
    'Part Name' comes the part name (next line).

    Args:
        lines: The table text split into individual lines.

    Returns:
        A list of the extracted strings; empty when no label was found.
    """
    def line_at(position):
        # A label can be the last line of the extracted text, in which
        # case there is no value line after it; treat it as blank
        # instead of raising IndexError.
        return lines[position] if position < len(lines) else ""

    row = []
    for index, line in enumerate(lines):
        stripped = line.strip()
        if stripped == 'Part No.':
            row.append(line_at(index + 1).replace('Part Color Code', ""))
            following = line_at(index + 2)
            if 'Part Name' not in following:
                row.append(following.replace('Part Color Code', ""))
            else:
                # No colour code line between the number and the name label.
                row.append("")
        if 'Part Name' in stripped:
            row.append(line_at(index + 1))
    return row


def convert_pdf_to_excel(pdf_file):
    """Extract part details from a PDF and save them as an Excel workbook.

    The text of every page is read with PyPDF2 and split on the
    'Delivery Schedule Sheet' heading; each non-empty chunk contributes
    one row with the columns "Part No.", "Part Color Code" and
    "Part Name". Chunks with fewer than three values are padded with
    empty cells.

    Args:
        pdf_file: A file-like object accepted by ``PyPDF2.PdfReader`` that
            also exposes a ``name`` attribute (used to derive the output
            file name).

    Returns:
        The path of the written ``.xlsx`` file: the input name with its
        extension replaced by ``.xlsx``.

    Raises:
        ValueError: If a chunk yields more than three values, i.e. the
            text layout does not match the expected one-part-per-table
            structure.
    """
    columns = ["Part No.", "Part Color Code", "Part Name"]

    # Parse the PDF once; the reader is reused for every page.
    reader = PyPDF2.PdfReader(pdf_file)

    rows = []
    for page in reader.pages:
        # Guard against a page that yields no text at all.
        page_text = page.extract_text() or ""
        for table_text in page_text.split('Delivery Schedule Sheet'):
            if not table_text:
                continue
            rows.append(_extract_table_row(table_text.split('\n')))

    widest = max((len(row) for row in rows), default=0)
    if widest > len(columns):
        # Fail loudly rather than silently dropping extracted values.
        raise ValueError(
            f"Expected at most {len(columns)} values per table but found "
            f"{widest}; the PDF layout is not supported."
        )

    # Pad short rows so the frame always has exactly the three expected
    # columns, even when the PDF contained no matching tables.
    padded_rows = [row + [None] * (len(columns) - len(row)) for row in rows]
    whole_data = pd.DataFrame(padded_rows, columns=columns)

    # splitext swaps only the final extension and works for any casing
    # (e.g. ".PDF"), so the output always ends in ".xlsx".
    excel_file = os.path.splitext(pdf_file.name)[0] + '.xlsx'
    whole_data.to_excel(excel_file, index=False)

    return excel_file


    # whole_data.to_csv(excel_file,index=False) 
    
    # return excel_file

def main():
    """Render the Streamlit page: upload a PDF, convert it, offer the Excel file."""
    st.title("PDF to Excel Converter")

    pdf_upload = st.file_uploader("Upload a PDF file", type=["pdf"])
    if pdf_upload is None:
        # Nothing has been uploaded yet, so there is nothing to convert.
        return

    st.write("Uploaded PDF file:", pdf_upload.name)

    # The converter writes the workbook to disk and hands back its path.
    excel_file = convert_pdf_to_excel(pdf_upload)

    if not os.path.exists(excel_file):
        st.error("Error: Converted Excel file not found")
        return

    # Load the workbook into memory so it can be served by the download button.
    with open(excel_file, "rb") as workbook:
        workbook_bytes = workbook.read()

    st.download_button(
        label="Download Excel file",
        data=workbook_bytes,
        file_name=excel_file,
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )
        
# Start the app only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# file_name = input("Give Complete file location")

# file_name = '/home/hardik/Downloads/data extraction/HSCI.2.20231121154327.WG.IFORD001.0492.4348.5M09-01.pdf'
# pdf_in_file = open(file_name,'rb')