dataextraction / app.py
hardik27's picture
Create app.py
f20a244 verified
raw
history blame
3.18 kB
import PyPDF2
import pandas as pd
import os
import streamlit as st
import pandas as pd
import tabula
# Text that introduces each delivery-schedule table in the extracted page text.
_TABLE_MARKER = 'Delivery Schedule Sheet'

# Header row of the generated workbook; every extracted row is fitted to it.
_OUTPUT_COLUMNS = ["Part No.", "Part Color Code", "Part Name"]


def _line_or_blank(lines, position):
    """Return ``lines[position]``, or ``""`` when the section ends before it."""
    return lines[position] if position < len(lines) else ""


def _extract_part_rows(page_text):
    """Collect one list of part fields per delivery-schedule section of a page.

    The page text is split on the ``'Delivery Schedule Sheet'`` marker and
    each non-empty section is scanned line by line:

    * a line that is exactly ``'Part No.'`` (ignoring surrounding whitespace)
      contributes the following line as the part number and the line after
      that as the colour code, with any ``'Part Color Code'`` label text
      removed; when that second line is already the ``'Part Name'`` label the
      colour code is recorded as ``""``;
    * a line containing ``'Part Name'`` contributes the following line as the
      part name.

    A value that would lie past the end of the section is recorded as ``""``
    instead of raising ``IndexError``.  Sections that yield no values at all
    are skipped so they do not become blank spreadsheet rows.

    Args:
        page_text: Text of a single PDF page.

    Returns:
        A list of rows, each a list of the strings found in one section.
    """
    rows = []
    for section in page_text.split(_TABLE_MARKER):
        if not section:
            continue
        lines = section.split('\n')
        row = []
        for index, line in enumerate(lines):
            if line.strip() == 'Part No.':
                row.append(
                    _line_or_blank(lines, index + 1).replace('Part Color Code', "")
                )
                colour_line = _line_or_blank(lines, index + 2)
                if 'Part Name' not in colour_line:
                    row.append(colour_line.replace('Part Color Code', ""))
                else:
                    # No colour code present: the label for the next field
                    # follows the part number directly.
                    row.append("")
            if 'Part Name' in line:
                row.append(_line_or_blank(lines, index + 1))
        if row:
            rows.append(row)
    return rows


def convert_pdf_to_excel(pdf_file):
    """Extract part details from a PDF and save them as an Excel workbook.

    The text of every page is read with PyPDF2 and parsed for
    "Part No." / "Part Color Code" / "Part Name" values, one spreadsheet row
    per ``'Delivery Schedule Sheet'`` section.  When nothing is found the
    workbook is still written and contains only the header row.

    Args:
        pdf_file: A binary file-like object accepted by ``PyPDF2.PdfReader``
            that also exposes a ``name`` attribute (the source file name).

    Returns:
        The path of the written workbook: ``pdf_file.name`` with its
        extension replaced by ``.xlsx``.

    Raises:
        ValueError: If a section yields more values than there are output
            columns, i.e. the text does not fit the expected one-part-per-
            section layout.
    """
    # Parse the document once; the page loop only needs the page objects.
    reader = PyPDF2.PdfReader(pdf_file)

    rows = []
    for page in reader.pages:
        rows.extend(_extract_part_rows(page.extract_text() or ""))

    width = len(_OUTPUT_COLUMNS)
    for row in rows:
        if len(row) > width:
            raise ValueError(
                f"Expected at most {width} values per section, got {len(row)}: {row!r}"
            )
        # Pad partial rows so the frame always has exactly `width` columns,
        # even when every section was incomplete.
        row.extend([""] * (width - len(row)))

    frame = pd.DataFrame(rows, columns=_OUTPUT_COLUMNS)

    # splitext swaps only the real extension, whatever its letter case, and
    # leaves any other ".pdf" inside the name alone.
    stem, _extension = os.path.splitext(pdf_file.name)
    excel_file = stem + '.xlsx'
    frame.to_excel(excel_file, index=False)
    return excel_file
def main():
    """Draw the converter page: take a PDF upload and offer the workbook back.

    Shows the page title and a PDF-only file uploader.  Once a file has been
    provided, its name is echoed, it is passed to ``convert_pdf_to_excel``,
    and the resulting workbook is read from disk and exposed through a
    download button.  An error message is shown instead when the path
    returned by the conversion does not exist.
    """
    st.title("PDF to Excel Converter")

    pdf_upload = st.file_uploader("Upload a PDF file", type=["pdf"])
    if pdf_upload is None:
        # Nothing has been uploaded, so there is nothing to convert.
        return

    st.write("Uploaded PDF file:", pdf_upload.name)
    workbook_path = convert_pdf_to_excel(pdf_upload)

    if not os.path.exists(workbook_path):
        st.error("Error: Converted Excel file not found")
        return

    # Load the whole workbook into memory to hand it to the download widget.
    with open(workbook_path, "rb") as workbook:
        payload = workbook.read()

    st.download_button(
        label="Download Excel file",
        data=payload,
        file_name=workbook_path,
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
# Build the page only when this file is executed as the main script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()
# file_name = input("Give Complete file location")
# file_name = '/home/hardik/Downloads/data extraction/HSCI.2.20231121154327.WG.IFORD001.0492.4348.5M09-01.pdf'
# pdf_in_file = open(file_name,'rb')