# Streamlit app: extract part details from an uploaded PDF and offer them as an Excel download.
import PyPDF2 | |
import pandas as pd | |
import os | |
import streamlit as st | |
import pandas as pd | |
import tabula | |
# Text that introduces every table in the extracted page text.
_SECTION_MARKER = "Delivery Schedule Sheet"
# Label that can share a line with the part number / colour code values.
_COLOR_LABEL = "Part Color Code"
# Column headers of the generated workbook, in row order.
_OUTPUT_COLUMNS = ["Part No.", "Part Color Code", "Part Name"]


def _parse_part_rows(page_text):
    """Return ``[part_no, color_code, part_name]`` rows found in one page's text.

    The text is cut at every "Delivery Schedule Sheet" marker and each piece
    is scanned line by line:

    * a line equal to ``Part No.`` starts a new row; the next line is the
      part number and the line after it the colour code, unless that line is
      already the ``Part Name`` label, in which case the colour code is empty;
    * a line containing ``Part Name`` stores the following line as the name
      of the current row. If there is no current row, or its name is already
      set, a new row with empty part number and colour code is started so
      that no value is dropped.

    Lines missing at the end of the text are treated as empty strings, and
    pieces without any marker contribute no row, so every returned row has
    exactly three string fields.
    """
    rows = []
    for section in page_text.split(_SECTION_MARKER):
        lines = section.split("\n")
        current = None  # row being filled; its name stays None until seen
        for pos, line in enumerate(lines):
            next_line = lines[pos + 1] if pos + 1 < len(lines) else ""
            if line.strip() == "Part No.":
                after_next = lines[pos + 2] if pos + 2 < len(lines) else ""
                if "Part Name" in after_next:
                    color_code = ""
                else:
                    color_code = after_next.replace(_COLOR_LABEL, "")
                current = [next_line.replace(_COLOR_LABEL, ""), color_code, None]
                rows.append(current)
            elif "Part Name" in line:
                if current is None or current[2] is not None:
                    current = ["", "", None]
                    rows.append(current)
                current[2] = next_line
    return [[part_no, color_code, name or ""] for part_no, color_code, name in rows]


def convert_pdf_to_excel(pdf_file):
    """Extract part details from a PDF and save them as an Excel workbook.

    Args:
        pdf_file: Binary file-like object holding the PDF. It must expose a
            ``name`` attribute, which determines the output file name.

    Returns:
        Path of the written workbook: ``pdf_file.name`` with its extension
        replaced by ``.xlsx``.
    """
    # Parse the document once instead of re-reading it for every page.
    reader = PyPDF2.PdfReader(pdf_file)

    rows = []
    for page in reader.pages:
        # Guard against pages for which no text could be extracted.
        rows.extend(_parse_part_rows(page.extract_text() or ""))

    # Passing the headers up front keeps the frame valid even with no rows.
    whole_data = pd.DataFrame(rows, columns=_OUTPUT_COLUMNS)

    # splitext swaps only the real extension, whatever its letter case.
    excel_file = os.path.splitext(pdf_file.name)[0] + ".xlsx"
    whole_data.to_excel(excel_file, index=False)
    return excel_file
def main():
    """Render the Streamlit page: upload a PDF, convert it, offer the download.

    Nothing beyond the title and the uploader is shown until a file has been
    uploaded. After conversion the workbook is read back from disk and handed
    to a download button; an error message is shown if the file is missing.
    """
    st.title("PDF to Excel Converter")

    pdf_upload = st.file_uploader("Upload a PDF file", type=["pdf"])
    if pdf_upload is None:
        return

    st.write("Uploaded PDF file:", pdf_upload.name)
    workbook_path = convert_pdf_to_excel(pdf_upload)

    if not os.path.exists(workbook_path):
        st.error("Error: Converted Excel file not found")
        return

    # Load the whole workbook into memory so the button can serve it.
    with open(workbook_path, "rb") as workbook:
        payload = workbook.read()

    st.download_button(
        label="Download Excel file",
        data=payload,
        file_name=workbook_path,
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()
# file_name = input("Give Complete file location") | |
# file_name = '/home/hardik/Downloads/data extraction/HSCI.2.20231121154327.WG.IFORD001.0492.4348.5M09-01.pdf' | |
# pdf_in_file = open(file_name,'rb') |