hardik27 committed on
Commit
f20a244
·
verified ·
1 Parent(s): f2ce0d3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import pandas as pd
3
+ import os
4
+
5
+ import streamlit as st
6
+ import pandas as pd
7
+ import tabula
8
+
9
_PART_COLUMNS = ["Part No.", "Part Color Code", "Part Name"]


def _extract_part_fields(lines):
    """Collect the part number, colour code and part name from one section.

    ``lines`` is the text of a single "Delivery Schedule Sheet" section split
    on newlines. The value belonging to a label is read from the line(s)
    that follow the label.
    """
    def line_at(position):
        # A label on the last line(s) of a section has no value after it;
        # treat the missing value as empty instead of raising IndexError.
        return lines[position] if position < len(lines) else ""

    fields = []
    for index, line in enumerate(lines):
        if line.strip() == 'Part No.':
            fields.append(line_at(index + 1).replace('Part Color Code', ""))
            # Two lines below the label is taken as the colour code, unless
            # that line is already the "Part Name" label (no colour code).
            candidate = line_at(index + 2)
            if 'Part Name' not in candidate:
                fields.append(candidate.replace('Part Color Code', ""))
            else:
                fields.append("")

        if 'Part Name' in line:
            fields.append(line_at(index + 1))
    return fields


def convert_pdf_to_excel(pdf_file):
    """Extract part details from a PDF and save them as an Excel workbook.

    The text of every page is extracted with PyPDF2 and split into sections
    on the "Delivery Schedule Sheet" heading. Each section that contains any
    of the expected labels contributes one row made of its part number,
    colour code and part name; sections without those labels are skipped.

    Args:
        pdf_file: Binary file-like object holding the PDF. It must expose a
            ``name`` attribute, which is used to derive the output file name.

    Returns:
        The path of the ``.xlsx`` file that was written, i.e. ``pdf_file.name``
        with its extension replaced by ``.xlsx``.

    Raises:
        ValueError: If a single section yields more fields than there are
            output columns (for example because a label occurs twice).
    """
    # Parse the document once; the reader is reused for every page.
    reader = PyPDF2.PdfReader(pdf_file)

    rows = []
    for page in reader.pages:
        # Guard against a page without extractable text.
        page_content = page.extract_text() or ""
        for section in page_content.split('Delivery Schedule Sheet'):
            if not section:
                continue

            fields = _extract_part_fields(section.split('\n'))
            if not fields:
                # No part labels in this section (e.g. text preceding the
                # first heading); it would only produce a blank row.
                continue
            if len(fields) > len(_PART_COLUMNS):
                raise ValueError(
                    f"Expected at most {len(_PART_COLUMNS)} fields per "
                    f"section, found {len(fields)}: {fields!r}"
                )

            # Pad short rows so every row matches the column headers.
            fields.extend([""] * (len(_PART_COLUMNS) - len(fields)))
            rows.append(fields)

    # Passing the headers here also covers the "no rows at all" case, which
    # then produces a workbook containing only the header line.
    whole_data = pd.DataFrame(rows, columns=_PART_COLUMNS)

    # Swap only the real extension, whatever its case, so the workbook never
    # ends up written under the PDF's own name.
    excel_file = os.path.splitext(pdf_file.name)[0] + '.xlsx'
    whole_data.to_excel(excel_file, index=False)

    return excel_file
54
+
55
+
56
+ # whole_data.to_csv(excel_file,index=False)
57
+
58
+ # return excel_file
59
+
60
def main():
    """Render the Streamlit page that turns an uploaded PDF into a workbook.

    Shows the title and a PDF uploader. Once a file is uploaded it is passed
    to ``convert_pdf_to_excel`` and the resulting file is read back from disk
    and offered through a download button; an error is displayed when the
    converted file cannot be found.
    """
    st.title("PDF to Excel Converter")

    pdf_upload = st.file_uploader("Upload a PDF file", type=["pdf"])
    if pdf_upload is None:
        # Nothing has been uploaded yet, so there is nothing to convert.
        return

    st.write("Uploaded PDF file:", pdf_upload.name)

    workbook_path = convert_pdf_to_excel(pdf_upload)

    if not os.path.exists(workbook_path):
        st.error("Error: Converted Excel file not found")
        return

    # The download button needs the workbook contents as bytes.
    with open(workbook_path, "rb") as workbook:
        workbook_bytes = workbook.read()

    st.download_button(
        label="Download Excel file",
        data=workbook_bytes,
        file_name=workbook_path,
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()
89
+
90
+ # file_name = input("Give Complete file location")
91
+
92
+ # file_name = '/home/hardik/Downloads/data extraction/HSCI.2.20231121154327.WG.IFORD001.0492.4348.5M09-01.pdf'
93
+ # pdf_in_file = open(file_name,'rb')