Moha782 committed on
Commit
e76edd3
·
verified ·
1 Parent(s): 51f29de

Create extract_text.py

Browse files
Files changed (1) hide show
  1. extract_text.py +18 -0
extract_text.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # extract_text.py
2
+
3
+ import fitz # PyMuPDF
4
+ import json
5
+
6
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one entry per page, in document order. Each entry is
        whatever ``page.get_text()`` returns with its default arguments
        (plain text, per the PyMuPDF documentation).
    """
    # Open the document as a context manager so the underlying file handle
    # is released even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return [page.get_text() for page in doc]
12
+
13
if __name__ == "__main__":
    # Pull the per-page text out of the source PDF.
    pages = extract_text_from_pdf("apexcustoms.pdf")

    # Serialize the page list and store it next to the script as JSON.
    with open("apexcustoms.json", "w") as out_file:
        out_file.write(json.dumps(pages))