Spaces:
Sleeping
Sleeping
Create extract_text.py
Browse files- extract_text.py +18 -0
extract_text.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# extract_text.py
|
2 |
+
|
3 |
+
import fitz # PyMuPDF
|
4 |
+
import json
|
5 |
+
|
6 |
+
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        A list with one entry per page, in document order, where each entry
        is the value returned by ``page.get_text()`` for that page.
    """
    doc = fitz.open(pdf_path)
    try:
        return [page.get_text() for page in doc]
    finally:
        # Release the underlying file handle even if a page fails to parse;
        # the original code left the document open.
        doc.close()
|
12 |
+
|
13 |
+
if __name__ == "__main__":
    # Pull the per-page text out of the source PDF.
    page_texts = extract_text_from_pdf("apexcustoms.pdf")

    # Persist the extracted pages as a JSON array next to the script.
    with open("apexcustoms.json", "w") as out_file:
        out_file.write(json.dumps(page_texts))
|