gen-ai-project / extract_text.py
Moha782's picture
Create extract_text.py
e76edd3 verified
raw
history blame
416 Bytes
# extract_text.py
import fitz # PyMuPDF
import json
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one entry per page, in document order, each entry
        being whatever ``page.get_text()`` returns for that page.
    """
    # Open the document as a context manager so it is closed (and its file
    # handle released) even if text extraction fails partway through.
    with fitz.open(pdf_path) as doc:
        return [page.get_text() for page in doc]
if __name__ == "__main__":
    # Collect the per-page text of the source PDF.
    pages = extract_text_from_pdf("apexcustoms.pdf")

    # Persist the extracted pages as a JSON array.
    with open("apexcustoms.json", "w") as out_file:
        out_file.write(json.dumps(pages))