ocr / docxtoimage.py
washeed's picture
Update docxtoimage.py
e2ff69c verified
import os
from spire.doc import *
from spire.doc.common import *
def process(folder_path, max_page):
    """Convert every ``.docx`` file directly inside *folder_path* to images.

    The directory is listed once (non-recursively); every entry whose name
    ends with ``.docx`` (case-sensitive) is handed to ``process_docx``
    together with *max_page*, in the order ``os.listdir`` returned it.

    Args:
        folder_path: Directory to scan for ``.docx`` files.
        max_page: Page limit forwarded unchanged to ``process_docx``;
            ``None`` is forwarded as-is.
    """
    docx_names = [
        entry for entry in os.listdir(folder_path) if entry.endswith(".docx")
    ]
    for docx_name in docx_names:
        process_docx(folder_path, docx_name, max_page)
def process_docx(folder_path, filename, max_page):
    """Render the pages of one ``.docx`` file to image files.

    The document is loaded with Spire.Doc, its first pages are rendered to
    bitmap image streams, and each stream is written to
    ``<folder_path>/<stem>/<stem>_<n>.png`` where ``<stem>`` is *filename*
    without its extension and ``<n>`` starts at 1.

    Args:
        folder_path: Directory that contains *filename*; the output
            sub-folder is created inside it.
        filename: Name of the ``.docx`` file, relative to *folder_path*.
        max_page: Maximum number of pages to render, counted from the first
            page. ``None`` renders every page; a value larger than the
            document's page count is clamped to the page count.

    Returns:
        None. Any exception raised while processing is caught and reported
        on stdout instead of being propagated, so one bad file does not stop
        a batch run.
    """
    try:
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)

        document = Document()
        try:
            document.LoadFromFile(file_path)

            # Query the page count once and clamp the requested limit to it.
            page_count = document.GetPageCount()
            if max_page is None:
                pages_to_render = page_count
            else:
                pages_to_render = min(max_page, page_count)
            image_streams = document.SaveImageToStreams(
                0, pages_to_render, ImageType.Bitmap
            )

            # Extract the filename without extension
            file_name, _ = os.path.splitext(filename)

            # Create the folder path to save images
            image_folder_path = os.path.join(folder_path, file_name)
            os.makedirs(image_folder_path, exist_ok=True)

            # Save each image stream to a PNG file (1-based page suffix)
            for i, image in enumerate(image_streams):
                image_name = os.path.join(
                    image_folder_path, f"{file_name}_{i+1}.png"
                )
                with open(image_name, 'wb') as image_file:
                    image_file.write(image.ToArray())
        finally:
            # Always release the document, even when loading, rendering or
            # writing failed part-way through.
            document.Close()
    except Exception as e:
        print(f"Error processing file {filename}: {e}")
if __name__ == "__main__":
    # Script entry point: convert the .docx files found in the local
    # "input" folder, passing max_page=None (no explicit page limit).
    folder_path, max_page = "input", None
    process(folder_path=folder_path, max_page=max_page)