washeed
/

ocr

Model card Files Files and versions Community

washeed committed on May 2, 2024

Commit

e2ff69c

verified ·

1 Parent(s): b692870

Update docxtoimage.py

Browse files

Files changed (1) hide show

docxtoimage.py +47 -44

docxtoimage.py CHANGED Viewed

@@ -1,45 +1,48 @@
-import os
-from spire.doc import *
-from spire.doc.common import *
-def process(folder_path,max_page):
-    for filename in os.listdir(folder_path):
-        if filename.endswith(".docx"):
-            process_docx(folder_path, filename,max_page)
-def process_docx(folder_path, filename,max_page=None):
-  try:
-    # Construct the full file path
-    file_path = os.path.join(folder_path, filename)
-    # Process the docx file
-    document = Document()
-    document.LoadFromFile(file_path)
-    if max_page>document.GetPageCount():
-        image_streams = document.SaveImageToStreams(0,document.GetPageCount() ,ImageType.Bitmap)
-    else:
-        image_streams = document.SaveImageToStreams(0,max_page ,ImageType.Bitmap)
-    # Extract the filename without extension
-    file_name, _ = os.path.splitext(filename)
-    # Create the folder path to save images
-    image_folder_path = os.path.join(folder_path, file_name)
-    os.makedirs(image_folder_path, exist_ok=True)
-    # Save each image stream to a JPG file
-    for i, image in enumerate(image_streams):
-        image_name = os.path.join(image_folder_path, f"{file_name}_{i+1}.png")
-        with open(image_name, 'wb') as image_file:
-            image_file.write(image.ToArray())
-    document.Close()
-  except Exception as e:
-    print(f"Error processing file {filename}: {e}")
-if __name__ == '__main__':
-    # Define the folder path
-    folder_path = "input"
-    max_page=4
     process(folder_path,max_page)

+import os
+from spire.doc import *
+from spire.doc.common import *
+def process(folder_path,max_page):
+    for filename in os.listdir(folder_path):
+        if filename.endswith(".docx"):
+            process_docx(folder_path, filename,max_page)
+def process_docx(folder_path, filename,max_page):
+  try:
+    # Construct the full file path
+    file_path = os.path.join(folder_path, filename)
+    # Process the docx file
+    document = Document()
+    document.LoadFromFile(file_path)
+    if max_page is not None:
+        if max_page>document.GetPageCount():
+            image_streams = document.SaveImageToStreams(0,document.GetPageCount() ,ImageType.Bitmap)
+        else:
+            image_streams = document.SaveImageToStreams(0,max_page ,ImageType.Bitmap)
+    if max_page is None:
+        max_page=document.GetPageCount
+        image_streams = document.SaveImageToStreams(0,document.GetPageCount() ,ImageType.Bitmap)
+    # Extract the filename without extension
+    file_name, _ = os.path.splitext(filename)
+    # Create the folder path to save images
+    image_folder_path = os.path.join(folder_path, file_name)
+    os.makedirs(image_folder_path, exist_ok=True)
+    # Save each image stream to a JPG file
+    for i, image in enumerate(image_streams):
+        image_name = os.path.join(image_folder_path, f"{file_name}_{i+1}.png")
+        with open(image_name, 'wb') as image_file:
+            image_file.write(image.ToArray())
+    document.Close()
+  except Exception as e:
+    print(f"Error processing file {filename}: {e}")
+if __name__ == '__main__':
+    # Define the folder path
+    folder_path = "input"
+    max_page=None
     process(folder_path,max_page)