File size: 1,606 Bytes
57cf043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import logging
import zipfile
from pathlib import Path

# Module-level logger named after this module. Nothing in the visible part of
# this file logs through it yet.
logger = logging.getLogger(__name__)

class DocxToXml:
    """Read the raw main-document XML out of a DOCX file.

    A DOCX file is a ZIP archive; this class opens it with ``zipfile`` and
    returns the text of the ``word/document.xml`` member without parsing it.
    """

    # Archive member read by ``extract_document_xml``.
    DOCUMENT_XML_PATH = "word/document.xml"

    def __init__(self, docx_path: str):
        """
        Initialize the converter with the path to a DOCX file.

        Args:
            docx_path (str): Path to the DOCX file.

        Raises:
            FileNotFoundError: If nothing exists at ``docx_path``.
        """
        self.docx_path = Path(docx_path)
        # Fail early, at construction time, rather than on first extraction.
        if not self.docx_path.exists():
            raise FileNotFoundError(f"File not found: {docx_path}")

    def extract_document_xml(self) -> str:
        """
        Extract the content of ``word/document.xml`` from the DOCX file.

        Returns:
            str: Content of document.xml, decoded as UTF-8.

        Raises:
            ValueError: If document.xml is not found in the DOCX file.
            Exception: If the archive cannot be opened or read, or its
                content cannot be decoded. The underlying error is available
                as ``__cause__``.
        """
        try:
            # Close the archive as soon as the raw bytes are in memory.
            with zipfile.ZipFile(self.docx_path) as docx_zip:
                xml_bytes = docx_zip.read(self.DOCUMENT_XML_PATH)
            return xml_bytes.decode('utf-8')
        except KeyError as err:
            # ZipFile.read raises KeyError for a missing member; translate it
            # to the documented ValueError while keeping the original cause.
            raise ValueError("document.xml not found in the DOCX file") from err
        except Exception as err:
            # Same type and message as before, but chained explicitly so the
            # root cause (e.g. zipfile.BadZipFile) is not lost to callers.
            raise Exception(f"Error extracting XML: {err}") from err

    @staticmethod
    def convert_file(docx_path: str) -> str:
        """
        Convert a DOCX file to its document XML in a single call.

        Args:
            docx_path (str): Path to the DOCX file.

        Returns:
            str: Content of document.xml file.

        Raises:
            FileNotFoundError: If nothing exists at ``docx_path``.
            ValueError: If document.xml is not found in the DOCX file.
            Exception: If the archive cannot be opened, read or decoded.
        """
        return DocxToXml(docx_path).extract_document_xml()