File size: 390 Bytes
af30a30
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# data_ingestion/ingest_data.py
from docx import Document

def read_document(file_path):
    """Return the non-blank paragraph texts of a Word document.

    The document is opened by passing ``file_path`` to ``Document``. Each
    entry of its ``paragraphs`` collection has surrounding whitespace
    stripped from its text; entries that are empty after stripping are
    dropped.

    Args:
        file_path: Location of the document, handed to ``Document`` as-is.

    Returns:
        A list of stripped paragraph strings in document order. The list is
        empty when no paragraph contains non-whitespace text.
    """
    # Strip lazily so each paragraph's text is read and cleaned only once.
    stripped_texts = (
        paragraph.text.strip() for paragraph in Document(file_path).paragraphs
    )
    # Keep only paragraphs that still have content after stripping.
    return [text for text in stripped_texts if text]