File size: 498 Bytes
af30a30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# data_ingestion/preprocess_data.py
import re

def preprocess_text(data):
    """Reduce raw text lines to author/message records.

    Every item of ``data`` is tested against an ``Author: Message`` shape:
    the shortest possible prefix, a colon, at least one whitespace
    character, then the remainder of the line. Items that do not fit that
    shape are silently dropped.

    Returns a list of dicts with the keys ``"author"`` and ``"text"``,
    both stripped of surrounding whitespace, in input order.
    """
    # The lazy first group stops at the first colon that is followed by
    # whitespace, so a colon inside a token such as "10:30" does not split.
    line_pattern = re.compile(r"^(.*?):\s+(.*)$")
    hits = (line_pattern.match(raw) for raw in data)
    return [
        {"author": hit.group(1).strip(), "text": hit.group(2).strip()}
        for hit in hits
        if hit
    ]