# data_ingestion/preprocess_data.py
import re
from typing import Dict, Iterable, List

# "<author>: <message>" — the author is the shortest prefix ending at a colon
# that is followed by whitespace. Requiring whitespace after the colon keeps
# colons embedded in tokens such as "12:30" from being taken as the separator.
_AUTHOR_MESSAGE_RE = re.compile(r"^(.*?):\s+(.*)$")


def preprocess_text(data: Iterable[str]) -> List[Dict[str, str]]:
    """Clean extracted text lines, keeping only each author and their message.

    Each line is matched against the form ``"Author: Message"``. Lines that
    do not contain a colon followed by whitespace are silently dropped.

    Args:
        data: Iterable of text lines to parse.

    Returns:
        A list of ``{"author": ..., "text": ...}`` dicts, one per matching
        line, in input order, with surrounding whitespace stripped from
        both values.
    """
    matches = (_AUTHOR_MESSAGE_RE.match(line) for line in data)
    return [
        {"author": found.group(1).strip(), "text": found.group(2).strip()}
        for found in matches
        if found  # non-matching lines yield None and are skipped
    ]