# Spaces: Sleeping  (page-status text captured with the file; not part of the module)
# data_ingestion/preprocess_data.py
import re
def preprocess_text(data):
    """Clean extracted text lines, keeping only each author and their message.

    Each line is expected to look like ``"Author: Message"``. The author is
    the shortest prefix that is followed by a colon and at least one
    whitespace character; the remainder of the line is the message. Lines
    that do not fit this shape are dropped.

    Args:
        data: Iterable of strings, one candidate record per item.

    Returns:
        A list of ``{"author": ..., "text": ...}`` dicts in input order, with
        surrounding whitespace stripped from both values.
    """
    # Compile once, outside the loop. The lazy ``.*?`` stops at the first
    # colon that is followed by whitespace, so a colon with no space after
    # it (e.g. "12:30") stays in the author and later colons stay in the text.
    author_message = re.compile(r"^(.*?):\s+(.*)$")

    cleaned_data = []
    for line in data:
        match = author_message.match(line)
        if match:
            author, text = match.groups()
            cleaned_data.append({"author": author.strip(), "text": text.strip()})
    return cleaned_data