Smart-Tasker / data_ingestion /preprocess_data.py
Shahid
Added first commit
af30a30
raw
history blame contribute delete
498 Bytes
# data_ingestion/preprocess_data.py
import re
def preprocess_text(data):
    """Extract ``{"author", "text"}`` records from raw "Author: Message" lines.

    A line is kept only when it contains a colon followed by at least one
    whitespace character. Everything before the first such colon becomes the
    author, everything after it becomes the message text; both are stripped
    of surrounding whitespace. Lines that do not match are dropped.

    Args:
        data: Iterable of text lines, or ``None``. Entries that are not
            ``str`` (for example ``None`` placeholders) are skipped rather
            than aborting the whole batch.

    Returns:
        A list of dicts with the keys ``"author"`` and ``"text"``, in input
        order. Empty when ``data`` is ``None``, empty, or has no matching
        lines.
    """
    if data is None:
        return []

    # Compile once instead of going through the re module on every line.
    # The lazy first group stops at the FIRST ":<whitespace>", so colons that
    # are not followed by whitespace (e.g. "12:30") stay inside the author.
    author_message = re.compile(r"^(.*?):\s+(.*)$")

    cleaned_data = []
    for line in data:
        if not isinstance(line, str):
            # re.match would raise TypeError on None / numbers / bytes.
            continue
        match = author_message.match(line)
        if match is None:
            continue
        author, text = match.groups()
        cleaned_data.append({"author": author.strip(), "text": text.strip()})
    return cleaned_data