Spaces:
Sleeping
Sleeping
File size: 498 Bytes
af30a30 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
# data_ingestion/preprocess_data.py
import re
def preprocess_text(data):
    """Reduce extracted text lines to author/message records.

    Each item of ``data`` is tested against the pattern
    ``"<author>:<whitespace><message>"``. The author is the shortest
    prefix that ends at a colon followed by at least one whitespace
    character; everything after that whitespace is the message.

    Args:
        data: An iterable of text lines (strings).

    Returns:
        A list of ``{"author": ..., "text": ...}`` dicts, one per matching
        line, in input order, with surrounding whitespace stripped from
        both values. Lines that do not match the pattern are dropped.
    """
    # Compile once: the pattern is invariant across all lines.
    author_message = re.compile(r"^(.*?):\s+(.*)$")

    cleaned_data = []
    for line in data:
        match = author_message.match(line)
        if match is None:
            # Not an "Author: Message" line; skip it.
            continue
        author, text = match.groups()
        cleaned_data.append({"author": author.strip(), "text": text.strip()})
    return cleaned_data
|