Spaces:
Runtime error
Runtime error
"""Loader that loads Facebook chat json dump.""" | |
import datetime | |
import json | |
from pathlib import Path | |
from typing import List | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseLoader | |
def concatenate_rows(row: dict) -> str: | |
"""Combine message information in a readable format ready to be used.""" | |
sender = row["sender_name"] | |
text = row["content"] | |
date = datetime.datetime.fromtimestamp(row["timestamp_ms"] / 1000).strftime( | |
"%Y-%m-%d %H:%M:%S" | |
) | |
return f"{sender} on {date}: {text}\n\n" | |
class FacebookChatLoader(BaseLoader): | |
"""Loader that loads Facebook messages json directory dump.""" | |
def __init__(self, path: str): | |
"""Initialize with path.""" | |
self.file_path = path | |
def load(self) -> List[Document]: | |
"""Load documents.""" | |
try: | |
import pandas as pd | |
except ImportError: | |
raise ValueError( | |
"pandas is needed for Facebook chat loader, " | |
"please install with `pip install pandas`" | |
) | |
p = Path(self.file_path) | |
with open(p, encoding="utf8") as f: | |
d = json.load(f) | |
normalized_messages = pd.json_normalize(d["messages"]) | |
df_normalized_messages = pd.DataFrame(normalized_messages) | |
# Only keep plain text messages | |
# (no services, nor links, hashtags, code, bold ...) | |
df_filtered = df_normalized_messages[ | |
(df_normalized_messages.content.apply(lambda x: type(x) == str)) | |
] | |
df_filtered = df_filtered[["timestamp_ms", "content", "sender_name"]] | |
text = df_filtered.apply(concatenate_rows, axis=1).str.cat(sep="") | |
metadata = {"source": str(p)} | |
return [Document(page_content=text, metadata=metadata)] | |