Spaces:
Runtime error
Runtime error
File size: 1,816 Bytes
58d33f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
"""Loader that loads Facebook chat json dump."""
import datetime
import json
from pathlib import Path
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
def concatenate_rows(row: dict) -> str:
    """Format a single chat message as ``"<sender> on <date>: <text>"``.

    Args:
        row: Mapping-like object providing the ``sender_name``, ``content``
            and ``timestamp_ms`` entries of one message.

    Returns:
        The formatted message followed by an empty line. The date is
        rendered as ``YYYY-MM-DD HH:MM:SS``; no timezone is passed to
        ``fromtimestamp``, so the interpreter's local time is used.
    """
    author, body = row["sender_name"], row["content"]
    # ``fromtimestamp`` expects seconds, hence the division by 1000.
    sent_at = datetime.datetime.fromtimestamp(row["timestamp_ms"] / 1000)
    stamp = sent_at.strftime("%Y-%m-%d %H:%M:%S")
    return f"{author} on {stamp}: {body}\n\n"
class FacebookChatLoader(BaseLoader):
    """Loader that turns a Facebook chat JSON dump into a single Document.

    The file at ``file_path`` is parsed as JSON and its ``"messages"`` entry
    is flattened with pandas. Only messages whose ``content`` is a string
    are kept; they are formatted with ``concatenate_rows`` and joined into
    one text.
    """

    def __init__(self, path: str):
        """Initialize with path.

        Args:
            path: Location of the JSON file to read in ``load``.
        """
        self.file_path = path

    def load(self) -> List[Document]:
        """Load the chat file and return it as a one-element document list.

        Returns:
            A list holding one ``Document`` whose ``page_content`` is the
            concatenation of all text messages and whose metadata carries
            the file path under ``"source"``. If the dump contains no text
            message at all, ``page_content`` is the empty string.

        Raises:
            ValueError: If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as exc:
            # Exception type kept as ValueError for existing callers; the
            # original ImportError stays attached as the cause.
            raise ValueError(
                "pandas is needed for Facebook chat loader, "
                "please install with `pip install pandas`"
            ) from exc

        p = Path(self.file_path)
        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # json_normalize already returns a DataFrame; no extra wrapping needed.
        messages = pd.json_normalize(d["messages"])

        text = ""
        # The "content" column only exists when at least one message has
        # that key; without this guard the column lookup would fail.
        if "content" in messages.columns:
            # Only keep plain text messages
            # (no services, nor links, hashtags, code, bold ...)
            is_text = messages["content"].apply(lambda x: isinstance(x, str))
            text_messages = messages[is_text]
            text_messages = text_messages[["timestamp_ms", "content", "sender_name"]]
            # Row-wise apply on an empty frame yields a DataFrame, which has
            # no ``.str`` accessor, so the empty case is handled separately.
            if not text_messages.empty:
                text = text_messages.apply(concatenate_rows, axis=1).str.cat(sep="")

        metadata = {"source": str(p)}
        return [Document(page_content=text, metadata=metadata)]
|