# NOTE: the following lines are scrape residue from the hosting web page, not
# part of the module; kept here as comments so the file stays syntactically valid.
# Spaces: Runtime error / Runtime error
# File size: 2,611 Bytes
# Commit: 9c48ae2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2023/6/8 14:03
@Author : alexanderwu
@File : https://github.com/geekan/MetaGPT/blob/main/metagpt/document_store/document.py
"""
from pathlib import Path
import pandas as pd
from langchain.document_loaders import (
TextLoader,
UnstructuredPDFLoader,
UnstructuredWordDocumentLoader,
)
from langchain.text_splitter import CharacterTextSplitter
from tqdm import tqdm
def validate_cols(content_col: str, df: pd.DataFrame):
    """Ensure the content column exists in the DataFrame.

    Args:
        content_col: Name of the column that must be present in ``df``.
        df: The DataFrame to validate.

    Raises:
        ValueError: If ``content_col`` is not among ``df``'s columns.
            The message names the missing column and lists the available
            ones, so the failure is diagnosable (the original raised a
            bare ``ValueError`` with no message).
    """
    if content_col not in df.columns:
        raise ValueError(
            f"Column {content_col!r} not found in DataFrame columns: {list(df.columns)}"
        )
def read_data(data_path: Path):
    """Load a file into tabular or document form, dispatching on its suffix.

    Tabular formats (``.xlsx``, ``.csv``, ``.json``) are loaded with pandas
    and returned as a ``pd.DataFrame``; document formats (``.docx``/``.doc``,
    ``.txt``, ``.pdf``) are loaded with langchain loaders and returned as a
    list of langchain ``Document`` objects (``.txt`` content is additionally
    split into <=256-char chunks on newlines).

    Args:
        data_path: Path to the file to load.

    Returns:
        A ``pd.DataFrame`` or a list of langchain documents, depending on
        the suffix.

    Raises:
        NotImplementedError: If the suffix is not one of the supported kinds.
    """
    # Lower-case so '.CSV' / '.Xlsx' etc. are handled too (the original
    # compared case-sensitively and rejected upper-case suffixes).
    suffix = data_path.suffix.lower()
    if suffix == '.xlsx':
        data = pd.read_excel(data_path)
    elif suffix == '.csv':
        data = pd.read_csv(data_path)
    elif suffix == '.json':
        data = pd.read_json(data_path)
    elif suffix in ('.docx', '.doc'):
        data = UnstructuredWordDocumentLoader(str(data_path), mode='elements').load()
    elif suffix == '.txt':
        data = TextLoader(str(data_path)).load()
        text_splitter = CharacterTextSplitter(separator='\n', chunk_size=256, chunk_overlap=0)
        data = text_splitter.split_documents(data)
    elif suffix == '.pdf':
        data = UnstructuredPDFLoader(str(data_path), mode="elements").load()
    else:
        raise NotImplementedError(f"Unsupported file suffix: {suffix!r}")
    return data
class Document:
    """A file loaded from disk, exposed as parallel (docs, metadatas) lists.

    Tabular sources are held as a ``pd.DataFrame`` (rows become docs);
    document sources are held as a list of langchain documents.
    """

    def __init__(self, data_path, content_col='content', meta_col='metadata'):
        """Load ``data_path`` via :func:`read_data`.

        Args:
            data_path: Path to the source file.
            content_col: For tabular sources, the column holding the text.
                Validated to exist; raises ``ValueError`` if missing.
            meta_col: For tabular sources, the column holding per-row
                metadata; falsy values mean "no metadata".
        """
        self.data = read_data(data_path)
        if isinstance(self.data, pd.DataFrame):
            validate_cols(content_col, self.data)
        self.content_col = content_col
        self.meta_col = meta_col

    def _get_docs_and_metadatas_by_df(self) -> tuple[list, list]:
        """Extract (docs, metadatas) from a DataFrame, one entry per row.

        Note: the original annotated the return as ``(list, list)``, which
        is a runtime tuple of types, not a valid typing annotation.
        """
        df = self.data
        docs = []
        metadatas = []
        for i in tqdm(range(len(df))):
            docs.append(df[self.content_col].iloc[i])
            if self.meta_col:
                metadatas.append({self.meta_col: df[self.meta_col].iloc[i]})
            else:
                metadatas.append({})
        return docs, metadatas

    def _get_docs_and_metadatas_by_langchain(self) -> tuple[list, list]:
        """Extract (docs, metadatas) from a list of langchain documents."""
        data = self.data
        docs = [doc.page_content for doc in data]
        metadatas = [doc.metadata for doc in data]
        return docs, metadatas

    def get_docs_and_metadatas(self) -> tuple[list, list]:
        """Return parallel lists of document texts and metadata dicts.

        Raises:
            NotImplementedError: If ``self.data`` is neither a DataFrame
                nor a list (should not happen via :func:`read_data`).
        """
        if isinstance(self.data, pd.DataFrame):
            return self._get_docs_and_metadatas_by_df()
        if isinstance(self.data, list):
            return self._get_docs_and_metadatas_by_langchain()
        raise NotImplementedError(f"Unsupported data type: {type(self.data).__name__}")
# (end of scraped page residue)