|
import os |
|
import pandas as pd |
|
from typing import Any, Dict, List, Union |
|
from langchain_community.document_loaders import ( |
|
TextLoader, |
|
PyPDFLoader, |
|
UnstructuredWordDocumentLoader, |
|
UnstructuredPowerPointLoader, |
|
) |
|
from langchain_core.tools import tool |
|
|
|
|
|
def load_file(file_path: str) -> str:
    """Load a file and return its content as text.

    The loader is chosen from the (case-insensitive) file extension:

    * ``.csv`` / ``.xlsx`` / ``.xls`` are read with pandas and rendered as a
      plain-text table.
    * ``.txt`` / ``.pdf`` / ``.docx`` / ``.doc`` / ``.pptx`` / ``.ppt`` are read
      with the matching LangChain document loader.

    Args:
        file_path: Path to the file.

    Returns:
        str: Content of the file. If a document loader yields no documents,
        the sentinel text ``"No content found in the file"`` is returned.

    Raises:
        ValueError: If file type is unsupported.
    """
    ext = os.path.splitext(file_path)[-1].lower()

    # Tabular formats: render the DataFrame to text so the declared ``str``
    # return type holds (a raw DataFrame used to leak out of this branch).
    # ``to_string`` emits every row and column, without the "..." truncation
    # that ``str(df)`` applies to large frames.
    if ext == ".csv":
        return pd.read_csv(file_path).to_string()
    if ext in (".xlsx", ".xls"):
        return pd.read_excel(file_path).to_string()

    # Document formats: pick the loader for this extension.
    if ext == ".txt":
        loader = TextLoader(file_path, encoding="utf8")
    elif ext == ".pdf":
        loader = PyPDFLoader(file_path)
    elif ext in (".docx", ".doc"):
        loader = UnstructuredWordDocumentLoader(file_path)
    elif ext in (".pptx", ".ppt"):
        loader = UnstructuredPowerPointLoader(file_path)
    else:
        raise ValueError(f"Unsupported file extension: {ext}")

    docs = loader.load()
    if not docs:
        return "No content found in the file"

    # A loader may return several documents (e.g. one per PDF page); keep all
    # of them instead of only the first so no content is silently dropped.
    return "\n\n".join(doc.page_content for doc in docs)
|
|
|
|
|
@tool
def file_loader_tool(file_path: str) -> str:
    """Loads a file (csv, xlsx, txt, pdf, docx, etc.) and returns its content as a string.

    Args:
        file_path (str): Path to the file to load.
    """
    # NOTE: the docstring above doubles as the tool description exposed by
    # ``@tool``, so its wording is deliberately left untouched.
    # All extension handling lives in ``load_file``; this is only the tool
    # entry point that forwards to it.
    return load_file(file_path)
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: invoke the tool on two sample files and print
    # whatever it returns. The paths are resolved relative to the current
    # working directory.
    for sample_path in (
        "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx",
        "test.txt",
    ):
        print(file_loader_tool.invoke(sample_path))
|
|