connectwithprakash committed on
Commit
8fa2c1a
·
1 Parent(s): 81917a3

Add file_loader tool for loading various file types

Browse files
Files changed (1) hide show
  1. src/tools/file_loader.py +60 -0
src/tools/file_loader.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from typing import Any, Dict, List, Union
4
+ from langchain_community.document_loaders import (
5
+ TextLoader,
6
+ PyPDFLoader,
7
+ UnstructuredWordDocumentLoader,
8
+ UnstructuredPowerPointLoader,
9
+ )
10
+ from langchain_core.tools import tool
11
+
12
+
13
def load_file(file_path: str) -> str:
    """Load a file and return its content as a string.

    Tabular files (.csv, .xlsx, .xls) are read with pandas and serialized to
    CSV text. Document files (.txt, .pdf, .docx, .doc, .pptx, .ppt) are read
    with the matching LangChain loader, and the text of every document the
    loader returns is joined together.

    Args:
        file_path: Path to the file. The type is chosen from the file
            extension, compared case-insensitively.

    Returns:
        str: Content of the file, or "No content found in the file" when the
            document loader returns no documents.

    Raises:
        ValueError: If file type is unsupported.
    """
    ext = os.path.splitext(file_path)[-1].lower()

    if ext in (".csv", ".xlsx", ".xls"):
        df = pd.read_csv(file_path) if ext == ".csv" else pd.read_excel(file_path)
        # Serialize the frame so the declared ``str`` return type actually
        # holds; returning the DataFrame itself broke the function's contract.
        return df.to_csv(index=False)

    if ext == ".txt":
        loader = TextLoader(file_path, encoding="utf8")
    elif ext == ".pdf":
        loader = PyPDFLoader(file_path)
    elif ext in (".docx", ".doc"):
        loader = UnstructuredWordDocumentLoader(file_path)
    elif ext in (".pptx", ".ppt"):
        loader = UnstructuredPowerPointLoader(file_path)
    else:
        raise ValueError(f"Unsupported file extension: {ext}")

    docs = loader.load()
    if not docs:
        return "No content found in the file"
    # A loader may return several documents (PyPDFLoader, for example,
    # produces one per page), so join them all instead of keeping only the
    # first one and silently dropping the rest of the file.
    return "\n\n".join(doc.page_content for doc in docs)
46
+
47
+
48
@tool
def file_loader_tool(file_path: str) -> str:
    """Loads a file (csv, xlsx, txt, pdf, docx, etc.) and returns its content as a string.
    Args:
        file_path (str): Path to the file to load.
    """
    # The docstring is kept verbatim on purpose: LangChain's @tool decorator
    # uses it as the tool description, so rewording it would change what the
    # tool exposes at runtime.
    return load_file(file_path)
56
+
57
+
58
if __name__ == "__main__":
    # Manual smoke test: run the tool on one spreadsheet and one text file
    # and print whatever it returns. Both paths are relative, so the files
    # are looked up in the current working directory.
    for sample_path in (
        "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx",
        "test.txt",
    ):
        print(file_loader_tool.invoke(sample_path))