JaganathC committed on
Commit
3ddcd56
·
verified ·
1 Parent(s): be8fa27

Update indexing.py

Browse files
Files changed (1) hide show
  1. indexing.py +18 -27
indexing.py CHANGED
@@ -4,54 +4,50 @@ Indexing with vector database
4
 
5
  from pathlib import Path
6
  import re
7
-
8
  import chromadb
9
-
10
  from unidecode import unidecode
11
 
12
- from langchain_community.document_loaders import PyPDFLoader
13
  from langchain.text_splitter import RecursiveCharacterTextSplitter
14
  from langchain_chroma import Chroma
15
  from langchain_huggingface import HuggingFaceEmbeddings
16
 
17
 
18
-
19
- # Load PDF document and create doc splits
20
  def load_doc(list_file_path, chunk_size, chunk_overlap):
21
- """Load PDF document and create doc splits"""
22
 
23
- loaders = [PyPDFLoader(x) for x in list_file_path]
24
  pages = []
25
- for loader in loaders:
26
- pages.extend(loader.load())
 
 
 
 
 
 
 
 
 
 
27
  text_splitter = RecursiveCharacterTextSplitter(
28
- chunk_size=chunk_size, chunk_overlap=chunk_overlap
 
29
  )
30
  doc_splits = text_splitter.split_documents(pages)
31
- return doc_splits
32
 
33
 
34
  # Generate collection name for vector database
35
- # - Use filepath as input, ensuring unicode text
36
- # - Handle multiple languages (arabic, chinese)
37
  def create_collection_name(filepath):
38
  """Create collection name for vector database"""
39
-
40
- # Extract filename without extension
41
  collection_name = Path(filepath).stem
42
- # Fix potential issues from naming convention
43
- ## Remove space
44
  collection_name = collection_name.replace(" ", "-")
45
- ## ASCII transliterations of Unicode text
46
  collection_name = unidecode(collection_name)
47
- ## Remove special characters
48
  collection_name = re.sub("[^A-Za-z0-9]+", "-", collection_name)
49
- ## Limit length to 50 characters
50
  collection_name = collection_name[:50]
51
- ## Minimum length of 3 characters
52
  if len(collection_name) < 3:
53
  collection_name = collection_name + "xyz"
54
- ## Enforce start and end as alphanumeric character
55
  if not collection_name[0].isalnum():
56
  collection_name = "A" + collection_name[1:]
57
  if not collection_name[-1].isalnum():
@@ -64,12 +60,8 @@ def create_collection_name(filepath):
64
  # Create vector database
65
  def create_db(splits, collection_name):
66
  """Create embeddings and vector database"""
67
-
68
  embedding = HuggingFaceEmbeddings(
69
  model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
70
- # model_name="sentence-transformers/all-MiniLM-L6-v2",
71
- # model_kwargs={"device": "cpu"},
72
- # encode_kwargs={'normalize_embeddings': False}
73
  )
74
  chromadb.api.client.SharedSystemClient.clear_system_cache()
75
  new_client = chromadb.EphemeralClient()
@@ -78,6 +70,5 @@ def create_db(splits, collection_name):
78
  embedding=embedding,
79
  client=new_client,
80
  collection_name=collection_name,
81
- # persist_directory=default_persist_directory
82
  )
83
  return vectordb
 
4
 
5
  from pathlib import Path
6
  import re
 
7
  import chromadb
 
8
  from unidecode import unidecode
9
 
10
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
  from langchain_chroma import Chroma
13
  from langchain_huggingface import HuggingFaceEmbeddings
14
 
15
 
16
+ # Load PDF or TXT document and create doc splits
 
17
  def load_doc(list_file_path, chunk_size, chunk_overlap):
18
+ """Load documents and create doc splits"""
19
 
 
20
  pages = []
21
+ full_text = ""
22
+ for path in list_file_path:
23
+ if path.endswith(".pdf"):
24
+ loader = PyPDFLoader(path)
25
+ elif path.endswith(".txt"):
26
+ loader = TextLoader(path)
27
+ else:
28
+ continue
29
+ doc_pages = loader.load()
30
+ pages.extend(doc_pages)
31
+ full_text += "\n".join([p.page_content for p in doc_pages]) + "\n"
32
+
33
  text_splitter = RecursiveCharacterTextSplitter(
34
+ chunk_size=chunk_size,
35
+ chunk_overlap=chunk_overlap,
36
  )
37
  doc_splits = text_splitter.split_documents(pages)
38
+ return doc_splits, full_text
39
 
40
 
41
  # Generate collection name for vector database
 
 
42
  def create_collection_name(filepath):
43
  """Create collection name for vector database"""
 
 
44
  collection_name = Path(filepath).stem
 
 
45
  collection_name = collection_name.replace(" ", "-")
 
46
  collection_name = unidecode(collection_name)
 
47
  collection_name = re.sub("[^A-Za-z0-9]+", "-", collection_name)
 
48
  collection_name = collection_name[:50]
 
49
  if len(collection_name) < 3:
50
  collection_name = collection_name + "xyz"
 
51
  if not collection_name[0].isalnum():
52
  collection_name = "A" + collection_name[1:]
53
  if not collection_name[-1].isalnum():
 
60
  # Create vector database
61
  def create_db(splits, collection_name):
62
  """Create embeddings and vector database"""
 
63
  embedding = HuggingFaceEmbeddings(
64
  model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
 
 
 
65
  )
66
  chromadb.api.client.SharedSystemClient.clear_system_cache()
67
  new_client = chromadb.EphemeralClient()
 
70
  embedding=embedding,
71
  client=new_client,
72
  collection_name=collection_name,
 
73
  )
74
  return vectordb