NEXAS committed on
Commit c15a7dc · verified · 1 Parent(s): 91c7e66

Update src/utils/ingest_text.py

Files changed (1):
  1. src/utils/ingest_text.py +115 -115
src/utils/ingest_text.py CHANGED
@@ -1,116 +1,116 @@
- from llama_parse import LlamaParse
- from langchain_chroma import Chroma
- from qdrant_client import QdrantClient
- from langchain_community.vectorstores.qdrant import Qdrant
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
- from langchain_community.document_loaders.directory import DirectoryLoader
- import os
- from fastembed import TextEmbedding
- from typing import List
-
- import nest_asyncio
- nest_asyncio.apply()
-
- llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
- #qdrant_url = os.getenv("QDRANT_URL")
- #qdrant_api_key = os.getenv("QDRANT_API_KEY")
- groq_api_key = os.getenv("GROQ_API_KEY")
-
-
- parsed_data_file = r"C:\Users\Naresh Kumar Lahajal\Desktop\multimodal\data\parsed_data.pkl"
- output_md = r"C:\Users\Naresh Kumar Lahajal\Desktop\multimodal\data\output.md"
- loki = r"C:\Users\Naresh Kumar Lahajal\Desktop\multimodal\data"
-
- import pickle
- # Define a function to load parsed data if available, or parse if not
- def load_or_parse_data(loc):
-     data_file = parsed_data_file
-
-     if os.path.exists(data_file):
-         # Load the parsed data from the file
-         with open(data_file, "rb") as f:
-             parsed_data = pickle.load(f)
-     else:
-         # Perform the parsing step and store the result in llama_parse_documents
-         parsingInstructiontest10k = """The provided document is a user guide or a manual.
- It contains many images and tables.
- Try to be precise while answering the questions."""
-         parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructiontest10k)  # type: ignore
-         llama_parse_documents = parser.load_data(loc)
-
-
-         # Save the parsed data to a file
-         with open(data_file, "wb") as f:
-             pickle.dump(llama_parse_documents, f)
-
-         # Set the parsed data to the variable
-         parsed_data = llama_parse_documents
-
-     return parsed_data
-
-
- # Create vector database
- def create_vector_database(loc):
-     """
-     Creates a vector database using document loaders and embeddings.
-
-     This function loads the parsed markdown files from the data directory,
-     splits them into chunks, transforms the chunks into embeddings using FastEmbedEmbeddings,
-     and finally persists the embeddings into a Qdrant vector database.
-
-     """
-     # Call the function to either load or parse the data
-     llama_parse_documents = load_or_parse_data(loc)
-     #print(llama_parse_documents[1].text[:100])
-
-     #with open('data/output.md', 'a') as f:  # Open the file in append mode ('a')
-     #    for doc in llama_parse_documents:
-     #        f.write(doc.text + '\n')
-     with open(output_md, 'a', encoding='utf-8') as f:  # Open the file in append mode ('a')
-         for doc in llama_parse_documents:
-             f.write(doc.text + '\n')
-
-     loader = DirectoryLoader(loki, glob="**/*.md", show_progress=True)
-     documents = loader.load()
-     # Split loaded documents into chunks
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
-     print('Chunking data...')
-     docs = text_splitter.split_documents(documents)
-     print(len(docs))
-
-     #len(docs)
-     #docs[0]
-
-     # Initialize embeddings
-     embeddings = FastEmbedEmbeddings()  # type: ignore
-     #embeddings = TextEmbedding()
-
-     print('Vector DB started!')
-
-     # Create and persist a Qdrant vector database from the chunked documents
-     qdrant = Qdrant.from_documents(
-         documents=docs,
-         embedding=embeddings,
-         path="local_qdrant",
-         #url=qdrant_url,
-         collection_name="rag"
-         #api_key=qdrant_api_key
-     )
-     # save to disk
-     #db2 = Chroma.from_documents(docs, embeddings, persist_directory="./chroma_db")
-     #docs = db2.similarity_search(query)
-
-     # load from disk
-     #db3 = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
-
-     # query it
-     #query = "what is the agenda of Financial Statements for 2022?"
-     #found_doc = qdrant.similarity_search(query, k=3)
-     #print(found_doc[0][:100])
-     #
-     print('Vector DB created successfully!')
-     #query = "Switching between external devices connected to the TV"
-     #found_doc = qdrant.similarity_search(query, k=3)
-     #print(found_doc)
+ from llama_parse import LlamaParse
+ from langchain_chroma import Chroma
+ from qdrant_client import QdrantClient
+ from langchain_community.vectorstores.qdrant import Qdrant
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
+ from langchain_community.document_loaders.directory import DirectoryLoader
+ import os
+ from fastembed import TextEmbedding
+ from typing import List
+
+ import nest_asyncio
+ nest_asyncio.apply()
+
+ llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
+ #qdrant_url = os.getenv("QDRANT_URL")
+ #qdrant_api_key = os.getenv("QDRANT_API_KEY")
+ groq_api_key = os.getenv("GROQ_API_KEY")
+
+
+ parsed_data_file = r".\data\parsed_data.pkl"
+ output_md = r".\data\output.md"
+ loki = r".\data"
+
+ import pickle
+ # Define a function to load parsed data if available, or parse if not
+ def load_or_parse_data(loc):
+     data_file = parsed_data_file
+
+     if os.path.exists(data_file):
+         # Load the parsed data from the file
+         with open(data_file, "rb") as f:
+             parsed_data = pickle.load(f)
+     else:
+         # Perform the parsing step and store the result in llama_parse_documents
+         parsingInstructiontest10k = """The provided document is a user guide or a manual.
+ It contains many images and tables.
+ Try to be precise while answering the questions."""
+         parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructiontest10k)  # type: ignore
+         llama_parse_documents = parser.load_data(loc)
+
+
+         # Save the parsed data to a file
+         with open(data_file, "wb") as f:
+             pickle.dump(llama_parse_documents, f)
+
+         # Set the parsed data to the variable
+         parsed_data = llama_parse_documents
+
+     return parsed_data
+
+
+ # Create vector database
+ def create_vector_database(loc):
+     """
+     Creates a vector database using document loaders and embeddings.
+
+     This function loads the parsed markdown files from the data directory,
+     splits them into chunks, transforms the chunks into embeddings using FastEmbedEmbeddings,
+     and finally persists the embeddings into a Qdrant vector database.
+
+     """
+     # Call the function to either load or parse the data
+     llama_parse_documents = load_or_parse_data(loc)
+     #print(llama_parse_documents[1].text[:100])
+
+     #with open('data/output.md', 'a') as f:  # Open the file in append mode ('a')
+     #    for doc in llama_parse_documents:
+     #        f.write(doc.text + '\n')
+     with open(output_md, 'a', encoding='utf-8') as f:  # Open the file in append mode ('a')
+         for doc in llama_parse_documents:
+             f.write(doc.text + '\n')
+
+     loader = DirectoryLoader(loki, glob="**/*.md", show_progress=True)
+     documents = loader.load()
+     # Split loaded documents into chunks
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
+     print('Chunking data...')
+     docs = text_splitter.split_documents(documents)
+     print(len(docs))
+
+     #len(docs)
+     #docs[0]
+
+     # Initialize embeddings
+     embeddings = FastEmbedEmbeddings()  # type: ignore
+     #embeddings = TextEmbedding()
+
+     print('Vector DB started!')
+
+     # Create and persist a Qdrant vector database from the chunked documents
+     qdrant = Qdrant.from_documents(
+         documents=docs,
+         embedding=embeddings,
+         path="local_qdrant",
+         #url=qdrant_url,
+         collection_name="rag"
+         #api_key=qdrant_api_key
+     )
+     # save to disk
+     #db2 = Chroma.from_documents(docs, embeddings, persist_directory="./chroma_db")
+     #docs = db2.similarity_search(query)
+
+     # load from disk
+     #db3 = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
+
+     # query it
+     #query = "what is the agenda of Financial Statements for 2022?"
+     #found_doc = qdrant.similarity_search(query, k=3)
+     #print(found_doc[0][:100])
+     #
+     print('Vector DB created successfully!')
+     #query = "Switching between external devices connected to the TV"
+     #found_doc = qdrant.similarity_search(query, k=3)
+     #print(found_doc)
      return qdrant
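
For context, a minimal usage sketch for the module as updated by this commit, assuming the script is run from the repository root (so the relative .\data paths resolve), LLAMA_CLOUD_API_KEY is set in the environment, and src/ is importable as a package. The driver script and the manual.pdf filename are illustrative assumptions, not part of this commit:

# Hypothetical driver script (not part of this commit).
from src.utils.ingest_text import create_vector_database

# Parse (or load cached) documents, chunk, embed with FastEmbed,
# and persist the "rag" collection to ./local_qdrant.
qdrant = create_vector_database(r".\data\manual.pdf")  # source path is an assumption

# Sanity-check retrieval against the freshly built collection.
found_doc = qdrant.similarity_search("Switching between external devices connected to the TV", k=3)
for doc in found_doc:
    print(doc.page_content[:200])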