NEXAS committed on
Commit cb7d229 · verified · 1 Parent(s): ab33eeb

Update src/utils/ingest_text.py

Files changed (1)
  1. src/utils/ingest_text.py +66 -90
src/utils/ingest_text.py CHANGED
@@ -1,120 +1,96 @@
  from llama_parse import LlamaParse
- from langchain_chroma import Chroma
- from qdrant_client import QdrantClient
- from langchain_community.vectorstores.qdrant import Qdrant
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
  from langchain_community.document_loaders.directory import DirectoryLoader
- import os
- from fastembed import TextEmbedding
- from typing import List
  import nltk
- nltk.download('punkt')
-
-
  import nest_asyncio
  nest_asyncio.apply()

  llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
- #qdrant_url = os.getenv("QDRANT_URL ")
- #qdrant_api_key = os.getenv("QDRANT_API_KEY")
  groq_api_key = os.getenv("GROQ_API_KEY")

- parsed_data_file = r"./data/parsed_data.pkl"
- output_md = r"./data/output.md"
- loki = r"data"
-
- import pickle
- # Define a function to load parsed data if available, or parse if not
- def load_or_parse_data(loc):
-     data_file = parsed_data_file
-
-     if os.path.exists(data_file):
-         # Load the parsed data from the file
-         with open(data_file, "rb") as f:
              parsed_data = pickle.load(f)
      else:
-         # Perform the parsing step and store the result in llama_parse_documents
-         parsingInstructiontest10k = """The provided document is an user guide or a manual.
-         It contains many images and tables.
-         Try to be precise while answering the questions"""
-         parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsingInstructiontest10k) # type: ignore
-         llama_parse_documents = parser.load_data(loc)
-
-         # Save the parsed data to a file
-         with open(data_file, "wb") as f:
-             pickle.dump(llama_parse_documents, f)
-
-         # Set the parsed data to the variable
-         parsed_data = llama_parse_documents

      return parsed_data

- # Create vector database
- def create_vector_database(loc):
-     """
-     Creates a vector database using document loaders and embeddings.
-
-     This function loads urls,
-     splits the loaded documents into chunks, transforms them into embeddings using OllamaEmbeddings,
-     and finally persists the embeddings into a Chroma vector database.
-
-     """
-     # Call the function to either load or parse the data
-     print("text_db")
-     llama_parse_documents = load_or_parse_data(loc)
-     #print(llama_parse_documents[1].text[:100])
-
-     #with open('data/output.md', 'a') as f: # Open the file in append mode ('a')
-     #    for doc in llama_parse_documents:
-     #        f.write(doc.text + '\n')
-     with open(output_md,'a', encoding='utf-8') as f: # Open the file in append mode ('a')
-         for doc in llama_parse_documents:
-             f.write(doc.text + '\n')
-
-     loader = DirectoryLoader(loki, glob="**/*.md", show_progress=True)
-     documents = loader.load()
-     # Split loaded documents into chunks
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
-     print('data chunckex')
-     docs = text_splitter.split_documents(documents)
-     print(len(docs))
-
-     #len(docs)
-     #docs[0]
-
-     # Initialize Embeddings
-     embeddings = FastEmbedEmbeddings() # type: ignore
-     #embeddings = TextEmbedding()
-
-     print('Vector DB started!')
-
-     # Create and persist a Chroma vector database from the chunked documents
      qdrant = Qdrant.from_documents(
          documents=docs,
          embedding=embeddings,
-         path=r".\data\local_qdrant",
-         #url=qdrant_url,
-         collection_name="rag"
-         #api_key=qdrant_api_key
      )
-     # save to disk
-     #db2 = Chroma.from_documents(docs, embeddings, persist_directory="./chroma_db")
-     #docs = db2.similarity_search(query)
-
-     # load from disk
-     #db3 = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)
-
-     #query it
-     #query = "what is the agend of Financial Statements for 2022 ?"
-     #found_doc = qdrant.similarity_search(query, k=3)
-     #print(found_doc[0][:100])
-     #
-     print('Vector DB created successfully !')
-     #query = "Switching between external devices connected to the TV"
-     #found_doc = qdrant.similarity_search(query, k=3)
-     #print(found_doc)
-     return qdrant
 
+ import os
+ import pickle
+ from typing import List
  from llama_parse import LlamaParse
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.document_loaders.directory import DirectoryLoader
+ from langchain_community.document_loaders import TextLoader
+ from langchain_community.vectorstores.qdrant import Qdrant
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
  import nltk
  import nest_asyncio
+
+ # Setup
+ nltk.download('punkt')
  nest_asyncio.apply()

+ # Load environment variables
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # Environment keys
  llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
  groq_api_key = os.getenv("GROQ_API_KEY")

+ # Paths
+ parsed_data_file = os.path.join("data", "parsed_data.pkl")
+ output_md = os.path.join("data", "output.md")
+ md_directory = "data"
+ collection_name = "rag"

+ # Helper: Load or parse PDF
+ def load_or_parse_data(pdf_path):
+     if os.path.exists(parsed_data_file):
+         with open(parsed_data_file, "rb") as f:
              parsed_data = pickle.load(f)
      else:
+         parsing_instruction = """The provided document is a user guide or manual.
+         It contains many images and tables. Be precise while answering questions."""
+         parser = LlamaParse(api_key=llamaparse_api_key, result_type="markdown", parsing_instruction=parsing_instruction) # type: ignore
+         parsed_data = parser.load_data(pdf_path)
+
+         with open(parsed_data_file, "wb") as f:
+             pickle.dump(parsed_data, f)

      return parsed_data

+ # Main vector DB builder
+ def create_vector_database(pdf_path):
+     print("🧠 Starting vector DB creation...")
+
+     parsed_docs = load_or_parse_data(pdf_path)
+     if not parsed_docs:
+         raise ValueError("❌ No parsed documents returned from LlamaParse!")
+
+     os.makedirs(md_directory, exist_ok=True)
+
+     # Write Markdown content to file (overwrite)
+     with open(output_md, 'w', encoding='utf-8') as f:
+         for doc in parsed_docs:
+             if hasattr(doc, "text") and doc.text.strip():
+                 f.write(doc.text.strip() + "\n\n")
+
+     # Ensure .md file was written
+     if not os.path.exists(output_md) or os.path.getsize(output_md) == 0:
+         raise RuntimeError("❌ Markdown file was not created or is empty!")
+
+     # Load documents
+     try:
+         loader = DirectoryLoader(md_directory, glob="**/*.md", show_progress=True)
+         documents = loader.load()
+     except Exception as e:
+         print("⚠️ DirectoryLoader failed, falling back to TextLoader...")
+         documents = TextLoader(output_md, encoding='utf-8').load()
+
+     if not documents:
+         raise RuntimeError("❌ No documents loaded from markdown!")
+
+     # Split documents
+     splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
+     docs = splitter.split_documents(documents)
+     print(f"✅ Loaded and split {len(docs)} chunks.")
+
+     # Embedding
+     embeddings = FastEmbedEmbeddings() # type: ignore
+
+     # Create vector store
+     print("📦 Creating Qdrant vector DB...")
      qdrant = Qdrant.from_documents(
          documents=docs,
          embedding=embeddings,
+         path=os.path.join("data", "local_qdrant"),
+         collection_name=collection_name,
      )
+
+     print("✅ Vector DB created successfully.")
+     return qdrant
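
For orientation, here is a minimal driver sketch (not part of the commit) showing how the updated create_vector_database could be exercised end to end; the import path assumes src/ is importable as a package, and the PDF path and query string are placeholders.

# Hypothetical driver script; the PDF path and the query are placeholders.
from src.utils.ingest_text import create_vector_database

if __name__ == "__main__":
    # Parse the PDF (or reuse data/parsed_data.pkl if it already exists),
    # write data/output.md, chunk it, embed with FastEmbed, and persist a
    # local Qdrant collection under data/local_qdrant.
    vector_store = create_vector_database("data/manual.pdf")

    # Query the collection that was just created on disk.
    for doc in vector_store.similarity_search("How do I switch input sources?", k=3):
        print(doc.page_content[:200])

Because load_or_parse_data caches its result to data/parsed_data.pkl, deleting that file forces a fresh LlamaParse run on the next call.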