NEXAS committed on
Commit
fc2f873
·
verified ·
1 Parent(s): cb7d229

Update src/utils/ingest_text.py

Browse files
Files changed (1) hide show
  1. src/utils/ingest_text.py +17 -14
src/utils/ingest_text.py CHANGED
@@ -23,9 +23,10 @@ llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
23
  groq_api_key = os.getenv("GROQ_API_KEY")
24
 
25
  # Paths
26
- parsed_data_file = os.path.join("data", "parsed_data.pkl")
27
- output_md = os.path.join("data", "output.md")
28
- md_directory = "data"
 
29
  collection_name = "rag"
30
 
31
  # Helper: Load or parse PDF
@@ -48,47 +49,49 @@ def load_or_parse_data(pdf_path):
48
  def create_vector_database(pdf_path):
49
  print("🧠 Starting vector DB creation...")
50
 
 
 
 
 
 
51
  parsed_docs = load_or_parse_data(pdf_path)
52
  if not parsed_docs:
53
  raise ValueError("❌ No parsed documents returned from LlamaParse!")
54
 
55
- os.makedirs(md_directory, exist_ok=True)
56
-
57
- # Write Markdown content to file (overwrite)
58
  with open(output_md, 'w', encoding='utf-8') as f:
59
  for doc in parsed_docs:
60
  if hasattr(doc, "text") and doc.text.strip():
61
  f.write(doc.text.strip() + "\n\n")
62
 
63
- # Ensure .md file was written
64
  if not os.path.exists(output_md) or os.path.getsize(output_md) == 0:
65
  raise RuntimeError("❌ Markdown file was not created or is empty!")
66
 
67
- # Load documents
68
  try:
69
- loader = DirectoryLoader(md_directory, glob="**/*.md", show_progress=True)
70
  documents = loader.load()
71
  except Exception as e:
72
- print("⚠️ DirectoryLoader failed, falling back to TextLoader...")
73
  documents = TextLoader(output_md, encoding='utf-8').load()
74
 
75
  if not documents:
76
  raise RuntimeError("❌ No documents loaded from markdown!")
77
 
78
- # Split documents
79
  splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
80
  docs = splitter.split_documents(documents)
81
  print(f"✅ Loaded and split {len(docs)} chunks.")
82
 
83
- # Embedding
84
  embeddings = FastEmbedEmbeddings() # type: ignore
85
 
86
- # Create vector store
87
  print("📦 Creating Qdrant vector DB...")
88
  qdrant = Qdrant.from_documents(
89
  documents=docs,
90
  embedding=embeddings,
91
- path=os.path.join("data", "local_qdrant"),
92
  collection_name=collection_name,
93
  )
94
 
 
23
  groq_api_key = os.getenv("GROQ_API_KEY")
24
 
25
  # Paths
26
+ data_dir = "data"
27
+ parsed_data_file = os.path.join(data_dir, "parsed_data.pkl")
28
+ output_md = os.path.join(data_dir, "output.md")
29
+ qdrant_dir = os.path.join(data_dir, "local_qdrant")
30
  collection_name = "rag"
31
 
32
  # Helper: Load or parse PDF
 
49
  def create_vector_database(pdf_path):
50
  print("🧠 Starting vector DB creation...")
51
 
52
+ # Ensure directories exist
53
+ os.makedirs(data_dir, exist_ok=True)
54
+ os.makedirs(qdrant_dir, exist_ok=True)
55
+
56
+ # Parse PDF
57
  parsed_docs = load_or_parse_data(pdf_path)
58
  if not parsed_docs:
59
  raise ValueError("❌ No parsed documents returned from LlamaParse!")
60
 
61
+ # Write Markdown content
 
 
62
  with open(output_md, 'w', encoding='utf-8') as f:
63
  for doc in parsed_docs:
64
  if hasattr(doc, "text") and doc.text.strip():
65
  f.write(doc.text.strip() + "\n\n")
66
 
 
67
  if not os.path.exists(output_md) or os.path.getsize(output_md) == 0:
68
  raise RuntimeError("❌ Markdown file was not created or is empty!")
69
 
70
+ # Load .md as documents
71
  try:
72
+ loader = DirectoryLoader(data_dir, glob="**/*.md", show_progress=True)
73
  documents = loader.load()
74
  except Exception as e:
75
+ print(f"⚠️ DirectoryLoader failed: {e}. Falling back to TextLoader...")
76
  documents = TextLoader(output_md, encoding='utf-8').load()
77
 
78
  if not documents:
79
  raise RuntimeError("❌ No documents loaded from markdown!")
80
 
81
+ # Chunk documents
82
  splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
83
  docs = splitter.split_documents(documents)
84
  print(f"✅ Loaded and split {len(docs)} chunks.")
85
 
86
+ # Embeddings
87
  embeddings = FastEmbedEmbeddings() # type: ignore
88
 
89
+ # Create Qdrant vector DB
90
  print("📦 Creating Qdrant vector DB...")
91
  qdrant = Qdrant.from_documents(
92
  documents=docs,
93
  embedding=embeddings,
94
+ path=qdrant_dir,
95
  collection_name=collection_name,
96
  )
97