Spaces:

NEXAS
/

multimodal

Running

App Files Community

NEXAS committed 16 days ago

Commit

fc2f873

verified ·

1 Parent(s): cb7d229

Update src/utils/ingest_text.py

Browse files

Files changed (1) hide show

src/utils/ingest_text.py +17 -14

src/utils/ingest_text.py CHANGED Viewed

@@ -23,9 +23,10 @@ llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
 groq_api_key = os.getenv("GROQ_API_KEY")
 # Paths
-parsed_data_file = os.path.join("data", "parsed_data.pkl")
-output_md = os.path.join("data", "output.md")
-md_directory = "data"
 collection_name = "rag"
 # Helper: Load or parse PDF
@@ -48,47 +49,49 @@ def load_or_parse_data(pdf_path):
 def create_vector_database(pdf_path):
     print("🧠 Starting vector DB creation...")
     parsed_docs = load_or_parse_data(pdf_path)
     if not parsed_docs:
         raise ValueError("❌ No parsed documents returned from LlamaParse!")
-    os.makedirs(md_directory, exist_ok=True)
-    # Write Markdown content to file (overwrite)
     with open(output_md, 'w', encoding='utf-8') as f:
         for doc in parsed_docs:
             if hasattr(doc, "text") and doc.text.strip():
                 f.write(doc.text.strip() + "\n\n")
-    # Ensure .md file was written
     if not os.path.exists(output_md) or os.path.getsize(output_md) == 0:
         raise RuntimeError("❌ Markdown file was not created or is empty!")
-    # Load documents
     try:
-        loader = DirectoryLoader(md_directory, glob="**/*.md", show_progress=True)
         documents = loader.load()
     except Exception as e:
-        print("⚠️ DirectoryLoader failed, falling back to TextLoader...")
         documents = TextLoader(output_md, encoding='utf-8').load()
     if not documents:
         raise RuntimeError("❌ No documents loaded from markdown!")
-    # Split documents
     splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
     docs = splitter.split_documents(documents)
     print(f"✅ Loaded and split {len(docs)} chunks.")
-    # Embedding
     embeddings = FastEmbedEmbeddings()  # type: ignore
-    # Create vector store
     print("📦 Creating Qdrant vector DB...")
     qdrant = Qdrant.from_documents(
         documents=docs,
         embedding=embeddings,
-        path=os.path.join("data", "local_qdrant"),
         collection_name=collection_name,
     )

 groq_api_key = os.getenv("GROQ_API_KEY")
 # Paths
+data_dir = "data"
+parsed_data_file = os.path.join(data_dir, "parsed_data.pkl")
+output_md = os.path.join(data_dir, "output.md")
+qdrant_dir = os.path.join(data_dir, "local_qdrant")
 collection_name = "rag"
 # Helper: Load or parse PDF
 def create_vector_database(pdf_path):
     print("🧠 Starting vector DB creation...")
+    # Ensure directories exist
+    os.makedirs(data_dir, exist_ok=True)
+    os.makedirs(qdrant_dir, exist_ok=True)
+    # Parse PDF
     parsed_docs = load_or_parse_data(pdf_path)
     if not parsed_docs:
         raise ValueError("❌ No parsed documents returned from LlamaParse!")
+    # Write Markdown content
     with open(output_md, 'w', encoding='utf-8') as f:
         for doc in parsed_docs:
             if hasattr(doc, "text") and doc.text.strip():
                 f.write(doc.text.strip() + "\n\n")
     if not os.path.exists(output_md) or os.path.getsize(output_md) == 0:
         raise RuntimeError("❌ Markdown file was not created or is empty!")
+    # Load .md as documents
     try:
+        loader = DirectoryLoader(data_dir, glob="**/*.md", show_progress=True)
         documents = loader.load()
     except Exception as e:
+        print(f"⚠️ DirectoryLoader failed: {e}. Falling back to TextLoader...")
         documents = TextLoader(output_md, encoding='utf-8').load()
     if not documents:
         raise RuntimeError("❌ No documents loaded from markdown!")
+    # Chunk documents
     splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
     docs = splitter.split_documents(documents)
     print(f"✅ Loaded and split {len(docs)} chunks.")
+    # Embeddings
     embeddings = FastEmbedEmbeddings()  # type: ignore
+    # Create Qdrant vector DB
     print("📦 Creating Qdrant vector DB...")
     qdrant = Qdrant.from_documents(
         documents=docs,
         embedding=embeddings,
+        path=qdrant_dir,
         collection_name=collection_name,
     )