import os
import pickle

import chromadb
import nest_asyncio

# Ingest Text
from llama_parse import LlamaParse
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter

nest_asyncio.apply()

path = "mm_vdb2"
client = chromadb.PersistentClient(path=path)

llamaparse_api_key = "llx-qXMliHH4UOphFaahO8HEqR5wOj1U6T7oxqC4DoLiik7UvKkJ"
groq_api_key = "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"

parsed_data_file = r"parsed_data.pkl"
output_md = r"output.md"
loki = r"data"


# Define a function to load parsed data if available, or parse if not
def load_or_parse_data(loc):
    data_file = parsed_data_file

    if os.path.exists(data_file):
        # Load the previously parsed data from the cache file
        with open(data_file, "rb") as f:
            parsed_data = pickle.load(f)
    else:
        # Perform the parsing step and store the result in llama_parse_documents
        parsingInstructiontest10k = """The provided document is a user guide or a manual.
        It contains many images and tables. Try to be precise while answering the questions."""
        parser = LlamaParse(
            api_key=llamaparse_api_key,
            result_type="markdown",
            parsing_instruction=parsingInstructiontest10k,
        )  # type: ignore
        llama_parse_documents = parser.load_data(loc)

        # Save the parsed data to a file so later runs can skip parsing
        with open(data_file, "wb") as f:
            pickle.dump(llama_parse_documents, f)

        parsed_data = llama_parse_documents

    return parsed_data


# Create vector database
def create_vector_database(loc):
    """
    Creates a vector database from a PDF document.

    This function loads the PDF page by page, splits each page into chunks,
    and persists the chunks into a Chroma text collection, where embeddings
    are computed by Chroma's default embedding function.
    """
    # Load the PDF; this returns one document per page
    loader = PyMuPDFLoader(file_path=loc)
    docs = loader.load()
    print(f"Number of documents: {len(docs)}")

    print("Vector DB started!")

    # Initialize lists for chunk contents and their matching IDs
    document_contents = []
    ids = []

    # Split page content into 1000-character chunks with a 200-character overlap
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)

    # Generate unique IDs for each chunk, keyed by the PDF page number
    for i, doc in enumerate(docs):
        # Print metadata to understand its structure
        print(f"Metadata for document {i + 1}: {doc.metadata}")

        # Try to extract the page number from metadata, or fall back to a unique default
        page_num = doc.metadata.get("page_number", f"unknown_{i + 1}")

        # Extract the text of the page and split it into chunks
        page_content = doc.page_content
        doc_chunks = text_splitter.split_text(page_content)

        # Add chunk contents and corresponding page-based IDs
        for chunk_idx, chunk in enumerate(doc_chunks):
            document_contents.append(chunk)  # Add the chunk content
            ids.append(f"page_{page_num}_chunk_{i + 1}_{chunk_idx + 1}")  # Add a unique chunk ID

    # Ensure the number of IDs matches the number of chunk contents
    assert len(ids) == len(document_contents), "Mismatch between number of ids and document contents"

    # Create or get the text collection
    text_collection = client.get_or_create_collection(name="text_collection")

    # Add chunk contents and their IDs to the collection; Chroma embeds them
    # with its default embedding function
    text_collection.add(
        documents=document_contents,  # All the chunk-level content
        ids=ids,                      # Matching IDs for each chunk content
    )
    print("Vector DB created successfully!")

    return text_collection
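

# A minimal usage sketch, not part of the original script: it assumes a local PDF
# exists at a hypothetical path ("data/manual.pdf") and shows the ingestion running
# end to end, followed by a sample similarity query against the Chroma collection.
if __name__ == "__main__":
    pdf_path = "data/manual.pdf"  # hypothetical path; adjust to your own document
    collection = create_vector_database(pdf_path)

    # Query the collection; Chroma embeds the query text with the same default
    # embedding function used at insertion time
    results = collection.query(query_texts=["How do I reset the device?"], n_results=3)
    print(results["ids"])
    print(results["documents"])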