import os

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

from config import PERSIST_DIRECTORY


def _highlighted_words_to_str(highlighted_words):
    """Flatten the ``snippet_highlighted_words`` field into a single string.

    Lists are joined with ``", "`` (elements are stringified first so a
    non-string entry cannot break the join), ``None`` becomes the empty
    string, and any other value is converted with ``str()``.
    """
    if highlighted_words is None:
        return ""
    if isinstance(highlighted_words, list):
        return ", ".join(str(word) for word in highlighted_words)
    return str(highlighted_words)


def _build_document(item):
    """Convert one result dictionary into a ``Document``.

    Returns ``None`` when the item has no (or an empty) ``"snippet"``, so the
    caller can skip it.
    """
    content = item.get("snippet", "")
    if not content:
        return None

    words = _highlighted_words_to_str(item.get("snippet_highlighted_words", []))
    if words:
        content += f" Highlighted words: {words}"

    metadata = {
        "position": item.get("position"),
        "title": item.get("title"),
        "link": item.get("link"),
        "source": item.get("source"),
        "displayed_link": item.get("displayed_link"),
        # Same flattened string that is appended to the page content, so the
        # metadata never carries a raw list/dict/None value.
        "highlighted_words": words,
    }
    # Drop fields the item did not provide instead of storing None.
    # NOTE(review): Chroma metadata is expected to hold scalar values and some
    # releases reject None outright -- confirm against the pinned version.
    metadata = {key: value for key, value in metadata.items() if value is not None}

    return Document(page_content=content, metadata=metadata)


def process_safety_with_chroma(data) -> Chroma:
    """Process structured JSON data and store it in ChromaDB.

    Each item with a non-empty ``"snippet"`` becomes one document whose page
    content is the snippet (followed by its highlighted words, if any) and
    whose metadata holds the item's position, title, link, source,
    displayed link and flattened highlighted words.

    Args:
        data (list): A list of dictionaries containing structured JSON data.
            ``None`` or an empty list is treated as "nothing to add".

    Returns:
        Chroma: The Chroma vector store object backed by ``PERSIST_DIRECTORY``.
    """
    documents = [
        document
        for document in map(_build_document, data or [])
        if document is not None
    ]

    # Initialize embeddings and Chroma store
    embeddings = OpenAIEmbeddings()

    if not documents:
        # Nothing to index: open the persisted store directly rather than
        # handing an empty batch to Chroma.from_documents.
        return Chroma(
            embedding_function=embeddings,
            persist_directory=PERSIST_DIRECTORY,
        )

    return Chroma.from_documents(
        documents,
        embeddings,
        persist_directory=PERSIST_DIRECTORY,
    )