Spaces:
Running
Running
File size: 2,464 Bytes
b296661 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from uuid import uuid4
import json
import os
from dotenv import load_dotenv
import sys
import time
# Load variables from a local .env file into the process environment so that
# PINECONE_API_KEY can be read below.
load_dotenv()

# Command-line contract: BEGIN END PATH INDEX_NAME.
# Fail with a readable usage message instead of a bare IndexError.
if len(sys.argv) < 5:
    sys.exit(f"usage: {sys.argv[0]} BEGIN END PATH INDEX_NAME")

# BEGIN/END bound the range of items to embed; PATH is the JSON file to read.
BEGIN = int(sys.argv[1])
END = int(sys.argv[2])
PATH = sys.argv[3]

# Pinecone setup
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = sys.argv[4]
index = pc.Index(INDEX_NAME)

print("Loading JSON...")
# Use a context manager so the file handle is closed as soon as parsing ends;
# JSON is read as UTF-8 rather than the platform's default encoding.
with open(PATH, encoding="utf-8") as json_file:
    meta = json.load(json_file)

model_name = "sentence-transformers/all-MiniLM-L6-v2"
# NOTE(review): model_kwargs and encode_kwargs are defined but never passed to
# HuggingFaceEmbeddings below. They are left untouched here because wiring them
# in would presumably move inference to CUDA -- confirm intent before changing.
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}

print("Initializing Pinecone index...")
# Reuse model_name instead of repeating the literal so the two cannot drift.
embeddings = HuggingFaceEmbeddings(model_name=model_name)
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

# Splitter used for long field values; tries paragraph, line, word and finally
# character boundaries, in that order.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)
# Attribute keys whose values get embedded. Any attribute whose key contains
# "note" is embedded as well (see the membership test in the loop below).
fields = ['abstract_tsi','title_info_primary_tsi','title_info_primary_subtitle_tsi', 'title_info_alternative_tsim']

print("Beginning Embeddings...")
start = time.time()

# Flatten the "data" list of every page into one list of items.
full_data = []
for page in meta:
    full_data += page['data']

# Select the items to process from the flattened list. The previous code
# sliced only the last page's data, so BEGIN/END were not applied across all
# pages even though full_data had been built for that purpose.
# BEGIN > END means "from BEGIN to the end of the data".
if BEGIN > END:
    selected_items = full_data[BEGIN:]
else:
    selected_items = full_data[BEGIN:END]

for num, item in enumerate(selected_items):
    item_id = item["id"]
    item_data = item["attributes"]
    print(item_id, time.time())

    # Build one Document per chunk of every selected attribute of this item.
    documents = []
    for field in item_data:
        if (field in fields) or ("note" in field):
            entry = str(item_data[field])
            # Values longer than 1000 characters (the same value as the text
            # splitter's chunk_size) are split; shorter ones are stored whole.
            if len(entry) > 1000:
                chunks = text_splitter.split_text(entry)
            else:
                chunks = [entry]
            documents.extend(
                Document(page_content=chunk, metadata={"source": item_id, "field": field})
                for chunk in chunks
            )

    # Periodic progress line, emitted for every 1000th item.
    if num % 1000 == 0:
        print(num, f"Added vectors to vectorstore at {time.time()} on id {item_id}")
    print(documents)

    # Skip the upsert when the item yielded nothing to embed.
    if documents:
        uuids = [str(uuid4()) for _ in documents]
        vector_store.add_documents(documents=documents, ids=uuids)

end = time.time()
print(f"Embedded all documents in {end-start} seconds...")
|