File size: 2,464 Bytes
b296661
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from uuid import uuid4
import json
import os
from dotenv import load_dotenv
import sys

import time

load_dotenv()

BEGIN = int(sys.argv[1])
END = int(sys.argv[2])
PATH = sys.argv[3]

# Pinecone setup

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = sys.argv[4]
index = pc.Index(INDEX_NAME)

print("Loading JSON...")
meta = json.load(open(PATH))


model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}

print("Initializing Pinecone index...")

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = PineconeVectorStore(index=index, embedding=embeddings)



text_splitter = RecursiveCharacterTextSplitter(

    chunk_size=1000,

    chunk_overlap=100,

    length_function=len,

    separators=["\n\n", "\n", " ", ""]

)

fields = ['abstract_tsi','title_info_primary_tsi','title_info_primary_subtitle_tsi', 'title_info_alternative_tsim']


print("Beginning Embeddings...")

start = time.time()

full_data = []

for page in meta:
    content = page['data']
    full_data += content
if BEGIN > END:
    slice = content[BEGIN:]
else:
    slice = content[BEGIN:END]

num = 0

for item in slice:

    id = item["id"]
    item_data = item["attributes"]
    print(id, time.time())
    documents = []
    for field in item_data:
        if (field in fields) or ("note" in field):
            entry = str(item_data[field])
            if len(entry) > 1000:
                chunks = text_splitter.split_text(entry)
                for chunk in chunks:
                    documents.append(Document(page_content=chunk, metadata={"source": id, "field": field}))
            else:
                documents.append(Document(page_content=entry, metadata={"source": id, "field": field}))

    if num % 1000 == 0:
        print(num, f"Added vectors to vectorstore at {time.time()} on id {id}")
    print(documents)
    uuids = [str(uuid4()) for _ in range(len(documents))]
    vector_store.add_documents(documents=documents, ids=uuids)
    num += 1

end = time.time()
print(f"Embedded all documents in {end-start} seconds...")