File size: 855 Bytes
7976e52
0619ee4
 
 
 
7976e52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(model_name='BAAI/bge-large-en')

print(embeddings)

loader = DirectoryLoader('Data/', glob='110106081.pdf', show_progress=True, loader_cls=PyPDFLoader)\

documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

texts = text_splitter.split_documents(documents)

url = "http://localhost:6333/"

qdrant = Qdrant.from_documents(texts, embeddings, url=url, prefer_grpc=False, collection_name="patent_database")

print("Vector Database created")