# Page header captured along with this file (not part of the program):
#   Spaces: Sleeping
#   File size: 1,001 Bytes
#   7976e52
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Qdrant
from langchain.embeddings import SentenceTransformerEmbeddings
# Ingestion script: load one PDF, split it into overlapping chunks, embed the
# chunks, and store them in a Qdrant collection.

# Embedding model applied to every chunk. Other model names that were listed
# here as commented-out alternatives:
#   'NeuML/pubmedbert-base-embeddings'
#   'mixedbread-ai/mxbai-embed-large-v1'
embeddings = SentenceTransformerEmbeddings(model_name='BAAI/bge-large-en')
print(embeddings)

# Only the file matching `glob` under Data/ is loaded; PyPDFLoader is the
# loader class used for each match.
# NOTE: the original line ended with a stray "\" that joined it to the next
# statement and made the file a SyntaxError.
loader = DirectoryLoader(
    'Data/',
    glob='110106081.pdf',
    show_progress=True,
    loader_cls=PyPDFLoader,
)
documents = loader.load()

# Split the loaded documents into chunks; consecutive chunks overlap so text
# cut at a boundary still appears intact in one of them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
texts = text_splitter.split_documents(documents)

# Address of the Qdrant server the collection is written to.
url = "http://localhost:6333/"

# Embed every chunk and upload it to the "patent_database" collection.
# prefer_grpc=False is passed through unchanged (presumably selects the
# HTTP API over gRPC -- confirm against the Qdrant client docs).
qdrant = Qdrant.from_documents(
    texts,
    embeddings,
    url=url,
    prefer_grpc=False,
    collection_name="patent_database",
)
print("Vector Database created")