from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from datasets import Dataset
import pandas as pd
import os

# Hugging Face token for pushing the dataset to the Hub
HF_TOKEN = os.getenv("HF_Token")

# Load the blog page as a LangChain document
url = "https://oxyjon.com/blog/"
loader = WebBaseLoader(url)
document = loader.load()


def create_vector_db():
    # Split the document into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
    texts = text_splitter.split_documents(document)

    # Inspect a few sample chunks
    print(texts[1])
    print(texts[3])
    print(texts[17])

    # Dataset.from_pandas cannot serialize raw Document objects,
    # so build the DataFrame from the chunk text explicitly
    df = pd.DataFrame({"text": [doc.page_content for doc in texts]})
    print(list(df.columns.values))

    pd.options.display.max_colwidth = 400
    print(df.iloc[[3]])
    print(df.iloc[[17]])

    # Convert to a Hugging Face dataset and push it to the Hub
    dataset = Dataset.from_pandas(df)
    print(dataset[3])
    dataset.push_to_hub("Namitg02/Test", token=HF_TOKEN)


if __name__ == "__main__":
    create_vector_db()
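
# The original script imports SentenceTransformer (twice) but never uses it,
# and the function is named create_vector_db even though no vector index is
# actually built. Below is a minimal sketch of the embedding step the script
# seems to be building toward. The model name "all-MiniLM-L6-v2" and the
# "embeddings" column name are assumptions, not taken from the original, and
# add_faiss_index requires the faiss-cpu package to be installed.
from sentence_transformers import SentenceTransformer


def build_faiss_index(dataset):
    # Encode each text chunk into a dense vector (assumed model choice)
    model = SentenceTransformer("all-MiniLM-L6-v2")
    dataset = dataset.map(lambda row: {"embeddings": model.encode(row["text"])})
    # datasets can attach a FAISS index directly to an embedding column
    dataset.add_faiss_index(column="embeddings")
    return dataset


# Hypothetical usage against the dataset built in create_vector_db:
# indexed = build_faiss_index(dataset)
# model = SentenceTransformer("all-MiniLM-L6-v2")
# scores, examples = indexed.get_nearest_examples(
#     "embeddings", model.encode("your question here"), k=5
# )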