# Spaces: Runtime error  (page-header residue from where this script was captured)
from datasets import load_dataset, concatenate_datasets | |
from datasets import Dataset | |
from langchain.docstore.document import Document as LangchainDocument | |
from sentence_transformers import SentenceTransformer | |
#from langchain_community.document_loaders import WebBaseLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.document_loaders import TextLoader, DirectoryLoader | |
from sentence_transformers import SentenceTransformer | |
from huggingface_hub import Repository, upload_file | |
from datasets import Dataset | |
import pandas as pd | |
import os | |
# Directory that create_vector_db() scans for ``*.txt`` source documents.
DATA_PATH = './data'
# Token passed to ``push_to_hub``; read from the ``HF_Token`` environment
# variable and ``None`` when that variable is not set.
HF_TOKEN = os.getenv('HF_Token')
#dataset = load_dataset("Namitg02/Test", split='train', streaming=False) | |
##url = "https://www.webmd.com/" | |
#loader = WebBaseLoader(url) | |
#document = loader.load() | |
def create_vector_db():
    """Embed the local text corpus and publish it to the Hugging Face Hub.

    Pipeline:
      1. Load every ``*.txt`` file found under ``DATA_PATH``.
      2. Split the documents into chunks (256 characters, 50 overlap).
      3. Embed each chunk with the ``all-MiniLM-L6-v2`` sentence transformer.
      4. Concatenate the result with the ``train`` split of
         ``Namitg02/ADASOF24`` and push the combined dataset to
         ``Namitg02/Test`` using ``HF_TOKEN``.

    Progress/debug samples are printed along the way.

    Raises:
        ValueError: if no text chunks could be produced from ``DATA_PATH``.
    """
    loader = DirectoryLoader(DATA_PATH, glob='*.txt', loader_cls=TextLoader, show_progress=True)
    documents = loader.load()

    # Split the documents into chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    if not texts:
        raise ValueError(f"No text chunks were produced from {DATA_PATH!r}")

    # Debug samples. The indices are the ones the script has always printed;
    # they are skipped when the corpus is too small instead of raising
    # IndexError.
    sample_rows = [idx for idx in (1, 3, 17) if idx < len(texts)]
    for idx in sample_rows:
        print(texts[idx])

    # Read the chunk text straight from ``page_content`` rather than slicing
    # the string form of a ``('page_content', ...)`` tuple: that depended on
    # the Document field order and stored repr-escaped text.
    # The column keeps the label 0 that the previous implementation produced,
    # so the schema handed to ``Dataset.from_pandas`` is unchanged.
    df = pd.DataFrame({0: [doc.page_content for doc in texts]})
    print(list(df.columns.values))
    pd.options.display.max_colwidth = 400
    df[0] = df[0].astype('string', errors='raise')
    print(df.dtypes)
    if 3 in sample_rows:
        print(df.iloc[[3]])

    # Encode all chunks in one batched call instead of one call per row.
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedding_model.encode(df[0].tolist())
    df['embeddings'] = list(embeddings)  # one vector per row
    if 17 in sample_rows:
        print(df.iloc[[17]])

    datasettextfile = Dataset.from_pandas(df)
    print("check2b")
    if 3 in sample_rows:
        print(datasettextfile[3])

    # NOTE(review): concatenation presumably requires the remote dataset to
    # have the same columns as the one built here - confirm against the Hub.
    datapdf = load_dataset("Namitg02/ADASOF24", split='train', streaming=False)
    dataset_combine = concatenate_datasets([datasettextfile, datapdf])
    dataset_combine.push_to_hub("Namitg02/Test", token=HF_TOKEN)
# Script entry point: build and publish the dataset only when executed
# directly, not when the module is imported.
if __name__ == "__main__":
    print("check31")
    create_vector_db()