Spaces:
Runtime error
Runtime error
File size: 2,137 Bytes
77a8320 a602362 d1608e1 b1db484 a602362 d1608e1 a602362 6a8def7 8eb4c46 6a8def7 d1608e1 6a8def7 a602362 d1608e1 7565a64 a84f227 090ffa5 d1608e1 7565a64 a602362 7565a64 8eb4c46 c40364d 874f99e c40364d 7565a64 2d1dde1 8eb4c46 77a8320 6a8def7 77a8320 6a8def7 a602362 7565a64 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
from datasets import load_dataset, concatenate_datasets
from datasets import Dataset
from langchain.docstore.document import Document as LangchainDocument
from sentence_transformers import SentenceTransformer
#from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from sentence_transformers import SentenceTransformer
from huggingface_hub import Repository, upload_file
from datasets import Dataset
import pandas as pd
import os
DATA_PATH='./data'
HF_TOKEN = os.getenv('HF_Token')
#dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
##url = "https://www.webmd.com/"
#loader = WebBaseLoader(url)
#document = loader.load()
def create_vector_db(data_path=DATA_PATH, source_repo="Namitg02/ADASOF24", target_repo="Namitg02/Test"):
    """Build a chunked-text dataset with sentence embeddings and push it to the HF Hub.

    Loads every ``*.txt`` file under ``data_path``, splits the documents into
    overlapping chunks, embeds each chunk with ``all-MiniLM-L6-v2``, merges the
    result with the existing dataset at ``source_repo``, and pushes the
    combined dataset to ``target_repo``.

    Args:
        data_path: Directory containing the ``.txt`` files to ingest.
        source_repo: Hub dataset repo concatenated with the new chunks.
        target_repo: Hub dataset repo the combined dataset is pushed to.

    Returns:
        The combined ``datasets.Dataset`` that was pushed.
    """
    loader = DirectoryLoader(data_path, glob='*.txt', loader_cls=TextLoader, show_progress=True)
    documents = loader.load()

    # Split the documents into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)

    # Debug peek at a few sample chunks — guarded, because the original
    # indexed texts[1]/texts[3]/texts[17] unconditionally and would
    # IndexError on a small corpus.
    for idx in (1, 3, 17):
        if idx < len(texts):
            print(texts[idx])

    pd.options.display.max_colwidth = 400

    # Take the chunk text straight from each Document instead of the original
    # round-trip (DataFrame of Documents -> astype(str) -> str[18:] / str[:-2]
    # to peel off "('page_content', '…')"), which breaks whenever the repr
    # escapes a quote inside the text. Column 0 is kept to preserve the
    # original dataset schema.
    df = pd.DataFrame({0: [doc.page_content for doc in texts]})
    df[0] = df[0].astype('string', errors='raise')
    print(df.dtypes)

    # Encode all chunks in one call: SentenceTransformer.encode batches
    # internally, which is much faster than one encode() per row via apply.
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedding_model.encode(df[0].tolist())
    df['embeddings'] = list(embeddings)

    datasettextfile = Dataset.from_pandas(df)
    print("check2b")
    if len(datasettextfile) > 3:
        print(datasettextfile[3])

    # Merge with the existing PDF-derived dataset and publish.
    datapdf = load_dataset(source_repo, split='train', streaming=False)
    dataset_combine = concatenate_datasets([datasettextfile, datapdf])
    dataset_combine.push_to_hub(target_repo, token=HF_TOKEN)
    return dataset_combine
if __name__ == "__main__":
    # Entry point: build the embedding dataset and publish it to the Hub.
    # (The stray trailing "|" after create_vector_db() — a paste/scrape
    # artifact that made the file a syntax error — has been removed.)
    print("check31")
    create_vector_db()