File size: 1,176 Bytes
15592c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import os
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, TextLoader
from PyPDF2 import PdfReader
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (python-dotenv). create_index later reads OPENAI_API_KEY via os.getenv.
load_dotenv()
def create_index(file_path: str) -> None:
    """Build and persist a Chroma vector index from the text of a PDF.

    The PDF at *file_path* is read page by page, the extracted text is
    written to ``output.txt`` in the current working directory, and then
    every ``.txt`` file under the working directory is loaded, split into
    overlapping chunks and stored in a Chroma database persisted in the
    ``db`` directory.

    Args:
        file_path: Path of the PDF file whose text should be indexed.
    """
    reader = PdfReader(file_path)
    # Join once instead of repeated ``+=``; ``or ''`` keeps a page that
    # yields no text (None or empty) from raising a TypeError.
    text = ''.join(page.extract_text() or '' for page in reader.pages)

    # Write with an explicit encoding so glyphs commonly found in PDFs
    # (ligatures, bullets, ...) do not fail on non-UTF-8 default locales.
    with open('output.txt', 'w', encoding='utf-8') as file:
        file.write(text)

    # NOTE(review): the glob matches every .txt file under the working
    # directory, not only output.txt, so unrelated text files are indexed
    # as well. Kept as-is for backward compatibility; confirm the intent.
    loader = DirectoryLoader(
        './',
        glob='**/*.txt',
        loader_cls=TextLoader,
        # Read back with the same encoding used for the write above.
        loader_kwargs={'encoding': 'utf-8'},
    )
    documents = loader.load()

    # Split on newlines into chunks of up to 1024 characters that overlap
    # by 128 characters.
    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=1024,
        chunk_overlap=128,
    )
    texts = text_splitter.split_documents(documents)

    embeddings = OpenAIEmbeddings(
        openai_api_key=os.getenv('OPENAI_API_KEY'),
    )

    persist_directory = 'db'
    vectordb = Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    vectordb.persist()
# Build the index for sample.pdf; this runs whenever the module is
# executed or imported.
create_index('sample.pdf')