# File size: 2,008 Bytes
# 8cb8290 a106116 8cb8290
# (file-viewer line-number gutter 1-59 omitted)
import logging
logger = logging.getLogger()
import os
from langchain.docstore.document import Document
from langchain.text_splitter import NLTKTextSplitter
from langchain.callbacks import get_openai_callback
from config import State
# Name of the directory that holds the input files.
FILE_DIR = 'files'
# URL template for a course page; takes `course_code` and `language` placeholders.
KURS_URL = "https://www.kth.se/student/kurser/kurs/{course_code}?l={language}"
# Language sub-folder that ingest() reads its files from.
DEFAULT_LANGUAGE = "en"
# Passed as `chunk_size` to the text splitter (presumably measured in tokens,
# since the splitter is built with from_tiktoken_encoder -- TODO confirm).
CHUNK_SIZE = 1000
def ingest(state: State):
    """Split the stored course files into chunks and add them to ``state.store``.

    Reads every regular file in ``<FILE_DIR>/<DEFAULT_LANGUAGE>``, wraps its
    text in a ``Document`` tagged with the course code taken from the file
    name, splits the documents with an NLTK/tiktoken text splitter and passes
    all resulting chunks to ``state.store.add_documents``. The total OpenAI
    cost reported by the callback is logged at the end.

    File names are expected to look like ``<course_code>?l=<language>.<ext>``.
    Entries that are not regular files, or whose name lacks the ``?l=``
    separator, are skipped with a warning instead of aborting the whole run.

    Args:
        state: Application state; only its ``store`` attribute is used here
            and it must provide ``add_documents``.

    Returns:
        None. If the current working directory is not named ``kth_qa`` an
        error is logged and nothing is ingested.
    """
    with get_openai_callback() as cb:
        # The folder path below is relative, so it only resolves correctly
        # when the process runs from the kth_qa directory.
        pwd = os.getcwd()
        if os.path.basename(pwd) != 'kth_qa':
            logger.error("pwd is not kth_qa, but %s. Please run from kth_qa directory.", pwd)
            return

        text_splitter = NLTKTextSplitter.from_tiktoken_encoder(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=100,
        )

        file_folder_name = os.path.join(FILE_DIR, DEFAULT_LANGUAGE)
        all_langdocs = []
        for file_name in os.listdir(file_folder_name):
            file_path = os.path.join(file_folder_name, file_name)
            if not os.path.isfile(file_path):
                logger.warning("skipping %s: not a regular file", file_path)
                continue

            # Everything before the first '.' is "<course_code>?l=<language>".
            stem = file_name.split('.')[0]
            course_code, separator, _language = stem.partition('?l=')
            if not separator:
                logger.warning("skipping %s: file name has no '?l=' part", file_path)
                continue

            # Read as UTF-8 explicitly rather than relying on the platform
            # default encoding.
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            logger.debug("loaded file %s", file_name)

            doc = Document(page_content=text, metadata={"course": course_code})
            langdocs = text_splitter.split_documents([doc])
            logger.debug("split documents into %d chunks", len(langdocs))
            all_langdocs.extend(langdocs)

        logger.info("split all documents into %d chunks", len(all_langdocs))
        logger.info("Adding documents to pinecone...")
        state.store.add_documents(all_langdocs)
        logger.info("...done!")
        logger.info("Total cost of openai api calls: %s", cb.total_cost)
if __name__ == "__main__":
    # Script entry point: configure logging, build the application state and
    # run the ingestion once.
    logging.basicConfig(level=logging.INFO)
    # Set the level explicitly as well: basicConfig() does nothing when the
    # root logger already has handlers configured.
    logger.setLevel(logging.INFO)
    state = State()
    ingest(state)