import os
import shutil
from pathlib import Path

import lancedb
import numpy as np
import openai
import pandas as pd
import pyarrow as pa
import tqdm
from matplotlib import pyplot as plt

from gradio_app.backend.embedders import EmbedderFactory
from markdown_to_text import *
from settings import *

# Read the OpenAI API key from a local file kept out of version control
with open('data/openaikey.txt') as f:
    OPENAI_KEY = f.read().strip()
openai.api_key = OPENAI_KEY

# Rebuild the vector store from scratch on every run
shutil.rmtree(LANCEDB_DIRECTORY, ignore_errors=True)
db = lancedb.connect(LANCEDB_DIRECTORY)

batch_size = 32  # chunks sent to the embedder per request

# One row per chunk: embedding vector, chunk text, and source document path.
# The vector length must match the embedding model, hence emb_sizes[EMBED_NAME].
schema = pa.schema([
    pa.field(VECTOR_COLUMN_NAME, pa.list_(pa.float32(), emb_sizes[EMBED_NAME])),
    pa.field(TEXT_COLUMN_NAME, pa.string()),
    pa.field(DOCUMENT_PATH_COLUMN_NAME, pa.string()),
])
tbl = db.create_table(LANCEDB_TABLE_NAME, schema=schema, mode="overwrite")
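# No ANN index is built here: without one, LanceDB falls back to an exhaustive
# (brute-force) vector scan, which is adequate for a small documentation corpus.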

input_dir = Path(MARKDOWN_SOURCE_DIR)
files = list(input_dir.rglob("*"))

# Split every markdown file into chunks, keeping (chunk_text, source_path) pairs
chunks = []
for file in files:
    if not os.path.isfile(file):
        continue

    _, file_ext = os.path.splitext(os.path.relpath(file, input_dir))
    if file_ext != '.md':
        print(f'Skipped {file_ext} extension: {file}')
        continue

    with open(file, encoding='utf-8') as f:
        text = f.read()

    text = remove_comments(text)
    file_chunks = split_markdown(text)
    chunks.extend((chunk, os.path.abspath(file)) for chunk in file_chunks)

# Quick visual sanity check of the chunk length distribution
plt.hist([len(c) for c, _ in chunks], bins=100)
plt.show()
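
# Instantiate the embedder selected in settings (OpenAI embeddings in this setup)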
embedder = EmbedderFactory.get_embedder(EMBED_NAME)

# Embed and insert in batches; empty chunks are dropped before embedding
for i in tqdm.tqdm(range(int(np.ceil(len(chunks) / batch_size)))):
    texts, doc_paths = [], []
    for text, doc_path in chunks[i * batch_size:(i + 1) * batch_size]:
        if len(text) > 0:
            texts.append(text)
            doc_paths.append(doc_path)

    encoded = embedder.embed(texts)
    df = pd.DataFrame({
        VECTOR_COLUMN_NAME: encoded,
        TEXT_COLUMN_NAME: texts,
        DOCUMENT_PATH_COLUMN_NAME: doc_paths,
    })
    tbl.add(df)
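
# A minimal retrieval sanity check (a sketch, not part of the build; the query
# string below is a made-up example). Embeds a query with the same embedder and
# runs a vector search against the populated table:
# query_emb = embedder.embed(['how do I load a dataset?'])[0]
# hits = tbl.search(query_emb).limit(5).to_pandas()
# print(hits[TEXT_COLUMN_NAME].tolist())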