import glob
import os
import pickle
import pandas as pd
import tiktoken
from bs4 import BeautifulSoup
from openai.embeddings_utils import get_embedding
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002


def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
    """Parse all HTML files in `root_dir` and extract all sections.

    Sections correspond to h2 HTML tags; any section longer than
    `max_section_length` is recursively split into its subsections
    (h3, then h4, and so on).
    """
    files = glob.glob("*.html", root_dir=root_dir)
    selector = "section > section"

    # Recurse into nested <section> tags until sections are small enough
    def get_all_subsections(soup, selector: str) -> list[str]:
        found = soup.select(selector)
        data = [x.text.split(";")[-1].strip() for x in found]
        sections = []
        for i, section in enumerate(data):
            if len(section) > max_section_length:
                sections.extend(get_all_subsections(found[i], selector + " > section"))
            else:
                sections.append(section)
        return sections

    sections = []
    for file in files:
        filepath = os.path.join(root_dir, file)
        with open(filepath, "r") as f:
            source = f.read()
        soup = BeautifulSoup(source, "html.parser")
        sections.extend(get_all_subsections(soup, selector))
    return sections
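
# A minimal usage sketch (assuming `root_dir` points at rendered HTML docs
# whose content is nested as <section> elements within <section> elements):
#
#   sections = get_all_sections("/path/to/html/output/")
#   print(f"{len(sections)} sections, longest is {max(map(len, sections))} chars")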


def write_sections(filepath: str, sections: list[str]) -> None:
    """Pickle the extracted sections to disk."""
    with open(filepath, "wb") as f:
        pickle.dump(sections, f)


def read_sections(filepath: str) -> list[str]:
    """Load previously pickled sections from disk."""
    with open(filepath, "rb") as f:
        sections = pickle.load(f)
    return sections
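
# Round-trip sketch (hypothetical path):
#
#   write_sections("/tmp/sections.pkl", ["section one", "section two"])
#   assert read_sections("/tmp/sections.pkl") == ["section one", "section two"]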


def load_documents(fname: str) -> pd.DataFrame:
    """Load pickled sections into a DataFrame with a single `documents` column."""
    with open(fname, "rb") as fp:
        documents = pickle.load(fp)
    df = pd.DataFrame()
    df["documents"] = documents
    return df


def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
    """Add an `n_tokens` column with each document's token count."""
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    df["n_tokens"] = df.documents.apply(lambda x: len(encoding.encode(x)))
    return df
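
# For reference, this is what the tokenizer does under the hood -- a quick
# sketch, not part of the pipeline:
#
#   encoding = tiktoken.get_encoding("cl100k_base")
#   token_ids = encoding.encode("Hello world")  # list[int] of token ids
#   n_tokens = len(token_ids)                   # the count stored in `n_tokens`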


def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    """Add an `embedding` column by embedding each document with the OpenAI API."""
    df["embedding"] = df.documents.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
    return df
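
# Note: `get_embedding` calls the OpenAI API, so an API key must be configured
# (e.g. via the OPENAI_API_KEY environment variable) before running this.
# Each document is a separate request, so a large corpus may be slow and can
# run into rate limits.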


def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
    """Load all documents, precompute their embeddings, and save them to a CSV."""
    df = load_documents(filepath)
    df = compute_n_tokens(df)
    df = precompute_embeddings(df)
    df.to_csv(output_csv)
    return df
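
# To reload the CSV later, note that `to_csv` serializes the embedding lists
# as strings; a sketch of parsing them back (assuming `import ast` at the top):
#
#   df = pd.read_csv("data/document_embeddings.csv", index_col=0)
#   df["embedding"] = df.embedding.apply(ast.literal_eval)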


if __name__ == "__main__":
    root_dir = "/home/hadrien/perso/mila-docs/output/"
    save_filepath = os.path.join(root_dir, "sections.pkl")

    # Extract the sections and save them to disk
    sections = get_all_sections(root_dir)
    write_sections(save_filepath, sections)

    # Load them back from disk
    sections = read_sections(save_filepath)

    # Precompute the document embeddings
    df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")