import glob
import os
import pickle

import pandas as pd
import tiktoken
from bs4 import BeautifulSoup
from openai.embeddings_utils import cosine_similarity, get_embedding

EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # the encoding used by text-embedding-ada-002


def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
    """Parse all HTML files in `root_dir`, and extract all sections.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to h2 HTML tags, and move on to h3 then h4 if needed.
    """
    files = glob.glob("*.html", root_dir=root_dir)

    selector = "section > section"

    # Recurse until sections are small enough
    def get_all_subsections(soup, selector: str) -> list[str]:
        found = soup.select(selector)
        # Keep only the text after the last ";" in each section's text
        data = [x.text.split(";")[-1].strip() for x in found]

        sections = []
        for i, section in enumerate(data):
            if len(section) > max_section_length:
                sections.extend(get_all_subsections(found[i], selector + " > section"))
            else:
                sections.append(section)

        return sections

    sections = []
    for filename in files:
        filepath = os.path.join(root_dir, filename)
        with open(filepath, "r") as f:
            source = f.read()

        soup = BeautifulSoup(source, "html.parser")
        sections.extend(get_all_subsections(soup, selector))

    return sections


def write_sections(filepath: str, sections: list[str]):
    with open(filepath, "wb") as f:
        pickle.dump(sections, f)


def read_sections(filepath: str) -> list[str]:
    with open(filepath, "rb") as fp:
        sections = pickle.load(fp)

    return sections


def load_documents(fname: str) -> pd.DataFrame:
    df = pd.DataFrame()

    with open(fname, "rb") as fp:
        documents = pickle.load(fp)
    df["documents"] = documents
    return df


def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    # Count tokens per document using the embedding model's tokenizer
    df["n_tokens"] = df.documents.apply(lambda x: len(encoding.encode(x)))
    return df


def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    df["embedding"] = df.documents.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
    return df


def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
    # Get all documents and precompute their embeddings
    df = load_documents(filepath)
    df = compute_n_tokens(df)
    df = precompute_embeddings(df)
    df.to_csv(output_csv)
    return df
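

# The `cosine_similarity` import above is otherwise unused; the helper below
# sketches one plausible use of the precomputed embeddings. It is a
# hypothetical, illustrative addition, not part of the original pipeline.
def rank_documents(df: pd.DataFrame, query: str, top_k: int = 3) -> pd.DataFrame:
    """Rank documents by cosine similarity between their embeddings and `query`.

    Assumes `df` has an "embedding" column as produced by `precompute_embeddings`.
    """
    query_embedding = get_embedding(query, engine=EMBEDDING_MODEL)
    ranked = df.copy()
    ranked["similarity"] = ranked.embedding.apply(lambda e: cosine_similarity(e, query_embedding))
    return ranked.sort_values("similarity", ascending=False).head(top_k)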


if __name__ == "__main__":
    root_dir = "/home/hadrien/perso/mila-docs/output/"
    save_filepath = os.path.join(root_dir, "sections.pkl")

    # Extract the sections and write them to disk
    sections = get_all_sections(root_dir)
    write_sections(save_filepath, sections)

    # Read the sections back from disk
    sections = read_sections(save_filepath)

    # Precompute the document embeddings
    df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")
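
    # Example usage of the hypothetical `rank_documents` helper defined above:
    # top_sections = rank_documents(df, "How do I request a GPU?", top_k=3)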