# buster/docparser.py
import glob
import os
import pickle

import pandas as pd
import tiktoken
from bs4 import BeautifulSoup
from openai.embeddings_utils import get_embedding

EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002


def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
    """Parse all HTML files in `root_dir` and extract all sections.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to h2 HTML tags; splitting moves on to h3 then h4 tags if needed.
    """
    # Note: the `root_dir` keyword of glob.glob requires Python 3.10+.
    files = glob.glob("*.html", root_dir=root_dir)

    # Nested <section> tags hold the document's (sub)sections.
    selector = "section > section"

    # Recurse until sections are small enough
    def get_all_subsections(soup, selector: str) -> list[str]:
        found = soup.select(selector)
        data = [x.text.split(";")[-1].strip() for x in found]

        sections = []
        for i, section in enumerate(data):
            if len(section) > max_section_length:
                # Too long: descend one level and split into subsections instead.
                sections.extend(get_all_subsections(found[i], selector + " > section"))
            else:
                sections.append(section)
        return sections

    sections = []
    for file in files:
        filepath = os.path.join(root_dir, file)
        with open(filepath, "r") as f:  # renamed from `file` to avoid shadowing the loop variable
            source = f.read()

        soup = BeautifulSoup(source, "html.parser")
        sections.extend(get_all_subsections(soup, selector))
    return sections
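
# Example usage, a minimal sketch: assumes `docs/` holds Sphinx-generated HTML
# with nested <section> tags (the path is hypothetical):
#
#   sections = get_all_sections("docs/")
#   print(f"extracted {len(sections)} sections")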


def write_sections(filepath: str, sections: list[str]):
    """Pickle the list of sections to `filepath`."""
    with open(filepath, "wb") as f:
        pickle.dump(sections, f)


def read_sections(filepath: str) -> list[str]:
    """Load a previously pickled list of sections from `filepath`."""
    with open(filepath, "rb") as fp:
        sections = pickle.load(fp)
    return sections
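
# Write/read round trip, a sketch with a hypothetical path:
#
#   write_sections("sections.pkl", ["some section text"])
#   assert read_sections("sections.pkl") == ["some section text"]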


def load_documents(fname: str) -> pd.DataFrame:
    """Load pickled sections from `fname` into a single-column `documents` DataFrame."""
    df = pd.DataFrame()
    with open(fname, "rb") as fp:
        documents = pickle.load(fp)
    df["documents"] = documents
    return df


def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
    """Add an `n_tokens` column with each document's token count."""
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    df["n_tokens"] = df.documents.apply(lambda x: len(encoding.encode(x)))
    return df
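
# For reference, token counting with tiktoken looks like this (sketch;
# "Hello world" should encode to 2 tokens under cl100k_base):
#
#   encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
#   n = len(encoding.encode("Hello world"))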


def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    """Add an `embedding` column by embedding each document with the OpenAI API."""
    df["embedding"] = df.documents.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
    return df
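
# Sketch of a call: `get_embedding` hits the OpenAI API once per row, so an
# API key must be configured first (e.g. the OPENAI_API_KEY environment variable):
#
#   df = pd.DataFrame({"documents": ["some section text"]})
#   df = precompute_embeddings(df)  # adds an `embedding` column of float vectors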


def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
    """Load pickled sections, compute token counts and embeddings, and save them to CSV."""
    # Get all documents and precompute their embeddings
    df = load_documents(filepath)
    df = compute_n_tokens(df)
    df = precompute_embeddings(df)
    df.to_csv(output_csv)
    return df
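
# Reading the CSV back, a sketch: `to_csv` serializes each embedding as a
# string, so it must be parsed on load, e.g.:
#
#   import ast
#   df = pd.read_csv("data/document_embeddings.csv", index_col=0)
#   df["embedding"] = df.embedding.apply(ast.literal_eval)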


if __name__ == "__main__":
    root_dir = "/home/hadrien/perso/mila-docs/output/"
    save_filepath = os.path.join(root_dir, "sections.pkl")

    # How to write
    sections = get_all_sections(root_dir)
    write_sections(save_filepath, sections)

    # How to load
    sections = read_sections(save_filepath)

    # Precompute the document embeddings
    df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")