import os

import pandas as pd
import tiktoken
from langchain.text_splitter import MarkdownHeaderTextSplitter
from tqdm import tqdm

def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens in a string using the given tiktoken encoding."""
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))
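
# Illustrative behaviour, assuming the default "cl100k_base" encoding (used by
# OpenAI's gpt-3.5/gpt-4 family); short English words are roughly one token each:
#
#   num_tokens_from_string("hello world")  # -> 2
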
def drop_outlier_chunks(df: pd.DataFrame, max_tokens_by_chunk: int = 4500) -> pd.DataFrame:
    """Drop chunks with abnormally high token counts; they usually contain lots of links."""
    token_counts = df.content.apply(num_tokens_from_string)
    filtered_df = df[token_counts < max_tokens_by_chunk]
    outliers_df = df[token_counts >= max_tokens_by_chunk]

    print(f"Dropping {len(outliers_df)} outlier chunks")
    print(f"Dropped outliers: {outliers_df.content.to_list()}")
    return filtered_df
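
# Toy sketch of the filter on made-up data (not from the real course dump):
#
#   df = pd.DataFrame({"content": ["short chunk", "word " * 10_000]})
#   df = drop_outlier_chunks(df, max_tokens_by_chunk=4500)
#   # -> keeps only the first row; the repeated-word blob exceeds the token budget
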
def find_md_files(folder_path):
    """Recursively find .md files, extract content and use filename as title."""
    md_files = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".md"):
                file_path = os.path.join(root, file)
                title = os.path.splitext(file)[0]
                with open(file_path, "r", encoding="utf-8") as md_file:
                    content = md_file.read()
                md_files.append({"title": title, "content": content})
    return md_files
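
# Expected shape of the result, for a hypothetical notes/intro.md:
#
#   find_md_files("notes/")
#   # -> [{"title": "intro", "content": "# Intro\n..."}]
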
def split_string_by_max_words(input_string, max_words):
    """Split a string into chunks of at most max_words whitespace-separated words."""
    words = input_string.split()
    return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
if __name__ == "__main__":
    # Point this at the folder containing the exported markdown course notes.
    folder_path = "/path/to/folder/with/md_content/"

    md_files = find_md_files(folder_path)
    # Split on H1 and H2 headers, using the header marker itself as the
    # metadata key so it can be prepended back onto each chunk below.
    headers_to_split_on = [
        ("#", "#"),
        ("##", "##"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
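    # Note: split_text() returns langchain Document objects; each carries the
    # chunk text in .page_content and the matched headers in .metadata, keyed
    # by the strings above. A section under "## Intro" yields metadata like
    # {"#": <page title>, "##": "Intro"}.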
    chunks = []
    for md_file in tqdm(md_files):
        md_title = md_file["title"]
        md_raw_content = md_file["content"]

        md_header_splits = markdown_splitter.split_text(md_raw_content)

        for split in md_header_splits:
            # Prepend the headers (stored in the split's metadata) back onto
            # the content so each chunk keeps its context.
            headers = "\n".join(f"{k} {v}" for k, v in split.metadata.items())
            # Cap chunk length so a single long section can't produce an
            # oversized chunk.
            substrings = split_string_by_max_words(split.page_content, max_words=600)

            for substring in substrings:
                chunk = {
                    "title": md_title,
                    "content": headers + "\n" + substring,
                    "source": "TAI Course",
                    "url": "https://learn.activeloop.ai/courses/langchain/",
                }
                chunks.append(chunk)
    df = pd.DataFrame(chunks)
    df = drop_outlier_chunks(df, max_tokens_by_chunk=2000)

    print(f"Exported {len(df)} chunks from {len(md_files)} articles.")
    df.to_csv("langchain_course.csv", index=False)
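
    # Hypothetical sanity check of the export (not part of the original
    # pipeline): reload the CSV and inspect the token-count distribution.
    #
    #   reloaded = pd.read_csv("langchain_course.csv")
    #   print(reloaded.content.apply(num_tokens_from_string).describe())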