Spaces:

towardsai-tutors
/

buster

Running

App Files Files Community

jerpint committed on Sep 14, 2023

Commit

01b468b

unverified ·

1 Parent(s): 2f93ee4

add markdown parser (#5)

Browse files

Files changed (3) hide show

cfg.py +8 -13
embed_documents.py +1 -6
markdown_parser.py +92 -0

cfg.py CHANGED Viewed

@@ -35,9 +35,9 @@ hf_hub_download(
 extract_zip(zip_file_path=HUB_DB_FILE, output_path="deeplake_store")
 example_questions = [
-    "What's the best way to get a job in AI?",
-    "What is prompt engineering?",
-    "What is generative AI?",
 ]
@@ -50,12 +50,10 @@ buster_cfg = BusterConfig(
         "embedding_model": "text-embedding-ada-002",
         "use_reranking": True,
         "invalid_question_response": "This question does not seem relevant to my current knowledge.",
-        "check_question_prompt": """You are an chatbot answering questions on towardsAI, an artificial intelligence blogs.
-Users will be asking questions about the blog.
-Your job is to determine wether or not a question is a valid question to ask, and should be answered.
-More general questions are not considered valid, even if you might know the response.
-A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
 For example:
@@ -65,7 +63,7 @@ true
 Q: What is the meaning of life?
 false
-A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
         "completion_kwargs": {
             "model": "gpt-3.5-turbo",
             "stream": False,
@@ -130,9 +128,6 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
     },
 )
-# initialize buster with the config in cfg.py (adapt to your needs) ...
-# buster_cfg = cfg.buster_cfg
 def setup_buster(buster_cfg):
     retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)

 extract_zip(zip_file_path=HUB_DB_FILE, output_path="deeplake_store")
 example_questions = [
+    "What is the LLama model?",
+    "What is a LLM?",
+    "What is an embedding?",
 ]
         "embedding_model": "text-embedding-ada-002",
         "use_reranking": True,
         "invalid_question_response": "This question does not seem relevant to my current knowledge.",
+        "check_question_prompt": """You are a chatbot, answering questions about large language models and artificial intelligence.
+Users will ask all sorts of questions, and some might be tangentially related.
+Users will learn to build LLM-powered apps, with LangChain & Deep Lake among other technologies.
+As long as a question is somewhat related to the topic, respond 'true'. If a question is completely unrelated, respond 'false'.
 For example:
 Q: What is the meaning of life?
 false
+A user will now submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
         "completion_kwargs": {
             "model": "gpt-3.5-turbo",
             "stream": False,
     },
 )
 def setup_buster(buster_cfg):
     retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)

embed_documents.py CHANGED Viewed

@@ -3,16 +3,11 @@ from buster.documents_manager import DeepLakeDocumentsManager
 if __name__ == "__main__":
     vector_store_path = "deeplake_store"
-    chunk_file = "data/output.csv"
     overwrite = True
     df = pd.read_csv(chunk_file)
-    # some pre-processing based on the latest file provided
-    df["url"] = df["source"]
-    df["source"] = "towardsai_blog"
-    df = df.dropna()
     dm = DeepLakeDocumentsManager(vector_store_path, overwrite=overwrite)
     dm.batch_add(df)
     zipped_file_path = dm.to_zip()

 if __name__ == "__main__":
     vector_store_path = "deeplake_store"
+    chunk_file = "langchain_course.csv"
     overwrite = True
     df = pd.read_csv(chunk_file)
     dm = DeepLakeDocumentsManager(vector_store_path, overwrite=overwrite)
     dm.batch_add(df)
     zipped_file_path = dm.to_zip()

markdown_parser.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import os
+import pandas as pd
+import tiktoken
+from langchain.text_splitter import MarkdownHeaderTextSplitter
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens ``string`` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``.

    Returns:
        Length of the token sequence produced by encoding ``string``.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)
def drop_outlier_chunks(df: pd.DataFrame, max_tokens_by_chunk: int = 4500) -> pd.DataFrame:
    """Remove rows whose ``content`` has an abnormally high token count.

    Such chunks usually contain lots of links. The number of dropped rows and
    their contents are printed for inspection.

    Args:
        df: Frame with a ``content`` column of strings.
        max_tokens_by_chunk: Rows with this many tokens or more are dropped.

    Returns:
        The rows of ``df`` whose token count is below ``max_tokens_by_chunk``.
    """
    # Tokenize every chunk once and reuse the counts for both selections,
    # instead of re-encoding the whole column for each mask.
    token_counts = df.content.apply(num_tokens_from_string)
    filtered_df = df[token_counts < max_tokens_by_chunk]
    outliers_df = df[token_counts >= max_tokens_by_chunk]
    print(f"Dropping {len(df) - len(filtered_df)} outlier chunks")
    print(f"Dropped outliers: {outliers_df.content.to_list()}")
    return filtered_df
def find_md_files(folder_path):
    """Recursively find .md files, extract content and use filename as title.

    Args:
        folder_path: Root directory to search.

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts, one per ``.md``
        file. ``title`` is the file name without its extension and
        ``content`` is the file's text decoded as UTF-8. Entries are ordered
        deterministically: directories are visited in sorted order and files
        within each directory are sorted by name.
    """
    md_files = []
    for root, dirs, files in os.walk(folder_path):
        # os.walk lists entries in arbitrary, filesystem-dependent order.
        # Sorting `dirs` in place steers the traversal itself, so the result
        # (and anything exported from it) is reproducible across machines.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith(".md"):
                continue
            file_path = os.path.join(root, file_name)
            title = os.path.splitext(file_name)[0]
            with open(file_path, "r", encoding="utf-8") as md_file:
                content = md_file.read()
            md_files.append({"title": title, "content": content})
    return md_files
def split_string_by_max_words(input_string, max_words):
    """Split text into consecutive pieces of at most ``max_words`` words.

    Words are whitespace-separated; each piece is re-joined with single
    spaces, so the original whitespace (including newlines) is not preserved.

    Args:
        input_string: Text to split.
        max_words: Maximum number of words per piece; must be at least 1.

    Returns:
        A list of strings. Empty or whitespace-only input yields ``[]``.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    # A zero step makes range() fail with an unrelated-looking message and a
    # negative step yields nothing, silently discarding the text.
    if max_words < 1:
        raise ValueError(f"max_words must be at least 1, got {max_words}")
    words = input_string.split()
    return [" ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)]
if __name__ == "__main__":
    # Script entry point: read every markdown file under a folder, split it by
    # "#" / "##" headers and then into pieces of at most 600 words, drop
    # oversized chunks and export the result to "langchain_course.csv".
    import sys

    from tqdm import tqdm

    # An optional first command-line argument overrides the default folder.
    default_folder_path = "/Users/jeremypinto/Downloads/d22d1e98-345f-490d-870e-3b082938741c_Export-0a33c13f-6d42-4a94-8f23-7459e7b2c024"
    folder_path = sys.argv[1] if len(sys.argv) > 1 else default_folder_path

    md_files = find_md_files(folder_path)

    # Each header marker is stored in the split's metadata under a key equal
    # to the marker itself, so "key + ' ' + value" below rebuilds the header line.
    headers_to_split_on = [
        ("#", "#"),
        ("##", "##"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on
    )

    chunks = []
    for md_file in tqdm(md_files):
        md_title = md_file["title"]
        md_header_splits = markdown_splitter.split_text(md_file["content"])
        for split in md_header_splits:
            # add the headers back to the content
            headers = "\n".join(f"{k} {v}" for k, v in split.metadata.items())
            for substring in split_string_by_max_words(split.page_content, max_words=600):
                chunks.append(
                    {
                        "title": md_title,
                        "content": headers + "\n" + substring,
                        "source": "TAI Course",
                        "url": "https://learn.activeloop.ai/courses/langchain/",
                    }
                )

    # Without any chunk the frame has no "content" column and the outlier
    # filter would fail with an AttributeError; stop with a clear message.
    if not chunks:
        raise SystemExit(f"No markdown chunks found under {folder_path!r}")

    df = pd.DataFrame(chunks)
    df = drop_outlier_chunks(df, max_tokens_by_chunk=2000)
    print(f"Exported {len(df)} chunks from {len(md_files)} articles.")
    # NOTE(review): the DataFrame index is written as an unnamed first column;
    # confirm whether the consumer of this CSV expects it before changing.
    df.to_csv("langchain_course.csv")