omarsol committed on
Commit 292e4e7 · 1 Parent(s): 39d5b2a

Add Python Primer source configuration to markdown processing script

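With this change, documents for the Python Primer can be processed like any other configured source. A minimal sketch of the intended invocation, following the usage described in the script's docstring (the working directory and Python executable name are assumptions):

# Hedged sketch: invoke the processing script for the newly added source,
# per the "python process_md_files.py <source1> <source2> ..." usage note.
import subprocess

subprocess.run(
    ["python", "data/scraping_scripts/process_md_files.py", "python_primer"],
    check=True,
)
# Per the new config, this reads Markdown files under data/python_primer and
# writes one JSON document per line to data/python_primer_data.jsonl.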
data/scraping_scripts/process_md_files.py CHANGED
@@ -2,7 +2,7 @@
 Markdown Document Processor for Documentation Sources
 
 This script processes Markdown (.md) and MDX (.mdx) files from various documentation sources
-(such as Hugging Face Transformers, PEFT, TRL, LlamaIndex, and OpenAI Cookbook) and converts
+(such as Hugging Face Transformers, PEFT, TRL, LlamaIndex, and OpenAI Cookbook) and converts
 them into a standardized JSONL format for further processing or indexing.
 
 Key features:
@@ -18,7 +18,7 @@ Key features:
 Usage:
 python process_md_files.py <source1> <source2> ...
 
-Where <source1>, <source2>, etc. are one or more of the predefined sources in SOURCE_CONFIGS
+Where <source1>, <source2>, etc. are one or more of the predefined sources in SOURCE_CONFIGS
 (e.g., 'transformers', 'llama_index', 'openai_cookbooks').
 
 The script processes all Markdown files in the specified input directories (and their subdirectories),
@@ -28,276 +28,6 @@ files represents a single document with metadata and content.
 To add or modify sources, update the SOURCE_CONFIGS dictionary at the top of the script.
 """
 
-# import argparse
-# import json
-# import logging
-# import os
-# import re
-# import uuid
-# from typing import Dict, List
-
-# import tiktoken
-
-# logging.basicConfig(level=logging.INFO)
-# logger = logging.getLogger(__name__)
-
-# # Configuration for different sources
-# SOURCE_CONFIGS = {
-#     "transformers": {
-#         "base_url": "https://huggingface.co/docs/transformers/",
-#         "input_directory": "data/transformers_md_files",
-#         "output_file": "data/transformers_data.jsonl",
-#         "source_name": "transformers",
-#         "use_include_list": False,
-#         "included_dirs": [],
-#         "excluded_dirs": ["internal", "main_classes"],
-#         "excluded_root_files": [],
-#         "included_root_files": [],
-#         "url_extension": "",
-#     },
-#     "peft": {
-#         "base_url": "https://huggingface.co/docs/peft/",
-#         "input_directory": "data/peft_md_files",
-#         "output_file": "data/peft_data.jsonl",
-#         "source_name": "peft",
-#         "use_include_list": False,
-#         "included_dirs": [],
-#         "excluded_dirs": [],
-#         "excluded_root_files": [],
-#         "included_root_files": [],
-#         "url_extension": "",
-#     },
-#     "trl": {
-#         "base_url": "https://huggingface.co/docs/trl/",
-#         "input_directory": "data/trl_md_files",
-#         "output_file": "data/trl_data.jsonl",
-#         "source_name": "trl",
-#         "use_include_list": False,
-#         "included_dirs": [],
-#         "excluded_dirs": [],
-#         "excluded_root_files": [],
-#         "included_root_files": [],
-#         "url_extension": "",
-#     },
-#     "llama_index": {
-#         "base_url": "https://docs.llamaindex.ai/en/stable/",
-#         "input_directory": "data/llama_index_md_files",
-#         "output_file": "data/llama_index_data.jsonl",
-#         "source_name": "llama_index",
-#         "use_include_list": True,
-#         "included_dirs": [
-#             "getting_started",
-#             "understanding",
-#             "use_cases",
-#             "examples",
-#             "module_guides",
-#             "optimizing",
-#         ],
-#         "excluded_dirs": [],
-#         "excluded_root_files": [],
-#         "included_root_files": ["index.md"],
-#         "url_extension": "",
-#     },
-#     "openai_cookbooks": {
-#         "base_url": "https://github.com/openai/openai-cookbook/blob/main/examples/",
-#         "input_directory": "data/openai-cookbook_md_files",
-#         "output_file": "data/openai_cookbooks_data.jsonl",
-#         "source_name": "openai_cookbooks",
-#         "use_include_list": False,
-#         "included_dirs": [],
-#         "excluded_dirs": [],
-#         "excluded_root_files": [],
-#         "included_root_files": [],
-#         "url_extension": ".ipynb",
-#     },
-#     "langchain": {
-#         "base_url": "https://python.langchain.com/v0.2/docs/",
-#         "input_directory": "data/langchain_md_files",
-#         "output_file": "data/langchain_data.jsonl",
-#         "source_name": "langchain",
-#         "use_include_list": True,
-#         "included_dirs": ["how_to", "versions", "turorials", "integrations"],
-#         "excluded_dirs": [],
-#         "excluded_root_files": [],
-#         "included_root_files": ["security.md", "concepts.mdx", "introduction.mdx"],
-#         "url_extension": "",
-#     },
-#     "tai_blog": {
-#         "base_url": "",
-#         "input_directory": "",
-#         "output_file": "data/tai_blog_data.jsonl",
-#         "source_name": "tai_blog",
-#         "use_include_list": False,
-#         "included_dirs": [],
-#         "excluded_dirs": [],
-#         "excluded_root_files": [],
-#         "included_root_files": [],
-#         "url_extension": "",
-#     },
-# }
-
-
-# def extract_title(content: str):
-#     title_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
-#     if title_match:
-#         return title_match.group(1).strip()
-
-#     lines = content.split("\n")
-#     for line in lines:
-#         if line.strip():
-#             return line.strip()
-
-#     return None
-
-
-# def generate_url(file_path: str, config: Dict) -> str:
-#     path_without_extension = os.path.splitext(file_path)[0]
-#     path_with_forward_slashes = path_without_extension.replace("\\", "/")
-#     return config["base_url"] + path_with_forward_slashes + config["url_extension"]
-
-
-# def should_include_file(file_path: str, config: Dict) -> bool:
-#     if os.path.dirname(file_path) == "":
-#         if config["use_include_list"]:
-#             return os.path.basename(file_path) in config["included_root_files"]
-#         else:
-#             return os.path.basename(file_path) not in config["excluded_root_files"]
-
-#     if config["use_include_list"]:
-#         return any(file_path.startswith(dir) for dir in config["included_dirs"])
-#     else:
-#         return not any(file_path.startswith(dir) for dir in config["excluded_dirs"])
-
-
-# def num_tokens_from_string(string: str, encoding_name: str) -> int:
-#     encoding = tiktoken.get_encoding(encoding_name)
-#     num_tokens = len(
-#         encoding.encode(
-#             string, disallowed_special=(encoding.special_tokens_set - {"<|endoftext|>"})
-#         )
-#     )
-#     return num_tokens
-
-
-# def remove_copyright_header(content: str) -> str:
-#     header_pattern = re.compile(r"<!--Copyright.*?-->\s*", re.DOTALL)
-#     cleaned_content = header_pattern.sub("", content, count=1)
-#     return cleaned_content.strip()
-
-
-# def process_md_files(directory: str, config: Dict) -> List[Dict]:
-#     jsonl_data = []
-
-#     for root, _, files in os.walk(directory):
-#         for file in files:
-#             if file.endswith(".md") or file.endswith(".mdx"):
-#                 file_path = os.path.join(root, file)
-#                 relative_path = os.path.relpath(file_path, directory)
-
-#                 if should_include_file(relative_path, config):
-#                     with open(file_path, "r", encoding="utf-8") as f:
-#                         content = f.read()
-
-#                     title = extract_title(content)
-#                     token_count = num_tokens_from_string(content, "cl100k_base")
-
-#                     if token_count < 100 or token_count > 200_000:
-#                         logger.info(
-#                             f"Skipping {relative_path} due to token count {token_count}"
-#                         )
-#                         continue
-
-#                     cleaned_content = remove_copyright_header(content)
-
-#                     json_object = {
-#                         "tokens": token_count,
-#                         "doc_id": str(uuid.uuid4()),
-#                         "name": (title if title else file),
-#                         "url": generate_url(relative_path, config),
-#                         "retrieve_doc": (token_count <= 8000),
-#                         "source": config["source_name"],
-#                         "content": cleaned_content,
-#                     }
-
-#                     jsonl_data.append(json_object)
-
-#     return jsonl_data
-
-
-# def save_jsonl(data: List[Dict], output_file: str) -> None:
-#     with open(output_file, "w", encoding="utf-8") as f:
-#         for item in data:
-#             json.dump(item, f, ensure_ascii=False)
-#             f.write("\n")
-
-
-# def combine_all_sources(sources: List[str]) -> None:
-#     all_data = []
-#     output_file = "data/all_sources_data.jsonl"
-
-#     for source in sources:
-#         if source not in SOURCE_CONFIGS:
-#             logger.error(f"Unknown source '{source}'. Skipping.")
-#             continue
-
-#         input_file = SOURCE_CONFIGS[source]["output_file"]
-#         logger.info(f"Processing source: {source}")
-
-#         with open(input_file, "r", encoding="utf-8") as f:
-#             for line in f:
-#                 all_data.append(json.loads(line))
-
-#     logger.info(f"Total documents combined: {len(all_data)}")
-#     save_jsonl(all_data, output_file)
-#     logger.info(f"Combined data saved to {output_file}")
-
-
-# def process_source(source: str) -> None:
-#     if source not in SOURCE_CONFIGS:
-#         logger.error(f"Unknown source '{source}'. Skipping.")
-#         return
-
-#     config = SOURCE_CONFIGS[source]
-#     logger.info(f"\n\nProcessing source: {source}")
-#     jsonl_data = process_md_files(config["input_directory"], config)
-#     save_jsonl(jsonl_data, config["output_file"])
-#     logger.info(
-#         f"Processed {len(jsonl_data)} files and saved to {config['output_file']}"
-#     )
-
-
-# def main(sources: List[str]) -> None:
-#     for source in sources:
-#         process_source(source)
-
-#     if len(sources) > 1:
-#         # sources = [
-#         #     "transformers",
-#         #     "peft",
-#         #     "trl",
-#         #     "llama_index",
-#         #     "langchain",
-#         #     "openai_cookbooks",
-#         #     "tai_blog",
-#         # ]
-#         combine_all_sources(sources)
-
-
-# if __name__ == "__main__":
-#     parser = argparse.ArgumentParser(
-#         description="Process Markdown files from specified sources."
-#     )
-#     parser.add_argument(
-#         "sources",
-#         nargs="+",
-#         choices=SOURCE_CONFIGS.keys(),
-#         help="Specify one or more sources to process",
-#     )
-#     args = parser.parse_args()
-
-#     main(args.sources)
-
-
 import argparse
 import json
 import logging
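For reference, each line of the output JSONL is one record with the keys built in process_md_files; the commented-out copy deleted above shows the same construction, and the example below assumes the live script still builds json_object the same way. Every value here is invented for illustration:

# Illustrative only: one output record with the key set used by process_md_files.
example_record = {
    "tokens": 1843,                         # tiktoken cl100k_base count for the file
    "doc_id": "0b6f2f3a-5d2e-4d0a-9c1c-7e2b9a1f4e11",  # str(uuid.uuid4())
    "name": "Quicktour",                    # first "# " heading, else first non-empty line
    "url": "https://huggingface.co/docs/transformers/quicktour",  # base_url + relative path + url_extension
    "retrieve_doc": True,                   # True when the document is <= 8000 tokens
    "source": "transformers",
    "content": "# Quicktour\n...",          # file content with the copyright header stripped
}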
@@ -428,6 +158,18 @@ SOURCE_CONFIGS = {
         "included_root_files": [],
         "url_extension": "",
     },
+    "python_primer": {
+        "base_url": "",
+        "input_directory": "data/python_primer",
+        "output_file": "data/python_primer_data.jsonl",  # From Beginner to Advanced LLM Developer
+        "source_name": "python_primer",
+        "use_include_list": False,
+        "included_dirs": [],
+        "excluded_dirs": [],
+        "excluded_root_files": [],
+        "included_root_files": [],
+        "url_extension": "",
+    },
 }
 
 
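Like tai_blog, the new python_primer entry leaves base_url empty, so the url stored for each document is just the file's relative path with its extension dropped. A small sketch, assuming the live generate_url matches the commented-out version removed in this commit (the filename below is made up):

import os

# Reproduction of the removed generate_url, used here only to illustrate the
# effect of an empty base_url; the live function may differ.
def generate_url(file_path: str, config: dict) -> str:
    path_without_extension = os.path.splitext(file_path)[0]
    path_with_forward_slashes = path_without_extension.replace("\\", "/")
    return config["base_url"] + path_with_forward_slashes + config["url_extension"]

python_primer = {"base_url": "", "url_extension": ""}
print(generate_url("lesson_1/intro_to_variables.md", python_primer))
# -> "lesson_1/intro_to_variables" (relative path only, since base_url is "")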