Enhance combine_all_sources function with robust source processing and logging
data/scraping_scripts/process_md_files.py (CHANGED)
```diff
@@ -136,7 +136,7 @@ SOURCE_CONFIGS = {
     },
     "8-hour_primer": {
         "base_url": "",
-        "input_directory": "data/8-hour_primer",
+        "input_directory": "data/8-hour_primer",  # Path to the directory that contains the Markdown files
         "output_file": "data/8-hour_primer_data.jsonl",  # 8-hour Generative AI Primer
         "source_name": "8-hour_primer",
         "use_include_list": False,
@@ -148,7 +148,7 @@ SOURCE_CONFIGS = {
     },
     "llm_developer": {
         "base_url": "",
-        "input_directory": "data/llm_developer",
+        "input_directory": "data/llm_developer",  # Path to the directory that contains the Markdown files
         "output_file": "data/llm_developer_data.jsonl",  # From Beginner to Advanced LLM Developer
         "source_name": "llm_developer",
         "use_include_list": False,
@@ -160,7 +160,7 @@ SOURCE_CONFIGS = {
     },
     "python_primer": {
         "base_url": "",
-        "input_directory": "data/python_primer",
+        "input_directory": "data/python_primer",  # Path to the directory that contains the Markdown files
         "output_file": "data/python_primer_data.jsonl",  # From Beginner to Advanced LLM Developer
         "source_name": "python_primer",
         "use_include_list": False,
@@ -272,21 +272,62 @@ def save_jsonl(data: List[Dict], output_file: str) -> None:
 
 
 def combine_all_sources(sources: List[str]) -> None:
+    """
+    Combine JSONL files from multiple sources, preserving existing sources not being processed.
+
+    For example, if sources = ['transformers'], this will:
+    1. Load data from transformers_data.jsonl
+    2. Load data from all other source JSONL files that exist (course files, etc.)
+    3. Combine them all into all_sources_data.jsonl
+    """
     all_data = []
     output_file = "data/all_sources_data.jsonl"
-
+
+    # Track which sources we're processing
+    processed_sources = set()
+
+    # First, add data from sources we're explicitly processing
     for source in sources:
         if source not in SOURCE_CONFIGS:
             logger.error(f"Unknown source '{source}'. Skipping.")
             continue
-
+
+        processed_sources.add(source)
         input_file = SOURCE_CONFIGS[source]["output_file"]
-        logger.info(f"Processing source: {source}")
-
-        with open(input_file, "r", encoding="utf-8") as f:
-            for line in f:
-                all_data.append(json.loads(line))
-
+        logger.info(f"Processing updated source: {source} from {input_file}")
+
+        try:
+            source_data = []
+            with open(input_file, "r", encoding="utf-8") as f:
+                for line in f:
+                    source_data.append(json.loads(line))
+
+            logger.info(f"Added {len(source_data)} documents from {source}")
+            all_data.extend(source_data)
+        except Exception as e:
+            logger.error(f"Error loading {input_file}: {e}")
+
+    # Now add data from all other sources not being processed
+    for source_name, config in SOURCE_CONFIGS.items():
+        # Skip sources we already processed
+        if source_name in processed_sources:
+            continue
+
+        # Try to load the individual source file
+        source_file = config["output_file"]
+        if os.path.exists(source_file):
+            logger.info(f"Preserving existing source: {source_name} from {source_file}")
+            try:
+                source_data = []
+                with open(source_file, "r", encoding="utf-8") as f:
+                    for line in f:
+                        source_data.append(json.loads(line))
+
+                logger.info(f"Preserved {len(source_data)} documents from {source_name}")
+                all_data.extend(source_data)
+            except Exception as e:
+                logger.error(f"Error loading {source_file}: {e}")
+
     logger.info(f"Total documents combined: {len(all_data)}")
     save_jsonl(all_data, output_file)
     logger.info(f"Combined data saved to {output_file}")
```
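Taken together, the new combine_all_sources is a two-pass merge: pass one loads the per-source JSONL files for the sources named in the call, and pass two folds in every other configured source whose JSONL file already exists on disk, so re-scraping one source no longer drops the others from all_sources_data.jsonl. Below is a minimal, self-contained sketch of that pattern; CONFIGS, load_jsonl, combine, and the course_a/course_b paths are hypothetical stand-ins, not the repository's actual SOURCE_CONFIGS or helpers.

```python
import json
import logging
import os
from typing import Dict, List

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hypothetical stand-in for SOURCE_CONFIGS; only "output_file" matters here.
CONFIGS: Dict[str, Dict[str, str]] = {
    "course_a": {"output_file": "data/course_a_data.jsonl"},
    "course_b": {"output_file": "data/course_b_data.jsonl"},
}


def load_jsonl(path: str) -> List[Dict]:
    """Read one JSON object per line; raises if any line is malformed."""
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]


def combine(sources: List[str], output_file: str = "data/all_sources_data.jsonl") -> None:
    all_data: List[Dict] = []
    processed = set()

    # Pass 1: sources explicitly requested in this run.
    for source in sources:
        if source not in CONFIGS:
            logger.error("Unknown source '%s'. Skipping.", source)
            continue
        processed.add(source)
        path = CONFIGS[source]["output_file"]
        try:
            docs = load_jsonl(path)
            logger.info("Added %d documents from %s", len(docs), source)
            all_data.extend(docs)
        except Exception as e:
            logger.error("Error loading %s: %s", path, e)

    # Pass 2: preserve every other configured source whose file already exists.
    for name, cfg in CONFIGS.items():
        if name in processed or not os.path.exists(cfg["output_file"]):
            continue
        try:
            docs = load_jsonl(cfg["output_file"])
            logger.info("Preserved %d documents from %s", len(docs), name)
            all_data.extend(docs)
        except Exception as e:
            logger.error("Error loading %s: %s", cfg["output_file"], e)

    # Write the merged corpus back out as JSONL.
    with open(output_file, "w", encoding="utf-8") as f:
        for doc in all_data:
            f.write(json.dumps(doc) + "\n")
    logger.info("Combined %d documents into %s", len(all_data), output_file)
```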
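Note the buffering choice in the diff: each file is read into a local source_data list and only then extended onto all_data, so a file that fails to parse midway contributes nothing rather than a partial prefix, and the per-file try/except keeps one corrupt file from aborting the whole combine. Usage of the hypothetical sketch above might look like:

```python
# Rebuild only course_a; course_b's JSONL from an earlier run is picked up
# in pass 2, so the combined file keeps both sources.
combine(["course_a"])
```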