omarsol committed
Commit 75ceb60 · 1 Parent(s): b5197fc

Enhance combine_all_sources function with robust source processing and logging

data/scraping_scripts/process_md_files.py CHANGED
@@ -136,7 +136,7 @@ SOURCE_CONFIGS = {
     },
     "8-hour_primer": {
         "base_url": "",
-        "input_directory": "data/8-hour_primer",
+        "input_directory": "data/8-hour_primer",  # Path to the directory that contains the Markdown files
         "output_file": "data/8-hour_primer_data.jsonl",  # 8-hour Generative AI Primer
         "source_name": "8-hour_primer",
         "use_include_list": False,
@@ -148,7 +148,7 @@ SOURCE_CONFIGS = {
     },
     "llm_developer": {
         "base_url": "",
-        "input_directory": "data/llm_developer",
+        "input_directory": "data/llm_developer",  # Path to the directory that contains the Markdown files
        "output_file": "data/llm_developer_data.jsonl",  # From Beginner to Advanced LLM Developer
         "source_name": "llm_developer",
         "use_include_list": False,
@@ -160,7 +160,7 @@ SOURCE_CONFIGS = {
     },
     "python_primer": {
         "base_url": "",
-        "input_directory": "data/python_primer",
+        "input_directory": "data/python_primer",  # Path to the directory that contains the Markdown files
         "output_file": "data/python_primer_data.jsonl",  # From Beginner to Advanced LLM Developer
         "source_name": "python_primer",
         "use_include_list": False,
@@ -272,21 +272,62 @@ def save_jsonl(data: List[Dict], output_file: str) -> None:
 
 
 def combine_all_sources(sources: List[str]) -> None:
+    """
+    Combine JSONL files from multiple sources, preserving existing sources not being processed.
+
+    For example, if sources = ['transformers'], this will:
+    1. Load data from transformers_data.jsonl
+    2. Load data from all other source JSONL files that exist (course files, etc.)
+    3. Combine them all into all_sources_data.jsonl
+    """
     all_data = []
     output_file = "data/all_sources_data.jsonl"
-
+
+    # Track which sources we're processing
+    processed_sources = set()
+
+    # First, add data from sources we're explicitly processing
     for source in sources:
         if source not in SOURCE_CONFIGS:
             logger.error(f"Unknown source '{source}'. Skipping.")
             continue
-
+
+        processed_sources.add(source)
         input_file = SOURCE_CONFIGS[source]["output_file"]
-        logger.info(f"Processing source: {source}")
-
-        with open(input_file, "r", encoding="utf-8") as f:
-            for line in f:
-                all_data.append(json.loads(line))
-
+        logger.info(f"Processing updated source: {source} from {input_file}")
+
+        try:
+            source_data = []
+            with open(input_file, "r", encoding="utf-8") as f:
+                for line in f:
+                    source_data.append(json.loads(line))
+
+            logger.info(f"Added {len(source_data)} documents from {source}")
+            all_data.extend(source_data)
+        except Exception as e:
+            logger.error(f"Error loading {input_file}: {e}")
+
+    # Now add data from all other sources not being processed
+    for source_name, config in SOURCE_CONFIGS.items():
+        # Skip sources we already processed
+        if source_name in processed_sources:
+            continue
+
+        # Try to load the individual source file
+        source_file = config["output_file"]
+        if os.path.exists(source_file):
+            logger.info(f"Preserving existing source: {source_name} from {source_file}")
+            try:
+                source_data = []
+                with open(source_file, "r", encoding="utf-8") as f:
+                    for line in f:
+                        source_data.append(json.loads(line))
+
+                logger.info(f"Preserved {len(source_data)} documents from {source_name}")
+                all_data.extend(source_data)
+            except Exception as e:
+                logger.error(f"Error loading {source_file}: {e}")
+
     logger.info(f"Total documents combined: {len(all_data)}")
     save_jsonl(all_data, output_file)
     logger.info(f"Combined data saved to {output_file}")