"""Helpers for turning raw document/query JSON into plain lookup dictionaries.

Documents are dicts carrying a "wikipedia_id" and a "text" field; queries are
a mapping of query id to a dict carrying an "input" field.
"""
import re

# Compiled once at import time so the per-document hot path only does the
# substitutions.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def _join_text(text):
    """Return the "text" field of a document as one string.

    The field is normally a list of string fragments, which are joined with
    single spaces. Two other shapes are tolerated so that one odd document
    does not corrupt or abort a whole batch:

    * ``None`` (key present but null) becomes the empty string.
    * A plain string is returned unchanged; joining it would insert a space
      between every character.
    """
    if text is None:
        return ""
    if isinstance(text, str):
        return text
    return " ".join(text)


def _clean_document_text(doc):
    """Extract, join and sanitize the "text" field of a single document."""
    return sanitize_text(_join_text(doc.get("text", [])))


def merge_documents(main_dict, additional_json, limit=1000):
    """
    Adds a subset of documents from an additional JSON file to the main dictionary.

    Documents are visited in order. A document whose ``wikipedia_id`` is
    already a key of ``main_dict`` is skipped and does not count towards
    ``limit``; existing entries are never overwritten.

    Args:
        main_dict (dict): The main dictionary where processed documents are
            stored. It is modified in place.
        additional_json (list): The additional JSON data containing documents.
        limit (int): The maximum number of documents to add to the main
            dictionary.

    Returns:
        dict: The updated main dictionary (the same object as ``main_dict``)
        with additional documents added.
    """
    # Counter to track how many documents have been added
    count = 0

    for doc in additional_json:
        if count >= limit:
            break

        wikipedia_id = doc.get("wikipedia_id")

        # Only unseen IDs are added, so existing entries are preserved.
        if wikipedia_id in main_dict:
            continue

        main_dict[wikipedia_id] = _clean_document_text(doc)
        count += 1

    print(f"{count} documents added to the main dictionary.")
    return main_dict


def sanitize_text(text):
    """
    Cleans and standardizes text by keeping only alphanumeric characters and spaces.

    Only ASCII letters and digits are kept; every other non-whitespace
    character is removed. Runs of whitespace are then collapsed to a single
    space and leading/trailing whitespace is stripped.

    Args:
        text (str): Text to sanitize. Non-string values are returned unchanged.

    Returns:
        str: Sanitized text.
    """
    if not isinstance(text, str):
        return text

    # Drop everything that is neither an ASCII alphanumeric nor whitespace.
    text = _NON_ALNUM_RE.sub('', text)
    # Collapse multiple whitespace characters into a single space.
    return _WHITESPACE_RE.sub(' ', text).strip()


def process_json_data(json_data):
    """
    Builds a dictionary of sanitized document text keyed by wikipedia_id.

    If several documents share a ``wikipedia_id``, the last one wins.

    Args:
        json_data (list): Documents, each a dict with "wikipedia_id" and
            "text" entries.

    Returns:
        dict: A dictionary with wikipedia_id as the key and the sanitized,
        space-joined text as the value.
    """
    result_dict = {}

    for doc in json_data:
        result_dict[doc.get("wikipedia_id")] = _clean_document_text(doc)

    return result_dict


def process_queries(json_data):
    """
    Processes a JSON object containing queries and query IDs.

    Args:
        json_data (dict): The input JSON data, mapping each query ID to a
            dict that may contain an "input" entry.

    Returns:
        dict: A dictionary with query_id as the key and query text as the
        value. Queries without an "input" entry map to the empty string.
    """
    return {
        query_id: query_info.get("input", "")
        for query_id, query_info in json_data.items()
    }


# Example usage
# Assuming `query_json_file` contains your JSON data
# processed_queries = process_queries(query_json_file)