import re

# Patterns shared by the helpers below; compiled once at import time.
_TOPIC_RE = re.compile(r"^\d+\.\s+(.*)$")
_SUB_TOPIC_RE = re.compile(r"^\*\s+(.*)$")
_CITATION_RE = re.compile(r"\[(\d+)\]")
_SOURCE_NUMBER_RE = re.compile(r"Source (\d+)")

# Metadata keys copied verbatim from the first entry by filter_metadata_by_pages.
_SHARED_METADATA_KEYS = (
    "title",
    "author",
    "category",
    "year",
    "publisher",
    "reference",
)


def parse_topics_to_dict(text: str) -> dict:
    """
    Parse a numbered outline into a mapping of topic -> list of sub-topics.

    Lines of the form ``"1. Title"`` open a new topic; lines of the form
    ``"* item"`` are appended to the most recently opened topic. Any other
    line, and any bullet that appears before the first topic, is ignored.
    A topic title that appears twice restarts with an empty list.

    :param text: The outline text, one entry per line.
    :return: Dict mapping each topic title to its list of sub-topic strings.
    """
    topics = {}
    current_topic = None

    for raw_line in text.strip().split("\n"):
        line = raw_line.strip()

        topic_match = _TOPIC_RE.match(line)
        if topic_match:
            current_topic = topic_match.group(1)
            topics[current_topic] = []
            continue

        sub_topic_match = _SUB_TOPIC_RE.match(line)
        if sub_topic_match and current_topic:
            topics[current_topic].append(sub_topic_match.group(1))

    # NOTE: the parsed result is echoed to stdout, as the original code did.
    print(topics)
    return topics


def remove_all_sources(text: str) -> str:
    """
    Strip every ``"Source N: ..."`` section from the text.

    Matching is case-sensitive (only ``re.DOTALL`` is applied), so a section
    body may span several lines but the marker must be spelled ``Source``.

    :param text: Text that may contain source sections.
    :return: The text without source sections, stripped of outer whitespace.
    """
    # Each match runs from one marker up to the next marker or the end of text.
    pattern = r"Source \d+:(.*?)(?=Source \d+:|$)"
    updated_text = re.sub(pattern, "", text, flags=re.DOTALL)
    return updated_text.strip()


def clean_text(text: str) -> str:
    """
    Normalise whitespace and comma/semicolon spacing in a text.

    The result is a single line: the final pass collapses every whitespace
    run (newlines included) into one space, so the newline handling in the
    second step does not survive into the returned value.

    :param text: Text to clean.
    :return: The cleaned, single-spaced, stripped text.
    """
    # Collapse runs of two or more whitespace characters.
    text = re.sub(r"\s{2,}", " ", text)
    # Replace newlines that are not followed by a digit with a space.
    text = re.sub(r"\n(?!\s*\d)", " ", text)
    # Drop semicolons that are immediately followed by a non-space character.
    text = re.sub(r";(?=\S)", "", text)
    # Put exactly one space after (and none before) each comma or semicolon.
    text = re.sub(r"\s*([,;])\s*", r"\1 ", text)
    # Final normalisation: every remaining whitespace run becomes one space.
    text = re.sub(r"\s+", " ", text).strip()
    return text


def update_response(text: str) -> str:
    """
    Renumber bracketed references such as ``[3]`` so they run 1, 2, 3, ...

    The distinct reference numbers found in the text are ranked in ascending
    order and each reference is replaced by its rank.

    :param text: Text containing references like ``[1]``, ``[3]``, ``[5]``.
    :return: The text with references renumbered consecutively from 1.
    """
    ref_numbers = sorted({int(number) for number in _CITATION_RE.findall(text)})
    ref_mapping = {old: new for new, old in enumerate(ref_numbers, start=1)}

    # Substitute in a single pass so an already-renumbered reference can never
    # be picked up again by a later replacement (e.g. "[0] [1]" must become
    # "[1] [2]", not "[2] [2]").
    def renumber(match):
        return f"[{ref_mapping[int(match.group(1))]}]"

    return _CITATION_RE.sub(renumber, text)


def renumber_sources(source_list: list) -> list:
    """
    Relabel sources sequentially as ``"source 1: ..."``, ``"source 2: ..."``.

    The content of each entry is whatever follows its first ``": "``. An
    entry without that separator is kept whole as the content instead of
    raising ``IndexError``.

    :param source_list: List of source strings, in the desired order.
    :return: New list of relabelled source strings (lowercase ``source``).
    """
    new_sources = []
    for number, source in enumerate(source_list, start=1):
        _, separator, content = source.partition(": ")
        if not separator:
            content = source
        new_sources.append(f"source {number}: {content}")
    return new_sources


def sort_and_renumber_sources(source_list: list) -> list:
    """
    Sort sources by their ``Source N`` number and renumber them from 1.

    Entries without a ``Source N`` marker sort last and are left unchanged.

    :param source_list: List of strings containing source information.
    :return: Sorted and renumbered list of sources.
    """

    def extract_source_number(source):
        match = _SOURCE_NUMBER_RE.search(source)
        return int(match.group(1)) if match else float("inf")

    sorted_sources = sorted(source_list, key=extract_source_number)

    # Rewrite only the first marker (the one used as the sort key) so that
    # mentions of other sources inside the body are not overwritten.
    return [
        re.sub(r"Source \d+", f"Source {idx}", source, count=1)
        for idx, source in enumerate(sorted_sources, 1)
    ]


def seperate_to_list(text: str) -> list:
    """
    Split a text into a flat list of non-empty, stripped fragments.

    The text is split on newlines, ``"Source N:"`` markers are removed, and
    each line is further split around runs of uppercase letters/whitespace
    that end in ``.``, ``!`` or ``?``; those runs are kept as fragments.

    :param text: Text to split.
    :return: List of non-empty fragments in their original order.
    """
    final_output = []
    for line in text.split("\n"):
        cleaned_line = re.sub(r"Source \d+\:", "", line)
        # The capturing group makes re.split keep the matched run itself.
        parts = re.split(r"([A-Z\s]+[.!?])", cleaned_line)
        final_output.extend(part.strip() for part in parts if part.strip())
    return final_output


def join_list(items: list) -> str:
    """
    Join strings as ``"a"``, ``"a and b"`` or ``"a, b and c"``.

    :param items: List of strings to join.
    :return: The joined string, or ``""`` when ``items`` is empty.
    """
    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    if len(items) == 2:
        return f"{items[0]} and {items[1]}"
    return ", ".join(items[:-1]) + " and " + items[-1]


def redesign_structure_message(message: str, metadata: list) -> str:
    """
    Replace each ``[n]`` citation with the title from ``metadata[n-1]``.

    Titles are rendered as ``[*title*]``. Citations whose number falls
    outside ``1..len(metadata)`` are left as they are.

    :param message: Text containing ``[n]`` citations.
    :param metadata: Sequence of dicts, each providing a ``"title"`` key.
    :return: The message with in-range citations replaced by titles.
    """
    if not metadata:
        return message

    def replace_citation(match):
        citation_number = int(match.group(1))
        if 1 <= citation_number <= len(metadata):
            return f"[*{metadata[citation_number - 1]['title']}*]"
        return match.group(0)

    return _CITATION_RE.sub(replace_citation, message)


def extract_sorted_page_numbers(content: str) -> list:
    """
    Collect the page numbers referenced as ``[p-166]``, ``[p-163]``, etc.

    :param content: Text containing page references.
    :return: Sorted list of the distinct page numbers as integers.
    """
    page_numbers = re.findall(r"\[p-(\d+)\]", content)
    return sorted(set(map(int, page_numbers)))


def filter_metadata_by_pages(metadata: list, pages: list) -> list:
    """
    Build one combined metadata record for the given page numbers.

    Every field except the pages is taken from ``metadata[0]``.

    :param metadata: Sequence of metadata dicts; only the first one is read.
    :param pages: Page numbers to store under ``"page_number"``.
    :return: One-element list with the combined record, or ``[]`` when either
        argument is empty.
    """
    if not (pages and metadata):
        return []

    first_entry = metadata[0]
    combined = {"page_number": pages}
    for key in _SHARED_METADATA_KEYS:
        combined[key] = first_entry[key]
    return [combined]