Spaces:
Sleeping
Sleeping
import re | |
def parse_topics_to_dict(text):
    """Parse a numbered outline into a ``{topic: [sub_topics]}`` dictionary.

    A line of the form ``"<digits>. <title>"`` starts a new topic, and a line
    of the form ``"* <item>"`` adds a sub-topic to the most recent topic.
    Every line is stripped of surrounding whitespace before matching; lines
    matching neither form are ignored, as are bullets that appear before the
    first topic. A topic title that appears again restarts with an empty list.

    The resulting dictionary is printed to stdout before being returned.
    """
    numbered_heading = re.compile(r"^\d+\.\s+(.*)$")
    bullet_item = re.compile(r"^\*\s+(.*)$")

    parsed = {}
    active_heading = None
    for raw_line in text.strip().split("\n"):
        stripped = raw_line.strip()

        heading_match = numbered_heading.match(stripped)
        if heading_match is not None:
            active_heading = heading_match.group(1)
            parsed[active_heading] = []
            continue

        bullet_match = bullet_item.match(stripped)
        # Bullets seen before any heading have nowhere to go and are dropped.
        if bullet_match is not None and active_heading:
            parsed[active_heading].append(bullet_match.group(1))

    print(parsed)
    return parsed
def remove_all_sources(text):
    """Strip every ``Source <n>: ...`` section from *text*.

    Each section runs from its ``Source <n>:`` label up to the next such label
    or the end of the text. The label is matched case-insensitively, so
    lowercase ``source 1:`` labels are removed as well. The remaining text is
    returned with surrounding whitespace stripped.
    """
    # \b keeps the case-insensitive match from firing inside longer words
    # such as "resource 3:".
    pattern = r"\bSource \d+:.*?(?=\bSource \d+:|$)"
    # DOTALL lets a section span several lines; IGNORECASE accepts any casing
    # of the "Source" label.
    updated_text = re.sub(pattern, "", text, flags=re.DOTALL | re.IGNORECASE)
    return updated_text.strip()
def clean_text(text):
    """Return *text* with whitespace and comma/semicolon spacing normalized.

    The substitutions below are applied in order. Because the last one
    collapses every whitespace run (newlines included) into a single space,
    the result is always a single line with no leading or trailing whitespace.
    """
    rewrite_steps = (
        # Collapse runs of two or more whitespace characters.
        (r"\s{2,}", " "),
        # Turn newlines into spaces unless a digit follows (numbered points).
        (r"\n(?!\s*\d)", " "),
        # Drop semicolons that are glued to the following character.
        (r";(?=\S)", ""),
        # No whitespace before a comma/semicolon, exactly one space after it.
        (r"\s*([,;])\s*", r"\1 "),
        # Final pass: any remaining whitespace run becomes one space.
        (r"\s+", " "),
    )
    for pattern, replacement in rewrite_steps:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def update_response(text):
    """Renumber bracketed citations such as ``[3]`` to be consecutive from 1.

    The distinct citation numbers found in *text* are sorted ascending and
    mapped to 1, 2, 3, ...; every occurrence is then rewritten to its new
    number. Text without citations is returned unchanged.
    """
    citation = re.compile(r"\[(\d+)\]")
    distinct_refs = sorted({int(number) for number in citation.findall(text)})
    renumbered = {old: new for new, old in enumerate(distinct_refs, start=1)}

    # Rewrite everything in one pass. Replacing one number at a time would
    # let an already-rewritten citation be rewritten again (e.g. [0] -> [1]
    # followed by [1] -> [2]) and would miss zero-padded forms like "[03]".
    def _renumber(match):
        return f"[{renumbered[int(match.group(1))]}]"

    return citation.sub(_renumber, text)
def renumber_sources(source_list):
    """Relabel each entry of *source_list* as ``"source <k>: <content>"``.

    ``k`` counts from 1 in list order. ``<content>`` is whatever follows the
    first ``": "`` in the entry; an entry without that separator is kept
    whole as the content instead of raising ``IndexError``.

    Returns a new list; *source_list* is not modified.
    """
    renumbered = []
    for position, source in enumerate(source_list, start=1):
        _label, separator, content = source.partition(": ")
        if not separator:
            # No "label: " prefix to strip, so keep the entry as it is.
            content = source
        renumbered.append(f"source {position}: {content}")
    return renumbered
def seperate_to_list(text):
    """Split *text* into a flat list of non-empty, stripped fragments.

    Processing per line (lines are split on ``"\\n"``):

    1. Every ``Source <n>:`` label is removed. The label is matched
       case-insensitively, so lowercase ``source <n>:`` labels are removed too.
    2. The line is split around each run of uppercase letters/whitespace that
       ends in ``.``, ``!`` or ``?``; those runs are kept as their own
       fragments and the surrounding text stays intact.

    Fragments that are empty after stripping are discarded.
    """
    # \b keeps the case-insensitive label match out of words like "resource".
    source_label = re.compile(r"\bSource \d+:", flags=re.IGNORECASE)
    # The capturing group makes re.split keep the matched run in its output.
    # NOTE(review): this also fires on a single capital before punctuation
    # (e.g. "Plan A.") -- confirm that is intended.
    capital_run = re.compile(r"([A-Z\s]+[.!?])")

    fragments = []
    for line in text.split("\n"):
        unlabeled = source_label.sub("", line)
        for piece in capital_run.split(unlabeled):
            piece = piece.strip()
            if piece:
                fragments.append(piece)
    return fragments
def join_list(items):
    """Join *items* into an English-style list such as ``"a, b and c"``.

    An empty (falsy) *items* yields ``""``, a single item is returned as-is,
    two items are joined with ``" and "``, and longer sequences use commas
    between all but the last pair.
    """
    if not items:
        return ""

    count = len(items)
    if count == 1:
        return items[0]
    if count == 2:
        first, second = items[0], items[1]
        return f"{first} and {second}"

    comma_separated = ", ".join(items[:-1])
    return comma_separated + " and " + items[-1]