Spaces:
Sleeping
Sleeping
File size: 3,087 Bytes
9002555 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import re
def parse_topics_to_dict(text):
    """Parse a numbered outline into a mapping of topic -> list of sub-topics.

    A topic line looks like ``"1. Title"`` (digits, a dot, whitespace, title).
    A sub-topic line looks like ``"* item"`` and is attached to the most
    recently seen topic. Every line is stripped before matching, so leading
    indentation is ignored. Lines matching neither form are skipped, as are
    sub-topic lines that appear before the first topic.

    Args:
        text: The outline as a single newline-separated string.

    Returns:
        A dict whose keys are topic titles (in order of appearance) and whose
        values are lists of sub-topic strings. A topic title that appears
        more than once restarts with an empty list.
    """
    topic_pattern = re.compile(r"^\d+\.\s+(.*)$")
    sub_topic_pattern = re.compile(r"^\*\s+(.*)$")

    topics = {}
    current_topic = None
    for raw_line in text.strip().split("\n"):
        line = raw_line.strip()

        # Match each pattern once and reuse the match object.
        topic_match = topic_pattern.match(line)
        if topic_match:
            current_topic = topic_match.group(1)
            topics[current_topic] = []
            continue

        sub_topic_match = sub_topic_pattern.match(line)
        if sub_topic_match and current_topic:
            topics[current_topic].append(sub_topic_match.group(1))

    # No debug printing here: this is a pure parsing helper and should not
    # write to stdout as a side effect.
    return topics
def remove_all_sources(text):
    """Strip every ``Source N: ...`` section from *text*.

    A section starts at a ``Source <digits>:`` label and runs up to the next
    such label or the end of the text. Labels are matched case-insensitively,
    so ``source 1:`` and ``SOURCE 1:`` are removed as well.

    Args:
        text: The text that may contain source sections.

    Returns:
        The text with all source sections removed and surrounding whitespace
        stripped.
    """
    # (?<![A-Za-z]) keeps words that merely end in "source" (e.g.
    # "Resource 1:") from being treated as a label now that matching is
    # case-insensitive.
    label = r"(?<![A-Za-z])Source \d+:"
    pattern = rf"{label}.*?(?={label}|$)"
    # re.DOTALL lets a section span newlines; re.IGNORECASE matches any
    # capitalisation of the label.
    updated_text = re.sub(pattern, "", text, flags=re.DOTALL | re.IGNORECASE)
    return updated_text.strip()
def clean_text(text):
    """Normalise whitespace and comma/semicolon spacing in *text*.

    The substitutions below run in order, each on the output of the previous
    one. The last pass collapses every remaining whitespace run (including
    newlines kept by the second pass) into a single space, so the result is
    always a single line.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    substitutions = (
        # Collapse runs of two or more whitespace characters.
        (r"\s{2,}", " "),
        # Replace a newline with a space unless a digit follows (numbered points).
        (r"\n(?!\s*\d)", " "),
        # Drop a semicolon that is immediately followed by a non-space character.
        (r";(?=\S)", ""),
        # Put exactly one space after a comma/semicolon and none before it.
        (r"\s*([,;])\s*", r"\1 "),
        # Collapse any remaining whitespace run to a single space.
        (r"\s+", " "),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def update_response(text):
    """Renumber bracketed references so they run consecutively from 1.

    All ``[N]`` markers are collected, the distinct numbers are sorted, and
    each is mapped to its 1-based rank. For example ``[3] ... [7] ... [3]``
    becomes ``[1] ... [2] ... [1]``.

    Args:
        text: The text containing ``[N]`` reference markers.

    Returns:
        The text with every reference marker replaced by its new number.
    """
    ref_pattern = re.compile(r"\[(\d+)\]")

    # Distinct reference numbers, ascending, mapped to their 1-based rank.
    ref_numbers = sorted({int(number) for number in ref_pattern.findall(text)})
    ref_mapping = {old: new for new, old in enumerate(ref_numbers, start=1)}

    def _renumber(match):
        return f"[{ref_mapping[int(match.group(1))]}]"

    # Substitute in a single pass. Replacing one number at a time can cascade
    # (e.g. [0] -> [1], then every [1] -> [2]) and misses zero-padded markers
    # such as [01].
    return ref_pattern.sub(_renumber, text)
def renumber_sources(source_list):
    """Relabel source entries as ``source 1: ...``, ``source 2: ...`` in order.

    Everything up to and including the first ``": "`` in an entry is treated
    as its old label and discarded. An entry with no ``": "`` is kept whole
    as the content, so malformed entries no longer raise an error.

    Args:
        source_list: Iterable of source strings, typically ``"<label>: <content>"``.

    Returns:
        A new list of strings of the form ``"source <n>: <content>"``, with
        ``n`` starting at 1.
    """
    new_sources = []
    for number, source in enumerate(source_list, start=1):
        # partition never raises; an empty separator means ": " was absent.
        _label, separator, remainder = source.partition(": ")
        content = remainder if separator else source
        new_sources.append(f"source {number}: {content}")
    return new_sources
def seperate_to_list(text):
    """Break *text* into a flat list of trimmed, non-empty fragments.

    Each line of the input is processed independently:

    1. Any ``Source <digits>:`` label is removed (case-sensitive).
    2. The line is split around every run of uppercase letters/whitespace
       that ends in ``.``, ``!`` or ``?``. Those runs are kept as their own
       fragments.
    3. Fragments are stripped and empty ones are dropped.

    Args:
        text: Newline-separated input text.

    Returns:
        The list of fragments, in order of appearance.
    """
    source_label = re.compile(r"Source \d+\:")
    # The capturing group makes re.split keep the matched run in its output.
    upper_run = re.compile(r"([A-Z\s]+[.!?])")

    fragments = []
    for raw_line in text.split("\n"):
        unlabeled = source_label.sub("", raw_line)
        for piece in upper_run.split(unlabeled):
            trimmed = piece.strip()
            if trimmed:
                fragments.append(trimmed)
    return fragments
|