Bot_Development / core /parser.py
dsmultimedika's picture
Improve the code bot development
d57efd6
raw
history blame
3.34 kB
import re
def parse_topics_to_dict(text):
    """Parse a numbered outline into a ``{topic: [sub_topics]}`` dictionary.

    A line of the form ``"<digits>. <title>"`` opens a new topic, and a line
    of the form ``"* <title>"`` is appended to the most recently opened
    topic. Lines are stripped before matching, so indentation is irrelevant.
    Any other line is ignored, as is a bullet that appears before the first
    topic. A topic title that occurs twice restarts with an empty list.

    Args:
        text: The outline text, one entry per line.

    Returns:
        A dict mapping each topic title to the list of its sub-topic titles,
        in order of appearance.
    """
    topic_pattern = re.compile(r"^\d+\.\s+(.*)$")
    sub_topic_pattern = re.compile(r"^\*\s+(.*)$")

    topics = {}
    current_topic = None

    for raw_line in text.strip().split("\n"):
        line = raw_line.strip()

        # Match each pattern once and reuse the match object.
        topic_match = topic_pattern.match(line)
        if topic_match:
            current_topic = topic_match.group(1)
            topics[current_topic] = []
            continue

        sub_topic_match = sub_topic_pattern.match(line)
        if sub_topic_match and current_topic:
            topics[current_topic].append(sub_topic_match.group(1))

    # The parsed result is only returned; it is not echoed to stdout.
    return topics
def remove_all_sources(text):
    """Strip every ``Source <n>: ...`` section from *text*.

    A section begins at a ``Source <n>:`` label, matched in any letter case,
    and extends to the next such label or to the end of the text.

    Args:
        text: The text that may contain source sections.

    Returns:
        The text with all source sections removed and surrounding
        whitespace trimmed.
    """
    pattern = r"Source \d+:(.*?)(?=Source \d+:|$)"
    # re.DOTALL lets a section span several lines. re.IGNORECASE also catches
    # lowercase "source <n>:" labels, the form renumber_sources() produces.
    updated_text = re.sub(pattern, "", text, flags=re.DOTALL | re.IGNORECASE)
    return updated_text.strip()
def clean_text(text):
    """Flatten *text* onto one line and tidy comma/semicolon spacing.

    The result is produced in three passes:

    1. A semicolon immediately followed by a non-whitespace character is
       dropped.
    2. Every comma or remaining semicolon loses the whitespace around it and
       is followed by exactly one space.
    3. Every run of whitespace, newlines included, becomes a single space
       and the ends are trimmed.

    NOTE: because of pass 3 the output never contains a newline, so numbered
    list items end up on the same line as the surrounding text.

    Args:
        text: The text to clean.

    Returns:
        The cleaned, single-line text.
    """
    # Earlier whitespace-only passes are unnecessary: the final collapse
    # below yields the same result without them.
    text = re.sub(r";(?=\S)", "", text)
    text = re.sub(r"\s*([,;])\s*", r"\1 ", text)
    return re.sub(r"\s+", " ", text).strip()
def update_response(text):
    """Renumber bracketed references such as ``[3]`` to run from 1 without gaps.

    The distinct reference numbers found in *text* are sorted and mapped to
    ``1, 2, 3, ...`` in that order. Every marker is then rewritten with its
    new number.

    Args:
        text: The text containing ``[<digits>]`` reference markers.

    Returns:
        The text with every reference marker renumbered. Text without
        markers is returned unchanged.
    """
    reference_pattern = re.compile(r"\[(\d+)\]")

    # Distinct reference numbers, lowest first, mapped to consecutive ranks.
    ref_numbers = sorted({int(number) for number in reference_pattern.findall(text)})
    ref_mapping = {old: new for new, old in enumerate(ref_numbers, start=1)}

    def _renumber(match):
        return f"[{ref_mapping[int(match.group(1))]}]"

    # Rewrite all markers in a single pass. Replacing one number at a time
    # can re-replace a marker that was just written (e.g. "[0] [1]" would
    # become "[2] [2]"), and would miss markers with leading zeros.
    return reference_pattern.sub(_renumber, text)
def renumber_sources(source_list):
    """Relabel each source entry as ``"source <position>: <content>"``.

    The content of an entry is whatever follows its first ``": "``. An entry
    that contains no ``": "`` is kept whole as the content instead of raising
    ``IndexError``.

    Args:
        source_list: Source strings, typically of the form
            ``"<label>: <content>"``.

    Returns:
        A new list with one relabelled string per entry, numbered from 1 in
        input order.
    """
    new_sources = []
    for number, source in enumerate(source_list, start=1):
        _label, separator, content = source.partition(": ")
        if not separator:
            # No "<label>: " prefix to drop, so treat the entry as content.
            content = source
        new_sources.append(f"source {number}: {content}")
    return new_sources
def seperate_to_list(text):
    """Break *text* into a flat list of trimmed, non-empty fragments.

    Each line of *text* has any ``Source <n>:`` label removed. The line is
    then split around every run of uppercase letters and whitespace that
    ends in ``.``, ``!`` or ``?``; those runs are kept as fragments of
    their own. Fragments that are empty after trimming are discarded.
    """
    source_label = re.compile(r"Source \d+\:")
    # The capturing group makes split() keep the matched run in its output.
    capital_run = re.compile(r"([A-Z\s]+[.!?])")

    fragments = []
    for raw_line in text.split("\n"):
        unlabelled = source_label.sub("", raw_line)
        for piece in capital_run.split(unlabelled):
            trimmed = piece.strip()
            if trimmed:
                fragments.append(trimmed)
    return fragments
def join_list(items):
    """Join *items* into a phrase such as ``"a, b and c"``.

    An empty input gives ``""`` and a single item is returned as-is. Two
    items are joined with ``" and "``. Longer inputs are comma-separated,
    with ``" and "`` before the last item.
    """
    if not items:
        return ""

    count = len(items)
    if count == 1:
        return items[0]
    if count == 2:
        first, second = items[0], items[1]
        return f"{first} and {second}"

    leading = ", ".join(items[:-1])
    return leading + " and " + items[-1]