File size: 3,087 Bytes
9002555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re


def parse_topics_to_dict(text):
    """Build a ``{topic: [sub_topic, ...]}`` dict from an outline string.

    Each line is stripped of surrounding whitespace and then classified:

    * ``"<digits>. <title>"`` starts a new topic named ``<title>`` with an
      empty sub-topic list (a repeated title resets its list).
    * ``"* <item>"`` appends ``<item>`` to the most recent topic; bullets
      that appear before any topic line are dropped.
    * Any other line is ignored.

    The resulting dict is printed to stdout before being returned.
    """
    heading_re = re.compile(r"^\d+\.\s+(.*)$")
    bullet_re = re.compile(r"^\*\s+(.*)$")

    topics = {}
    current_topic = None

    for raw_line in text.strip().split("\n"):
        entry = raw_line.strip()

        heading = heading_re.match(entry)
        if heading is not None:
            current_topic = heading.group(1)
            topics[current_topic] = []
            continue

        bullet = bullet_re.match(entry)
        if bullet is not None and current_topic:
            topics[current_topic].append(bullet.group(1))

    print(topics)
    return topics


def remove_all_sources(text):
    """Strip every ``Source <n>: ...`` section from *text*.

    A section starts at a ``Source <digits>:`` label (matched
    case-insensitively, so ``source 2:`` is removed as well) and runs up to
    the next such label or the end of the text. Whatever remains is returned
    with leading and trailing whitespace removed.
    """
    # Lazy body + lookahead: consume as little as possible until the next
    # label (left in place for the following match) or the end of the text.
    pattern = r"Source \d+:(.*?)(?=Source \d+:|$)"

    # DOTALL lets a section span several lines; IGNORECASE makes the label
    # match regardless of capitalisation, as the original comment intended.
    updated_text = re.sub(pattern, "", text, flags=re.DOTALL | re.IGNORECASE)

    return updated_text.strip()


def clean_text(text):
    """Tidy whitespace and comma/semicolon spacing in *text*.

    The substitutions below are applied in order. Note that the final pass
    collapses every remaining whitespace run (newlines included) into one
    space, so the returned string is always a single line with no leading
    or trailing whitespace.
    """
    passes = (
        # Runs of two or more whitespace characters become one space.
        (r"\s{2,}", " "),
        # Newlines not followed by (optional whitespace and) a digit become spaces.
        (r"\n(?!\s*\d)", " "),
        # Drop a semicolon that is immediately followed by a non-space character.
        (r";(?=\S)", ""),
        # Commas/semicolons: no whitespace before, exactly one space after.
        (r"\s*([,;])\s*", r"\1 "),
        # Final normalisation of any remaining whitespace run.
        (r"\s+", " "),
    )

    for pattern, replacement in passes:
        text = re.sub(pattern, replacement, text)

    return text.strip()


def update_response(text):
    """Renumber bracketed citations in *text* so they run 1..N without gaps.

    Every ``[<digits>]`` marker is collected, the distinct numbers are sorted
    ascending, and each is mapped to its 1-based rank. For example the
    citations ``[3] [5] [3]`` become ``[1] [2] [1]``.

    All markers are rewritten in a single pass. Replacing one number at a
    time would let an already-renumbered marker be rewritten again whenever
    a new number equals a still-pending old one (e.g. ``[0] [1]``). A single
    pass also rewrites zero-padded markers such as ``[01]``, which are
    treated as the same reference as ``[1]``.
    """
    citation_re = re.compile(r"\[(\d+)\]")

    # Distinct reference numbers, in ascending order.
    ref_numbers = sorted({int(number) for number in citation_re.findall(text)})

    # Old reference number -> new consecutive number starting at 1.
    ref_mapping = {old: new for new, old in enumerate(ref_numbers, start=1)}

    def _renumber(match):
        return f"[{ref_mapping[int(match.group(1))]}]"

    return citation_re.sub(_renumber, text)


def renumber_sources(source_list):
    """Relabel each entry of *source_list* as ``source <k>: <content>``.

    ``k`` is the entry's 1-based position in the list. ``<content>`` is
    everything after the first ``": "`` in the entry. Entries without that
    separator no longer raise ``IndexError``: a leading ``Source <n>:`` label
    (any capitalisation) is stripped if present, otherwise the whole entry is
    used as the content.

    Returns a new list; *source_list* is not modified.
    """
    new_sources = []
    for number, source in enumerate(source_list, start=1):
        _, separator, remainder = source.partition(": ")
        if separator:
            content = remainder
        else:
            # No "label: " separator: drop a tightly written "Source N:"
            # prefix if there is one, and keep the rest as the content.
            content = re.sub(
                r"^\s*source \d+:\s*", "", source, flags=re.IGNORECASE
            )
        new_sources.append(f"source {number}: {content}")
    return new_sources


def seperate_to_list(text):
    """Break *text* into a flat list of non-empty, stripped fragments.

    The text is processed line by line (split on ``"\\n"``):

    1. Every ``Source <digits>:`` label is removed from the line.
    2. The line is split around each run of uppercase ASCII letters and/or
       whitespace that is immediately followed by ``.``, ``!`` or ``?``.
       The matched run is kept as its own fragment, alongside the text
       before and after it.
    3. Fragments are stripped, and empty ones are discarded.
    """
    pieces = []
    for raw_line in text.split("\n"):
        unlabeled = re.sub(r"Source \d+\:", "", raw_line)
        # The capturing group makes re.split return the delimiter runs too.
        for fragment in re.split(r"([A-Z\s]+[.!?])", unlabeled):
            trimmed = fragment.strip()
            if trimmed:
                pieces.append(trimmed)
    return pieces