File size: 7,576 Bytes
4279593
 
 
 
 
 
 
08c6a9b
4279593
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85a4a41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4279593
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import os
import re
import gc
import torch
import transformers
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter

# Path of the runtime .env file; lives under WRITABLE_DIR (defaults to /tmp).
ENV_FILE_PATH = os.path.join(os.getenv("WRITABLE_DIR", "/tmp"), ".env")

def remove_markdown(text: str) -> str:
    """Strip common Markdown syntax from *text*, keeping the plain content.

    Code fences are unwrapped (their contents are kept), links keep their
    label text, and images are removed entirely. The result is stripped of
    leading/trailing whitespace.
    """
    # Unwrap code fences; the fenced content itself is preserved.
    text = re.sub(r'```[a-zA-Z]*\n', '', text)  # opening fence with optional language tag
    text = re.sub(r'```', '', text)             # any remaining fence markers

    # Remove headers
    text = re.sub(r'^\s*#+\s+', '', text, flags=re.MULTILINE)

    # Remove bold and italic
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'__(.*?)__', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    text = re.sub(r'_(.*?)_', r'\1', text)

    # Remove strikethrough
    text = re.sub(r'~~(.*?)~~', r'\1', text)

    # Remove inline code
    text = re.sub(r'`(.*?)`', r'\1', text)

    # Remove images BEFORE links: an image `![alt](url)` contains the link
    # syntax `[alt](url)`, so the link rule would otherwise reduce it to
    # `!alt` and the image rule would never match.
    text = re.sub(r'!\[(.*?)\]\((.*?)\)', '', text)

    # Remove links, keeping only the link text
    text = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', text)

    # Remove blockquotes
    text = re.sub(r'^\s*>\s+', '', text, flags=re.MULTILINE)

    # Remove lists
    text = re.sub(r'^\s*[\*\+-]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)

    # Remove horizontal lines
    text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)

    # Remove any remaining markdown symbols
    text = re.sub(r'[*_~`]', '', text)

    return text.strip()

def remove_outer_markdown_block(chunk, _acc={"b":""}):
    """Streaming filter that strips outer ```markdown fences from chunks.

    NOTE(review): the mutable default `_acc` is used deliberately as a
    process-lifetime buffer shared across calls, so a fence that spans chunk
    boundaries can be completed by a later chunk. This makes the function
    stateful and unsafe for concurrent/interleaved streams — confirm that
    callers only ever feed it one stream at a time.

    Returns the text that is safe to emit so far; text that may still belong
    to an unterminated ```markdown fence is held back in the buffer.
    """
    _acc["b"] += chunk
    # Matches one complete fenced block and captures its inner content.
    p = re.compile(r'```markdown\s*\n(.*?)\n?```', re.DOTALL|re.IGNORECASE)
    o = []

    while True:
        m = p.search(_acc["b"])
        if not m:
             break
        
        # Emit everything before the fence plus the fence's inner content,
        # then keep scanning the remainder of the buffer.
        s,e = m.span()
        o.append(_acc["b"][:s]+m.group(1))
        _acc["b"] = _acc["b"][e:]

    # If no (possibly partial) opening fence remains, flush the whole buffer;
    # otherwise hold it until a closing fence (or more text) arrives.
    if '```markdown' not in _acc["b"].lower():
        o.append(_acc["b"])
        _acc["b"] = ""

    return "".join(o)

def clear_gpu_memory():
    """Release cached CUDA memory and reset peak-memory stats on every GPU.

    No-op when CUDA is unavailable.

    Raises:
        RuntimeError: if any CUDA call fails; the original exception is
            chained as the cause so its traceback is preserved.
    """
    # Guard clause instead of wrapping the whole body in an `if`.
    if not torch.cuda.is_available():
        return
    try:
        print("Starting the GPU memory cleanup process...")
        # Clear CUDA cache
        torch.cuda.empty_cache()
        # Reset all GPU memory
        device_count = torch.cuda.device_count()
        print(f"Number of GPUs: {device_count}")
        for device_id in range(device_count):
            print(f"Clearing GPU memory and cache for device {device_id}...")
            # Set current device before operations
            torch.cuda.set_device(device_id)
            torch.cuda.reset_peak_memory_stats(device_id)
            torch.cuda.empty_cache()
        # Wait for in-flight kernels and reclaim inter-process CUDA handles.
        torch.cuda.synchronize()
        torch.cuda.ipc_collect()
    except Exception as e:
        # `from e` keeps the original traceback; RuntimeError is still an
        # Exception subclass, so existing `except Exception` callers work.
        raise RuntimeError(f"Error clearing GPU memory and cache: {e}") from e

def clear_memory():
        """Best-effort sweep over gc-tracked objects: log tensors and models,
        then force a garbage-collection cycle.

        NOTE(review): `del obj` below only unbinds the loop-local name; it
        does not free the object unless this was its last reference. The
        actual reclamation happens in gc.collect() at the end, and only for
        objects that are otherwise unreferenced — confirm this matches the
        caller's expectation of "clearing" memory.
        """
        # Delete all tensors and models
        print("Deleting all tensors and models...")
        for obj in gc.get_objects():
            try:
                if torch.is_tensor(obj):
                    del obj
                elif isinstance(obj, transformers.PreTrainedModel) or \
                     isinstance(obj, transformers.tokenization_utils_base.PreTrainedTokenizerBase) or \
                     "SentenceTransformer" in str(type(obj)):
                    
                    model_name = ""  # Initialize model name
                    if hasattr(obj, "name_or_path"):
                        model_name = obj.name_or_path
                    elif hasattr(obj, "config") and hasattr(obj.config, "_name_or_path"):
                        model_name = obj.config._name_or_path
                    else:
                        model_name = str(type(obj))  # Fallback to type if name is not found

                    print(f"Deleting model: {model_name}")  # Log the model name
                    del obj
            except Exception as e:
                # Broad catch is deliberate: gc.get_objects() can yield objects
                # whose attribute access raises; skip them rather than abort.
                print(f"Error during deletion: {e}")
        
        gc.collect()  # Run garbage collection

# Function to chunk text
def chunk_text(input_text, max_chunk_length=100, overlap=0, context_length=None):
    """Split *input_text* and return ``(final_chunks, span_annotations)``.

    A character-based recursive split runs first; unless a valid (positive
    integer) ``context_length`` is supplied, each piece is then re-split by
    token count. Span annotations are ``(start, end)`` offsets over the
    concatenation of the output chunks — not offsets into ``input_text``
    when ``overlap > 0``.

    Args:
        input_text: Text to split.
        max_chunk_length: Chunk size used when ``context_length`` is invalid.
        overlap: Overlap passed to both splitters.
        context_length: Optional positive int; when valid it overrides
            ``max_chunk_length`` and disables the secondary token split.

    Returns:
        Tuple of (list of chunk strings, list of (start, end) tuples).
    """
    # One definition of "context_length is usable", applied consistently to
    # both the chunk size and the token-splitter decision. (Previously two
    # different truthiness tests were used, so e.g. a negative value silently
    # disabled the token split while still falling back to max_chunk_length.)
    use_context = isinstance(context_length, int) and context_length > 0
    chunk_size = context_length if use_context else max_chunk_length

    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=len
    )
    chunks = splitter.split_text(input_text)

    # Secondary token-based split only when no valid context_length was given.
    token_splitter = None if use_context else \
        TokenTextSplitter(chunk_size=max_chunk_length, chunk_overlap=overlap)

    final_chunks = []
    span_annotations = []
    current_position = 0

    for chunk in chunks:
        # If token_splitter exists, use it. Otherwise, use the chunk as is.
        current_chunks = token_splitter.split_text(chunk) if token_splitter else [chunk]
        final_chunks.extend(current_chunks)

        for tc in current_chunks:
            span_annotations.append((current_position, current_position + len(tc)))
            current_position += len(tc)

    return final_chunks, span_annotations

# Function to read .env file
def read_env():
    """Parse ENV_FILE_PATH into a dict of ``VAR -> value``.

    Blank lines, ``#`` comment lines, and lines without ``=`` are skipped.
    Returns an empty dict when the file does not exist.
    """
    result = {}
    if not os.path.exists(ENV_FILE_PATH):
        return result

    with open(ENV_FILE_PATH, "r", encoding="utf-8") as fh:
        for raw in fh:
            entry = raw.strip()
            if not entry or entry.startswith("#"):
                continue
            if "=" not in entry:
                continue
            # Split on the first '=' only; values may themselves contain '='.
            name, _, value = entry.partition("=")
            result[name.strip()] = value.strip()
    return result

# Function to update .env file
def update_env_vars(new_values: dict):
    """Rewrite ENV_FILE_PATH so it contains exactly the given mapping.

    Any previous file contents are discarded; each ``var=val`` pair is
    written on its own line.
    """
    lines = [f"{var}={val}\n" for var, val in new_values.items()]
    with open(ENV_FILE_PATH, "w", encoding="utf-8") as fh:
        fh.writelines(lines)

# Function to prepare provider key updates dictionary
def prepare_provider_key_updates(provider: str, multiline_keys: str) -> dict:
    """Build numbered env-var entries for a provider's API keys.

    Args:
        provider: One of "openai", "google", "xai", "anthropic".
        multiline_keys: One API key per line; blank lines are ignored.

    Returns:
        Dict like ``{"OPENAI_API_KEY_1": ..., "OPENAI_API_KEY_2": ...}``;
        empty for an unknown provider (same as before).
    """
    # Lookup table replaces four copy-pasted elif branches.
    prefixes = {
        "openai": "OPENAI_API_KEY",
        "google": "GOOGLE_API_KEY",
        "xai": "XAI_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
    }
    prefix = prefixes.get(provider)
    if prefix is None:
        return {}

    keys = [ln.strip() for ln in multiline_keys.splitlines() if ln.strip()]
    return {f"{prefix}_{i}": key for i, key in enumerate(keys, start=1)}

# Function to prepare proxy list dictionary
def prepare_proxy_list_updates(proxy_list: str) -> dict:
    """Build numbered PROXY_N env-var entries from a multiline proxy list.

    Args:
        proxy_list: One proxy per line; blank lines are ignored.

    Returns:
        Dict like ``{"PROXY_1": ..., "PROXY_2": ...}``. (Return annotation
        fixed: the function has always returned a dict, not a list.)
    """
    entries = [proxy.strip() for proxy in proxy_list.splitlines() if proxy.strip()]
    return {f"PROXY_{i}": proxy for i, proxy in enumerate(entries, start=1)}