Spaces:

hemangthakur
/

seekr

Paused

Hemang Thakur

changed .env file path to writable directory

08c6a9b 4 months ago

7.58 kB

	import os
	import re
	import gc
	import torch
	import transformers
	from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter

	# Path of the .env file read by read_env() and overwritten by update_env_vars():
	# it lives in the directory named by the WRITABLE_DIR environment variable,
	# falling back to /tmp when that variable is unset.
	ENV_FILE_PATH = os.path.join(os.getenv("WRITABLE_DIR", "/tmp"), ".env")

	def remove_markdown(text: str) -> str:
	    """Strip common Markdown markup from ``text`` and return plain text.

	    The substitutions run in a fixed order. Images are handled before links
	    because ``![alt](url)`` contains a link-shaped substring; stripping links
	    first would leave a stray ``!alt`` behind.

	    Args:
	        text: Markdown-formatted input.

	    Returns:
	        The text with code fences, header markers, emphasis, strikethrough,
	        inline-code backticks, images, links, blockquote markers, list
	        markers and horizontal rules removed, and with leading/trailing
	        whitespace stripped. Code-block contents and link text are kept;
	        images are dropped entirely.
	    """
	    # Drop the code fences themselves but keep the code they enclose
	    text = re.sub(r'```[a-zA-Z]*\n', '', text)  # Opening fence plus language tag
	    text = re.sub(r'```', '', text)  # Any remaining fences

	    # Remove headers
	    text = re.sub(r'^\s*#+\s+', '', text, flags=re.MULTILINE)

	    # Remove bold and italic (double markers first, so they are not
	    # half-consumed by the single-marker patterns)
	    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
	    text = re.sub(r'__(.*?)__', r'\1', text)
	    text = re.sub(r'\*(.*?)\*', r'\1', text)
	    text = re.sub(r'_(.*?)_', r'\1', text)

	    # Remove strikethrough
	    text = re.sub(r'~~(.*?)~~', r'\1', text)

	    # Remove inline code
	    text = re.sub(r'`(.*?)`', r'\1', text)

	    # Remove images (must precede links, see docstring)
	    text = re.sub(r'!\[(.*?)\]\((.*?)\)', '', text)

	    # Remove links, keeping the link text
	    text = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', text)

	    # Remove blockquotes
	    text = re.sub(r'^\s*>\s+', '', text, flags=re.MULTILINE)

	    # Remove list markers (bulleted, then numbered)
	    text = re.sub(r'^\s*[*+-]\s+', '', text, flags=re.MULTILINE)
	    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)

	    # Remove horizontal lines
	    text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)

	    # Remove any remaining markdown symbols
	    text = re.sub(r'[*_~`]', '', text)

	    return text.strip()

	def remove_outer_markdown_block(chunk, _acc={"b": ""}):
	    """Unwrap ```` ```markdown ```` fences from text that arrives in chunks.

	    Each call appends ``chunk`` to a buffer, emits everything that can be
	    safely released, and keeps the rest for the next call. Text is held back
	    only while the buffer contains an opening ```` ```markdown ```` fence
	    whose closing fence has not arrived yet.

	    Args:
	        chunk: The next piece of text.
	        _acc: Buffer holder with a single key ``"b"``. The default dict is
	            created once and shared by every call that does not pass its
	            own, which is what lets the buffer persist between calls. Pass a
	            fresh ``{"b": ""}`` to process an independent stream.

	    Returns:
	        The releasable text with complete markdown fences replaced by their
	        contents; ``""`` when everything is still being buffered.
	    """
	    _acc["b"] += chunk
	    # Group 1 is the fenced content; DOTALL lets it span lines.
	    fence = re.compile(r'```markdown\s*\n(.*?)\n?```', re.DOTALL | re.IGNORECASE)
	    released = []

	    # Unwrap every complete fence currently in the buffer.
	    while True:
	        match = fence.search(_acc["b"])
	        if not match:
	            break

	        start, end = match.span()
	        released.append(_acc["b"][:start] + match.group(1))
	        _acc["b"] = _acc["b"][end:]

	    # Flush the remainder unless it holds an unclosed opening fence.
	    if '```markdown' not in _acc["b"].lower():
	        released.append(_acc["b"])
	        _acc["b"] = ""

	    return "".join(released)

	def clear_gpu_memory():
	    """Release cached CUDA memory on every visible GPU.

	    Does nothing when ``torch.cuda.is_available()`` is false. Otherwise each
	    device is made current in turn, its peak-memory statistics are reset and
	    its cache is emptied. The current device is left set to the last one
	    visited.

	    Raises:
	        Exception: If any CUDA call fails; the original error is attached as
	            the cause.
	    """
	    if not torch.cuda.is_available():
	        return

	    try:
	        print("Starting the GPU memory cleanup process...")
	        # Clear CUDA cache
	        torch.cuda.empty_cache()
	        # Reset all GPU memory
	        device_count = torch.cuda.device_count()
	        print(f"Number of GPUs: {device_count}")
	        for device_id in range(device_count):
	            print(f"Clearing GPU memory and cache for device {device_id}...")
	            # Set current device before operations
	            torch.cuda.set_device(device_id)
	            torch.cuda.reset_peak_memory_stats(torch.cuda.current_device())
	            torch.cuda.empty_cache()
	            # Wait for pending work on this device, then collect IPC memory
	            torch.cuda.synchronize()
	            torch.cuda.ipc_collect()
	    except Exception as e:
	        raise Exception(f"Error clearing GPU memory and cache: {e}") from e

	def clear_memory():
	    """Log tracked tensors/models and run a garbage-collection pass.

	    Walks every object tracked by the garbage collector. For tensors, and for
	    transformers models, tokenizers and anything whose type name contains
	    ``SentenceTransformer``, the loop variable is unbound with ``del``.

	    Note that ``del obj`` only removes this function's own reference; an
	    object is actually freed only if nothing else still refers to it. Errors
	    raised while inspecting an individual object are printed and skipped so
	    one misbehaving object does not abort the sweep.
	    """
	    print("Deleting all tensors and models...")
	    for obj in gc.get_objects():
	        try:
	            if torch.is_tensor(obj):
	                del obj
	            elif isinstance(
	                obj,
	                (
	                    transformers.PreTrainedModel,
	                    transformers.tokenization_utils_base.PreTrainedTokenizerBase,
	                ),
	            ) or "SentenceTransformer" in str(type(obj)):
	                # Pick the most specific name available for the log line
	                if hasattr(obj, "name_or_path"):
	                    model_name = obj.name_or_path
	                elif hasattr(obj, "config") and hasattr(obj.config, "_name_or_path"):
	                    model_name = obj.config._name_or_path
	                else:
	                    model_name = str(type(obj))  # Fallback to type if name is not found

	                print(f"Deleting model: {model_name}")  # Log the model name
	                del obj
	        except Exception as e:
	            print(f"Error during deletion: {e}")

	    gc.collect()  # Run garbage collection

	# Function to chunk text
	def chunk_text(input_text, max_chunk_length=100, overlap=0, context_length=None):
	    """Split ``input_text`` into chunks and report a span for each chunk.

	    The text is first split with ``RecursiveCharacterTextSplitter`` using
	    ``len`` as the length function, so that stage measures size in
	    characters. When ``context_length`` is falsy, each piece is split again
	    with ``TokenTextSplitter``.

	    Args:
	        input_text: Text to split.
	        max_chunk_length: Chunk size used when ``context_length`` is not a
	            positive int; also the ``TokenTextSplitter`` chunk size.
	        overlap: ``chunk_overlap`` passed to both splitters.
	        context_length: If a positive int, used as the first-stage chunk
	            size, and the token-splitting stage is skipped.

	    Returns:
	        ``(final_chunks, span_annotations)``. ``span_annotations[i]`` is a
	        ``(start, end)`` pair of running offsets over the concatenation of
	        the returned chunks (``end - start == len(final_chunks[i])``).
	        NOTE(review): these are not necessarily offsets into ``input_text``
	        if the splitters drop separators or repeat overlapping text -
	        confirm against callers.
	    """
	    # Use context_length if provided, otherwise use max_chunk_length
	    if isinstance(context_length, int) and context_length > 0:
	        chunk_size = context_length
	    else:
	        chunk_size = max_chunk_length

	    splitter = RecursiveCharacterTextSplitter(
	        separators=["\n\n", "\n", ". ", " ", ""],
	        chunk_size=chunk_size,
	        chunk_overlap=overlap,
	        length_function=len,
	    )
	    chunks = splitter.split_text(input_text)

	    # NOTE(review): this test is plain truthiness, whereas chunk_size above
	    # requires a positive int; a truthy but invalid context_length therefore
	    # falls back to max_chunk_length yet still skips token splitting.
	    if not context_length:
	        token_splitter = TokenTextSplitter(chunk_size=max_chunk_length, chunk_overlap=overlap)
	    else:
	        token_splitter = None

	    final_chunks = []
	    span_annotations = []
	    current_position = 0

	    for chunk in chunks:
	        # If token_splitter exists, use it. Otherwise, use the chunk as is
	        current_chunks = token_splitter.split_text(chunk) if token_splitter else [chunk]
	        final_chunks.extend(current_chunks)

	        for piece in current_chunks:
	            span_end = current_position + len(piece)
	            span_annotations.append((current_position, span_end))
	            current_position = span_end

	    return final_chunks, span_annotations

	# Function to read .env file
	def read_env():
	    """Parse the file at ``ENV_FILE_PATH`` into a dict.

	    Blank lines and lines starting with ``#`` are skipped, as are lines
	    without an ``=``. Each remaining line is split at its first ``=``; the
	    name and value are stripped of surrounding whitespace. Quotes are not
	    interpreted.

	    Returns:
	        Mapping of variable name to value; empty when the file is missing.
	    """
	    env_dict = {}
	    # Open directly instead of checking existence first, so a file removed
	    # between the check and the open cannot cause a crash.
	    try:
	        with open(ENV_FILE_PATH, "r", encoding="utf-8") as f:
	            for line in f:
	                line = line.strip()
	                if not line or line.startswith("#"):
	                    continue
	                var, sep, val = line.partition("=")
	                if sep:
	                    env_dict[var.strip()] = val.strip()
	    except FileNotFoundError:
	        pass
	    return env_dict

	# Function to update .env file
	def update_env_vars(new_values: dict):
	    """Replace the contents of the file at ``ENV_FILE_PATH`` with ``new_values``.

	    The file is opened in write mode, so any variable not present in
	    ``new_values`` is lost; merge with ``read_env()`` first to keep existing
	    entries. The parent directory is created if it does not exist yet.

	    Args:
	        new_values: Mapping written one ``NAME=value`` pair per line, in
	            iteration order. Values are written verbatim (no quoting).
	    """
	    env_dir = os.path.dirname(ENV_FILE_PATH)
	    if env_dir:
	        os.makedirs(env_dir, exist_ok=True)

	    # Overwrite .env file with new values
	    with open(ENV_FILE_PATH, "w", encoding="utf-8") as f:
	        f.writelines(f"{var}={val}\n" for var, val in new_values.items())

	# Function to prepare provider key updates dictionary
	def prepare_provider_key_updates(provider: str, multiline_keys: str) -> dict:
	    """Turn a block of API keys into numbered environment-variable entries.

	    Args:
	        provider: One of ``"openai"``, ``"google"``, ``"xai"`` or
	            ``"anthropic"`` (matched exactly, case-sensitive).
	        multiline_keys: Keys separated by newlines; blank lines are ignored
	            and each key is stripped of surrounding whitespace.

	    Returns:
	        Dict such as ``{"OPENAI_API_KEY_1": ..., "OPENAI_API_KEY_2": ...}``,
	        numbered from 1 in input order. Empty for an unrecognised provider.
	    """
	    env_prefixes = {
	        "openai": "OPENAI_API_KEY",
	        "google": "GOOGLE_API_KEY",
	        "xai": "XAI_API_KEY",
	        "anthropic": "ANTHROPIC_API_KEY",
	    }
	    keys = [ln.strip() for ln in multiline_keys.splitlines() if ln.strip()]

	    prefix = env_prefixes.get(provider)
	    if prefix is None:
	        return {}

	    return {f"{prefix}_{i}": key for i, key in enumerate(keys, start=1)}

	# Function to prepare proxy list dictionary
	def prepare_proxy_list_updates(proxy_list: str) -> dict:
	    """Turn a newline-separated proxy list into numbered entries.

	    Args:
	        proxy_list: Proxies separated by newlines; blank lines are ignored
	            and each entry is stripped of surrounding whitespace.

	    Returns:
	        Dict mapping ``"PROXY_1"``, ``"PROXY_2"``, ... to the proxies in
	        input order; empty when no proxy is given.
	    """
	    entries = [proxy.strip() for proxy in proxy_list.splitlines() if proxy.strip()]
	    return {f"PROXY_{i}": proxy for i, proxy in enumerate(entries, start=1)}