Spaces:
Sleeping
Sleeping
import hashlib | |
import datetime | |
import os | |
import uuid | |
# from rag_app.utils import logger
# logger = logger.get_console_logger("utils")
def extract_urls(data_list):
    """
    Extracts URLs from a list of formatted search-result strings.

    Each item is expected to be a string shaped like
    "Title: <title>, Link: <url>, Summary: <text>". The 'link: ' and
    ', summary:' markers are matched case-insensitively, and the text
    between the first 'link: ' and the next ', summary:' is taken as the URL.

    Parameters:
    - data_list (list): A list of strings, each containing 'Link: ' followed
      later by ', Summary:'.

    Returns:
    - list: At most the last three URLs that could be extracted, in input order.
      Items without a recognisable URL (or that are not strings) are reported
      on stdout and skipped.
    """
    # Local import keeps this function self-contained.
    import re

    # Match on the original string instead of slicing it with indices computed
    # on item.lower(): lower-casing can change the string length (e.g. 'İ'
    # becomes two code points), which would shift the slice and corrupt the URL.
    link_pattern = re.compile(r"link: (.*?), summary:", re.IGNORECASE | re.DOTALL)

    urls = []
    for item in data_list:
        match = link_pattern.search(item) if isinstance(item, str) else None
        if match is None:
            # Either a marker is missing or the item is not a string.
            print("Could not find a URL in the item:", item)
            continue
        urls.append(match.group(1))

    # Only the three most recent sources are returned.
    return urls[-3:]
def format_search_results(search_results):
    """
    Formats a list of dictionaries containing search results into a list of strings.

    Each dictionary is expected to have the keys 'title', 'link', and 'snippet'
    and is rendered as "Title: <title>, Link: <link>, Summary: <snippet>".

    Parameters:
    - search_results (list): A list of dictionaries, each containing 'title', 'link', and 'snippet'.

    Returns:
    - list: One formatted string per search result, in input order. An empty
      input yields an empty list.

    Raises:
    - KeyError: If a result is missing one of the expected keys.
    """
    # Format every result unconditionally: the earlier `len(...) > 1` guard
    # left the result undefined for an empty list or a single search result.
    return [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(**result)
        for result in search_results
    ]
def parse_list_to_dicts(items: list) -> list:
    """
    Parse formatted search-result strings into dictionaries.

    Each item is expected to look like
    "Title: <title>, Link: <url>, Summary: <text>" (markers are case-sensitive).

    Parameters:
    - items (list): A list of formatted strings.

    Returns:
    - list: One dictionary per successfully parsed item with the keys 'url',
      'title', 'hash_id' (the hash_text() digest of the URL) and 'summary'.
      Items missing one of the markers are reported on stdout and skipped.
    """
    title_marker = 'Title: '
    link_marker = ', Link: '
    summary_marker = ', Summary: '

    parsed_items = []
    for item in items:
        try:
            # Locate the markers in order, each search starting after the
            # previous marker. str.index raises on a missing marker, whereas
            # str.find's -1 would be silently folded into the slice offsets
            # and yield garbage fields.
            title_start = item.index(title_marker) + len(title_marker)
            title_end = item.index(link_marker, title_start)
            link_start = title_end + len(link_marker)
            link_end = item.index(summary_marker, link_start)
            summary_start = link_end + len(summary_marker)
        except ValueError:
            print("Could not parse the item:", item)
            continue

        link = item[link_start:link_end]
        parsed_items.append({
            "url": link,
            "title": item[title_start:title_end],
            # Use the hash_text function for the hash_id
            "hash_id": hash_text(link),
            "summary": item[summary_start:],
        })
    return parsed_items
def hash_text(text: str) -> str:
    """
    Compute the MD5 digest of a string.

    Args:
    - text (str): The text to hash; it is encoded with str.encode() defaults.

    Returns:
    str: The digest as a hexadecimal string.
    """
    digest = hashlib.md5()
    digest.update(text.encode())
    return digest.hexdigest()
def convert_timestamp_to_datetime(timestamp: str) -> str:
    """
    Render a POSIX timestamp as a "YYYY-MM-DD HH:MM:SS" string.

    Args:
    - timestamp (str): Seconds since the epoch; converted with int().

    Returns:
    str: The formatted date and time as produced by
    datetime.datetime.fromtimestamp (no explicit timezone is passed).
    """
    seconds = int(timestamp)
    moment = datetime.datetime.fromtimestamp(seconds)
    output_format = "%Y-%m-%d %H:%M:%S"
    return moment.strftime(output_format)
def create_folder_if_not_exists(folder_path: str) -> None:
    """
    Ensure that a folder is present, creating it when the path does not exist.

    A message stating whether the folder was created or was already there is
    printed to stdout.

    Args:
    - folder_path (str): The path of the folder to create.
    """
    # Guard clause: nothing to create when the path is already present.
    if os.path.exists(folder_path):
        print(f"Folder '{folder_path}' already exists.")
        return

    os.makedirs(folder_path)
    print(f"Folder '{folder_path}' created.")
def generate_uuid() -> str:
    """
    Create a new random (version 4) UUID.

    Returns:
    str: The string form of the generated UUID.
    """
    fresh_id = uuid.uuid4()
    return str(fresh_id)