from bs4 import BeautifulSoup
from urllib import request
from bot.web_scrapping.searchable_index import SearchableIndex
from bot.utils.show_log import logger
from bot.utils.constanst import set_api_key
import pandas as pd
import requests
import os

# Don't hard-code secrets in source; read the key from the environment
# (OPENAI_API_KEY is an assumed variable name here).
set_api_key(api_key=os.getenv('OPENAI_API_KEY'))


def save_content_to_file(url=None, text=None, output_folder=None, file_format=None):
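    """Append `text` (or, for PDFs, download `url`) into one combined file.

    The writer is picked from a dispatch table keyed by `file_format`
    (txt, pdf, csv, xml). Returns the combined file's path either way;
    an unsupported format only logs a warning.
    """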
    file_path = os.path.join(output_folder, f"combined_content.{file_format}")
    write_functions = {
        'txt': lambda: write_text(file_path, text),
        'pdf': lambda: write_pdf(url, file_path),
        'csv': lambda: write_csv(file_path, text),
        'xml': lambda: write_xml(file_path, text)
    }
    write_function = write_functions.get(file_format)
    if write_function:
        write_function()
        logger.info(f"Content appended to {file_path}")
    else:
        logger.warning("Invalid file format. Supported formats: txt, pdf, csv, xml")
    return file_path


def write_text(file_path, text):
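    """Append each parsed element's text to a plain-text file, one line per element."""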
    with open(file_path, "a", encoding="utf-8") as file:
        for t in text:
            file.write(f'{t.text}\n')


def write_pdf(url, file_path):
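    """Download the file at `url` directly; unlike the text writers, this overwrites rather than appends."""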
    request.urlretrieve(url, file_path)


def write_csv(file_path, text):
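    """Append one CSV row per parsed element, without a header row."""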
    df = pd.DataFrame({'Content': [t.text for t in text]})
    df.to_csv(file_path, mode='a', index=False, header=False)


def write_xml(file_path, text):
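    """Append `<item>` elements; no root element is written, so the file is an XML fragment."""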
    xml_content = ''.join([f'<item>{t.text}</item>' for t in text])
    with open(file_path, "a", encoding="utf-8") as file:
        file.write(xml_content)


def content_crawler_and_index(url, llm, prompt, file_format='txt', output_folder='learning_documents'):
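    """Crawl `url`, save its readable content, and build a searchable index.

    The sentinel url 'NO_URL' skips crawling and indexes whatever already
    sits in `output_folder`; otherwise the crawled file is indexed and
    then deleted.
    """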
    if url == 'NO_URL':
        # Skip crawling: index the files already present in the output folder.
        file_path = output_folder
    else:
        response = requests.get(url)
        if response.status_code != 200:
            logger.warning("Failed to retrieve content from the URL.")
            return None
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.find_all(['h2', 'p', 'i', 'ul'])
        file_path = save_content_to_file(text=text, url=url, output_folder=output_folder, file_format=file_format)
    index = SearchableIndex.embed_index(url=url, path=file_path, llm=llm, prompt=prompt)
    if url != 'NO_URL' and os.path.isfile(file_path):
        os.remove(file_path)
    return index


if __name__ == '__main__':
    pass
    # Example usage (assumes ChatOpenAI, e.g. `from langchain.chat_models import ChatOpenAI`):
    # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
    # prompt = 'explain the paper'
    #
    # First URL
    # idx = content_crawler_and_index('https://www.presight.io/terms-of-use.html', llm, prompt, file_format='txt')
    #
    # Second URL (appends content to existing files)
    # idx = content_crawler_and_index(url='https://arxiv.org/pdf/2309.11235v1.pdf', llm=llm, prompt=prompt, file_format='pdf')
    #
    # Example: get a chatbot response from the index
    # response = SearchableIndex.query(prompt, llm, idx)
    # print(response)
    # logger.info(response)