Spaces:
Build error
Build error
File size: 3,008 Bytes
d97a6fa 085b39c d97a6fa 085b39c d97a6fa 085b39c d97a6fa 085b39c d97a6fa 085b39c d97a6fa 085b39c d97a6fa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
from bs4 import BeautifulSoup
from urllib import request
from bot.web_scrapping.searchable_index import SearchableIndex
from bot.utils.show_log import logger
from bot.utils.constanst import set_api_key
import pandas as pd
import requests
import os
# SECURITY: never hard-code the API key in source. Any key that has been
# committed to version control must be treated as compromised and revoked.
# The key is read from the OPENAI_API_KEY environment variable instead.
_OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if _OPENAI_API_KEY:
    set_api_key(api_key=_OPENAI_API_KEY)
else:
    # Do not crash at import time; just make the missing configuration visible.
    logger.warning("OPENAI_API_KEY is not set; the API key was not configured.")
def save_content_to_file(url=None, text=None, output_folder=None, file_format=None):
    """Persist scraped content to ``combined_content.<file_format>``.

    Args:
        url: Source URL; only used by the ``pdf`` writer.
        text: Iterable of elements exposing ``.text``; used by the ``txt``,
            ``csv`` and ``xml`` writers.
        output_folder: Directory in which the output file is placed.
        file_format: One of ``txt``, ``pdf``, ``csv`` or ``xml``.

    Returns:
        The path of the output file. The path is returned even when the
        format is unsupported and nothing was written.
    """
    destination = os.path.join(output_folder, f"combined_content.{file_format}")

    # Each supported format maps to its writer and that writer's arguments.
    writers = {
        'txt': (write_text, (destination, text)),
        'pdf': (write_pdf, (url, destination)),
        'csv': (write_csv, (destination, text)),
        'xml': (write_xml, (destination, text)),
    }

    selected = writers.get(file_format)
    if not selected:
        logger.warning("Invalid file format. Supported formats: txt, pdf, csv, xml")
        return destination

    writer, arguments = selected
    writer(*arguments)
    logger.info(f"Content appended to {destination}")
    return destination
def write_text(file_path, text):
    """Append the ``.text`` of every item in *text* to *file_path*, one per line.

    The file is opened in append mode with UTF-8 encoding, so existing
    content is preserved.
    """
    with open(file_path, mode="a", encoding="utf-8") as handle:
        handle.writelines(f'{node.text}\n' for node in text)
def write_pdf(url, file_path):
    """Download the resource at *url* and store it at *file_path*."""
    # urlretrieve returns a (filename, headers) tuple; it is not needed here.
    request.urlretrieve(url, filename=file_path)
def write_csv(file_path, text):
    """Append the ``.text`` of every item in *text* to *file_path* as CSV rows.

    Rows are written as a single unnamed column: neither the header nor the
    DataFrame index is emitted, and the file is opened in append mode.
    """
    contents = []
    for node in text:
        contents.append(node.text)

    frame = pd.DataFrame({'Content': contents})
    frame.to_csv(file_path, mode='a', header=False, index=False)
def write_xml(file_path, text):
    """Append the ``.text`` of every item in *text* to *file_path* as ``<item>`` elements.

    The text is XML-escaped so that ``&``, ``<`` and ``>`` in the scraped
    content cannot break the markup. The file is opened in append mode with
    UTF-8 encoding and no enclosing root element is written, so the output
    is a flat sequence of ``<item>`` fragments.

    Args:
        file_path: Path of the file to append to.
        text: Iterable of elements exposing a ``.text`` attribute.
    """
    # Imported locally so the function is self-contained.
    from xml.sax.saxutils import escape

    # str() guards against non-string ``.text`` values, which the original
    # f-string formatting accepted as well.
    xml_content = ''.join(f'<item>{escape(str(t.text))}</item>' for t in text)
    with open(file_path, "a", encoding="utf-8") as file:
        file.write(xml_content)
def content_crawler_and_index(url, llm, prompt, file_format='txt', output_folder='learning_documents', timeout=30):
    """Fetch *url*, save its content to disk, and build an index from it.

    Args:
        url: Page to crawl, or the sentinel ``'NO_URL'`` to skip crawling and
            pass ``output_folder`` itself as the path to index.
        llm: Forwarded unchanged to ``SearchableIndex.embed_index``.
        prompt: Forwarded unchanged to ``SearchableIndex.embed_index``.
        file_format: Output format handed to ``save_content_to_file``.
        output_folder: Directory for the temporary content file; created if
            it does not exist.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises; prevents the call from hanging forever on a dead server.

    Returns:
        Whatever ``SearchableIndex.embed_index`` returns, or ``None`` when
        the URL does not answer with HTTP 200.
    """
    if url == 'NO_URL':
        # Nothing to download and nothing to clean up afterwards.
        return SearchableIndex.embed_index(url=url, path=output_folder, llm=llm, prompt=prompt)

    responses = requests.get(url, timeout=timeout)
    if responses.status_code != 200:
        logger.warning("Failed to retrieve content from the URL.")
        return None

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    soup = BeautifulSoup(responses.text, "html.parser")
    text = soup.find_all(['h2', 'p', 'i', 'ul'])
    file_path = save_content_to_file(text=text, url=url, output_folder=output_folder, file_format=file_format)

    try:
        return SearchableIndex.embed_index(url=url, path=file_path, llm=llm, prompt=prompt)
    finally:
        # Writers append to a fixed file name, so a file left behind after a
        # failed indexing run would leak stale content into the next one.
        if os.path.isfile(file_path):
            os.remove(file_path)
if __name__ == '__main__':
    # Running this module as a script is a no-op; the commented example
    # below shows how the crawler is meant to be called.
    pass
    # Example usage (`llm` and `prompt` are required arguments of
    # content_crawler_and_index; ChatOpenAI is not imported in this module
    # and must be imported before running the example):
    #
    # prompt = 'explain the paper'
    # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
    #
    # First URL: an HTML page saved as text before indexing.
    # idx = content_crawler_and_index("https://www.presight.io/terms-of-use.html", llm, prompt, file_format='txt')
    #
    # Second URL: a PDF downloaded from its URL before indexing. The
    # temporary content file is removed after each indexing run.
    # idx = content_crawler_and_index(url='https://arxiv.org/pdf/2309.11235v1.pdf', llm=llm, prompt=prompt, file_format='pdf')
    #
    # Query the resulting index and show the chatbot response.
    # response = SearchableIndex.query(prompt, llm, idx)
    # print(response)
    # logger.info(response)
|