File size: 1,874 Bytes
d97a6fa
 
 
 
 
 
 
 
 
 
085b39c
 
 
bad3833
085b39c
 
 
 
 
d97a6fa
 
 
085b39c
d97a6fa
 
 
085b39c
 
 
 
 
 
 
 
 
 
 
 
 
 
d97a6fa
085b39c
d97a6fa
085b39c
 
 
 
 
 
 
 
 
 
 
 
d97a6fa
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from bs4 import BeautifulSoup
from urllib import request
from bot.web_scrapping.searchable_index import SearchableIndex
from bot.utils.show_log import logger
import requests
import os


def save_content_to_file(url=None, text=None, output_folder=None, file_format=None):
    """Persist crawled content as ``combined_content.<file_format>``.

    The writer is selected by ``file_format``:

    * ``'txt'`` -- ``text`` is handed to ``write_text``.
    * ``'pdf'`` -- ``url`` is handed to ``write_pdf``.

    Any other format is logged as a warning and nothing is written.

    Args:
        url: Source URL; only used by the ``'pdf'`` writer.
        text: Iterable of parsed elements; only used by the ``'txt'`` writer.
        output_folder: Directory in which the file is placed. Must be a
            path-like value; ``None`` makes ``os.path.join`` raise
            ``TypeError``.
        file_format: Key selecting the writer, also used as file extension.

    Returns:
        The computed target path. It is returned even when the format is
        unsupported, in which case no file was created by this call.
    """
    file_path = os.path.join(output_folder, f"combined_content.{file_format}")

    # Dispatch table: each entry is a zero-argument callable so the chosen
    # writer only runs after the lookup succeeded.
    write_functions = {
        'txt': lambda: write_text(file_path, text),
        'pdf': lambda: write_pdf(url, file_path),
    }

    write_function = write_functions.get(file_format)
    if write_function is None:
        # Build the list from the table so the message can never advertise
        # formats that have no writer.
        supported = ", ".join(write_functions)
        logger.warning(
            f"Invalid file format {file_format!r}. Supported formats: {supported}"
        )
    else:
        write_function()
        logger.info(f"Content written to {file_path}")

    return file_path


def write_text(file_path, text):
    """Append the ``.text`` of every element in *text* to *file_path*.

    Each element contributes one line (its ``.text`` followed by a newline).
    The file is opened in append mode with UTF-8 encoding, so existing
    content is kept.
    """
    with open(file_path, mode="a", encoding="utf-8") as out:
        out.writelines(f'{element.text}\n' for element in text)


def write_pdf(url, file_path):
    """Download the resource at *url* and store it at *file_path*.

    The transfer is delegated to ``urllib.request.urlretrieve``; its return
    value is discarded.
    """
    request.urlretrieve(url, filename=file_path)


def content_crawler_and_index(url, llm, prompt, file_format='txt', output_folder='learning_documents',
                              timeout=30):
    """Crawl *url*, store its content temporarily and build a searchable index.

    When ``url`` is the sentinel string ``'NO_URL'`` nothing is downloaded and
    ``output_folder`` itself is passed to ``SearchableIndex.embed_index`` as
    the path to index.

    Otherwise the page is fetched, its ``h2``, ``p``, ``i`` and ``ul``
    elements are collected and written via ``save_content_to_file``, the
    resulting file is indexed, and the file is removed again afterwards.

    Args:
        url: Page to crawl, or ``'NO_URL'`` to index ``output_folder`` as is.
        llm: Forwarded unchanged to ``SearchableIndex.embed_index``.
        prompt: Forwarded unchanged to ``SearchableIndex.embed_index``.
        file_format: Format passed to ``save_content_to_file``.
        output_folder: Directory for the temporary file (created if missing).
        timeout: Seconds to wait for the HTTP request before
            ``requests`` raises a timeout error instead of blocking forever.

    Returns:
        Whatever ``SearchableIndex.embed_index`` returns, or ``None`` when
        the HTTP response status is not 200.
    """
    if url == 'NO_URL':
        # Nothing to download or clean up: index the existing location.
        return SearchableIndex.embed_index(url=url, path=output_folder, llm=llm, prompt=prompt)

    # Without a timeout requests waits indefinitely on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        logger.warning("Failed to retrieve content from the URL.")
        return None

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    soup = BeautifulSoup(response.text, "html.parser")
    text = soup.find_all(['h2', 'p', 'i', 'ul'])
    file_path = save_content_to_file(text=text, url=url, output_folder=output_folder, file_format=file_format)

    try:
        return SearchableIndex.embed_index(url=url, path=file_path, llm=llm, prompt=prompt)
    finally:
        # Always remove the temporary file, even if indexing fails: the text
        # writer appends, so a leftover file would leak stale content into
        # the next crawl.
        if os.path.isfile(file_path):
            os.remove(file_path)


# Script entry point: intentionally a no-op, so running this module directly
# does nothing beyond importing its definitions.
if __name__ == '__main__':
    pass