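"""Collect source documents from fimohealth.com and local text files.

The script scrapes every Long COVID blog article linked from
https://www.fimohealth.com/categories/long-covid/, adds two manually
chosen pages, and loads .txt files from ./docs. Each source is wrapped
in a LangChain Document with its origin recorded in the metadata.
"""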
import glob
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain_community.document_loaders import UnstructuredFileLoader



def retrieve_sources():
    # Base URL of the site and the blog overview page to scrape
    base_url = 'https://www.fimohealth.com'
    url = 'https://www.fimohealth.com/categories/long-covid/'

    # Request the overview page
    response = requests.get(url)

    urls = []
    # Make sure the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all <a> tags and keep only links that point to blog articles
        links = soup.find_all('a')
        for link in links:
            href = link.get('href')
            if href and "/gesundheitsblog/" in href:
                complete_url = base_url + href
                urls.append(complete_url)
    else:
        print('Failed to retrieve the webpage:', response.status_code)

    return urls

def retrieve_content(url):
    def clean_article(text):
        # Drop everything up to and including the "Zurück" (back) navigation
        # label; if it is not present, return the text unchanged
        marker = "Zurück"
        index = text.find(marker)
        if index == -1:
            return text.strip()
        return text[index + len(marker):].strip()

    # Send a GET request to the webpage
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the webpage
        soup = BeautifulSoup(response.content, 'html.parser')

        # Collect the text of all <main> elements on the page
        main_elements = soup.find_all('main')
        page_content = ""
        for element in main_elements:
            page_content += element.text

        return clean_article(page_content)
    else:
        print('Failed to retrieve the webpage:', url)
        return ""

urls = retrieve_sources()
# Manually added pages that are not linked from the blog overview
urls.append("https://www.fimohealth.com/patienten")
urls.append("https://www.fimohealth.com/patienten/long-covid")

documents = []
for url in urls:
    content = retrieve_content(url)
    # Skip pages that could not be retrieved
    if content:
        documents.append(Document(page_content=content, metadata={"source": url}))

# Get all the filenames from the docs folder
files = glob.glob("./docs/*.txt")

# Load files into readable documents
for file in files:
    loader = UnstructuredFileLoader(file)
    documents.append(loader.load()[0])
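
# Optional sanity check (illustrative): report how many documents were
# collected and where each one came from, using the metadata set above.
print(f"Collected {len(documents)} documents")
for doc in documents:
    print("-", doc.metadata.get("source", "local file"))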