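"""Scraping and document-preparation helpers for fimohealth.com content.

The functions below collect blog articles and patient pages from
https://www.fimohealth.com, turn them into LangChain ``Document`` objects
(optionally chunked by HTML headers), load local ./docs/*.txt files and FAQ
entries from ./docs/faq.csv, and persist/reload the resulting documents as
JSON files under ./docs/.
"""
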
import glob
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain_community.document_loaders import UnstructuredFileLoader
import json
import pandas as pd
import re
from langchain_text_splitters import HTMLHeaderTextSplitter


def retrieve_sources():
    # Base URL of the site and the blog category page to scrape
    base_url = 'https://www.fimohealth.com'
    url = 'https://www.fimohealth.com/categories/long-covid/'

    # Send the request to the page
    response = requests.get(url)

    urls = []
    # Make sure the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect all <a> tags that link to blog articles
        links = soup.find_all('a')
        for link in links:
            href = link.get('href')
            if href and "/gesundheitsblog/" in href:
                urls.append(base_url + href)
    else:
        print('Error fetching the page:', response.status_code)

    return urls

def html_to_chunks():
    urls = retrieve_sources()
    docs = []
    for url in urls:
        response = requests.get(url)

        # Try decoding with different encodings until one succeeds
        content = None
        encodings_to_try = ['utf-8', 'latin-1', 'ISO-8859-1']
        for encoding in encodings_to_try:
            try:
                content = response.content.decode(encoding)
                break
            except UnicodeDecodeError:
                continue
        if content is None:
            print(f'Could not decode {url}, skipping.')
            continue

        # Parse the decoded content with Beautiful Soup and pick the article section
        soup = BeautifulSoup(content, 'html.parser')
        sections = soup.find_all('section', {"class": "section-blog-template-article"})
        if not sections:
            print(f'No article section found on {url}, skipping.')
            continue
        html_string = str(sections[0])

        def clean_article(text):
            # Keep only the part of the article after the "Zurück" (back) link
            index = text.find("Zurück")
            if index != -1:
                text = text[index + len("Zurück"):].strip()

            # Replace the gender colon ":in" with "*in"
            return re.sub(r':in', '*in', text)

        html_string = clean_article(html_string)

        headers_to_split_on = [
            ("h1", "Header 1"),
            ("h2", "Header 2"),
            ("h3", "Header 3"),
            ("h4", "Header 4"),
            ("h5", "Header 5"),
        ]

        # Split the article HTML into chunks by header and tag each chunk with its source URL
        html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        chunks = html_splitter.split_text(html_string)
        for chunk in chunks:
            chunk.metadata["source"] = url
            docs.append(chunk)
    return docs

def retrieve_content(url):
    def clean_article(text):
        # Keep only the part of the page after the "Zurück" (back) link
        index = text.find("Zurück")
        if index != -1:
            text = text[index + len("Zurück"):].strip()

        # Replace the gender colon ":in" with "*in"
        return re.sub(r':in', '*in', text)

    # Send a GET request to the webpage
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the webpage
        soup = BeautifulSoup(response.content, 'html.parser')

        # Concatenate the text content of all <main> elements
        main_elements = soup.find_all('main')
        page_content = ""
        for element in main_elements:
            page_content += element.text

        return clean_article(page_content)
    else:
        # Return an empty string so callers always receive a str
        print(f'Failed to retrieve the webpage: {url} ({response.status_code})')
        return ""

def create_documents():
    urls = retrieve_sources()
    # manually added pages
    urls.append("https://www.fimohealth.com/patienten")
    urls.append("https://www.fimohealth.com/patienten/long-covid")

    documents = []
    for url in urls:
        content = retrieve_content(url)
        documents.append(Document(page_content=content, metadata={"source": url}))

    # Get all the filenames from the docs folder
    files = glob.glob("./docs/*.txt")

    # Load the local text files into readable documents
    for file in files:
        loader = UnstructuredFileLoader(file)
        documents.append(loader.load()[0])

    if len(documents) > 0:
        return documents
    else:
        raise ValueError("No documents could be created.")

def create_faq_documents():
    documents = []
    df = pd.read_csv('./docs/faq.csv', sep=",")
    for _, row in df.iterrows():
        documents.append(Document(
            page_content=f"{row['Title']} \n {row['Text - de']}",
            metadata={"source": f"FAQ - {row['Bereich']}"},
        ))

    # Reuse the generic JSON writer for the FAQ documents
    store_documents(documents, path='./docs/faq_docs.json')

def store_documents(documents, path="./docs/langchain_documents.json"):
    # Convert each langchain Document object to a dictionary
    document_dicts = [doc.__dict__ for doc in documents]

    # Write all documents to a single JSON file
    with open(path, "w") as file:
        json.dump(document_dicts, file, indent=4)

    print(f"All {len(documents)} langchain documents have been saved to '{path}'.")

def read_documents_from_file(file_path="./docs/langchain_documents.json"):
    documents = []
    try:
        with open(file_path, "r") as file:
            document_dicts = json.load(file)
            for doc_dict in document_dicts:
                document = Document(**doc_dict)
                documents.append(document)
        print(f"Successfully read {len(documents)} documents from '{file_path}'.")
        return documents
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return []
    except Exception as e:
        print(f"Error reading documents from '{file_path}': {e}")
        return []
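
# --- Usage sketch (not part of the original module; the flow below is an assumption) ---
# Minimal example of wiring the helpers together: scrape the pages, chunk the
# blog articles, persist everything as JSON, and read it back as a sanity check.
if __name__ == "__main__":
    documents = create_documents()          # scraped pages + local ./docs/*.txt files
    documents += html_to_chunks()           # header-based chunks of the blog articles
    store_documents(documents)              # writes ./docs/langchain_documents.json
    create_faq_documents()                  # writes ./docs/faq_docs.json
    reloaded = read_documents_from_file()   # round-trip check
    print(f"Round-tripped {len(reloaded)} documents.")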