import glob
import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_text_splitters import HTMLHeaderTextSplitter


def retrieve_sources():
    # The website to scrape
    base_url = 'https://www.fimohealth.com'
    url = 'https://www.fimohealth.com/categories/long-covid/'

    # Send the request to the website
    response = requests.get(url)

    urls = []
    # Make sure the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect all blog article links on the page
        links = soup.find_all('a')
        for link in links:
            href = link.get('href')
            if href and "/gesundheitsblog/" in href:
                complete_url = base_url + href
                urls.append(complete_url)
    else:
        print('Error while fetching the page:', response.status_code)

    return urls


def html_to_chunks():
    urls = retrieve_sources()
    docs = []

    def clean_article(text):
        # Find the index of the word "Zurück"
        index = text.find("Zurück")
        # Keep only the substring that comes after "Zurück"
        substring = text[index + len("Zurück"):].strip()
        # Replace ":in" with "*in"
        substring = re.sub(r':in', '*in', substring)
        return substring

    for url in urls:
        # Fetch the article page
        response = requests.get(url)

        # Try decoding with different encodings until one works
        encodings_to_try = ['utf-8', 'latin-1', 'ISO-8859-1']
        content = response.content
        for encoding in encodings_to_try:
            try:
                content = response.content.decode(encoding)
                break
            except UnicodeDecodeError:
                continue

        # Parse the decoded content and extract the article section
        soup = BeautifulSoup(content, 'html.parser')
        html_string = str(soup.find_all('section', {"class": "section-blog-template-article"})[0])

        html_string = clean_article(html_string)

        headers_to_split_on = [
            ("h1", "Header 1"),
            ("h2", "Header 2"),
            ("h3", "Header 3"),
            ("h4", "Header 4"),
            ("h5", "Header 5"),
        ]

        # Split the article into chunks along its HTML headers
        html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        chunks = html_splitter.split_text(html_string)
        for chunk in chunks:
            chunk.metadata["source"] = url
            docs.append(chunk)

    return docs


def retrieve_content(url):
    def clean_article(text):
        # Find the index of the word "Zurück"
        index = text.find("Zurück")
        # Keep only the substring that comes after "Zurück"
        substring = text[index + len("Zurück"):].strip()
        # Replace ":in" with "*in"
        substring = re.sub(r':in', '*in', substring)
        return substring

    # Send a GET request to the webpage
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the webpage
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the main elements you want to retrieve
        main_elements = soup.find_all('main')

        page_content = ""
        # Iterate over the main elements and collect their text content
        for element in main_elements:
            page_content += element.text

        cleaned_page_content = clean_article(page_content)
        return cleaned_page_content
    else:
        print('Failed to retrieve the webpage')
        return None


def create_documents():
    urls = retrieve_sources()
    # manually added URLs
    urls.append("https://www.fimohealth.com/patienten")
    urls.append("https://www.fimohealth.com/patienten/long-covid")

    documents = []
    for url in urls:
        content = retrieve_content(url)
        if content:
            documents.append(Document(page_content=content, metadata={"source": url}))

    # Get all the filenames from the docs folder
    files = glob.glob("./docs/*.txt")

    # Load files into readable documents
    for file in files:
        loader = UnstructuredFileLoader(file)
        documents.append(loader.load()[0])

    if len(documents) > 0:
        return documents
    else:
        raise TypeError("No documents could be created.")


def create_faq_documents():
    documents = []
    df = pd.read_csv('./docs/faq.csv', sep=",")
    for i, j in df.iterrows():
        documents.append(Document(page_content=f"{j['Title']} \n {j['Text - de']}", metadata={"source": f"FAQ - {j['Bereich']}"}))

    # Convert each Document object to a dictionary
    document_dicts = [doc.__dict__ for doc in documents]

    # Write all documents to a single JSON file
    file_path = './docs/faq_docs.json'
    with open(file_path, "w") as file:
        json.dump(document_dicts, file, indent=4)

    print(f"All {len(documents)} langchain documents have been saved to '{file_path}'.")


def store_documents(documents, path="./docs/langchain_documents.json"):
    # Convert each Document object to a dictionary
    document_dicts = [doc.__dict__ for doc in documents]

    # Write all documents to a single JSON file
    file_path = path
    with open(file_path, "w") as file:
        json.dump(document_dicts, file, indent=4)

    print(f"All {len(documents)} langchain documents have been saved to '{file_path}'.")


def read_documents_from_file(file_path="./docs/langchain_documents.json"):
    documents = []
    try:
        with open(file_path, "r") as file:
            document_dicts = json.load(file)
            for doc_dict in document_dicts:
                document = Document(**doc_dict)
                documents.append(document)
        print(f"Successfully read {len(documents)} documents from '{file_path}'.")
        return documents
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return []
    except Exception as e:
        print(f"Error reading documents from '{file_path}': {e}")
        return []
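

# Example usage (a minimal sketch, not part of the original pipeline): this
# assumes a ./docs/ folder with the expected faq.csv and .txt files and network
# access to fimohealth.com. It scrapes the blog and static pages, persists the
# resulting langchain documents to JSON, and reads them back to verify the
# round trip before they are handed to an indexing step.
if __name__ == "__main__":
    docs = create_documents()
    store_documents(docs, path="./docs/langchain_documents.json")
    create_faq_documents()

    reloaded = read_documents_from_file("./docs/langchain_documents.json")
    print(f"Round trip complete: {len(reloaded)} documents ready for indexing.")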