import numpy
import glob
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain_community.document_loaders import UnstructuredFileLoader
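
# This script scrapes Long-COVID blog articles from fimohealth.com,
# extracts their main text, and collects them (together with local
# ./docs/*.txt files) into a list of LangChain Document objects.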
def retrieve_sources():
    # Base URL of the site to scrape and the blog category page
    base_url = 'https://www.fimohealth.com'
    url = 'https://www.fimohealth.com/categories/long-covid/'

    urls = []

    # Send the request to the website
    response = requests.get(url)

    # Make sure the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all <a> tags and keep the links that point to blog articles
        links = soup.find_all('a')
        for link in links:
            href = link.get('href')
            if href and "/gesundheitsblog/" in href:
                urls.append(base_url + href)
    else:
        print('Failed to retrieve the webpage:', response.status_code)

    return urls
def retrieve_content(url):
    def clean_article(text):
        # The article text starts after the "Zurück" ("Back") navigation
        # label; if the marker is missing, keep the full text
        index = text.find("Zurück")
        if index == -1:
            return text.strip()
        return text[index + len("Zurück"):].strip()

    # Send a GET request to the webpage
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the webpage
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the <main> elements and concatenate their text content
        main_elements = soup.find_all('main')
        page_content = ""
        for element in main_elements:
            page_content += element.text

        return clean_article(page_content)
    else:
        print('Failed to retrieve the webpage:', url)
        return ""
urls = retrieve_sources()

# Manually added pages that are not linked from the blog overview
urls.append("https://www.fimohealth.com/patienten")
urls.append("https://www.fimohealth.com/patienten/long-covid")

documents = []
for url in urls:
    content = retrieve_content(url)
    if content:
        documents.append(Document(page_content=content, metadata={"source": url}))
# Get all the filenames from the docs folder
files = glob.glob("./docs/*.txt")

# Load files into readable documents
for file in files:
    loader = UnstructuredFileLoader(file)
    documents.append(loader.load()[0])
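
# A typical next step would be to split these documents into chunks for
# retrieval; the splitter and parameters below are a sketch, not part of
# the pipeline above.
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# chunks = splitter.split_documents(documents)

print(f"Collected {len(documents)} documents")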