Spaces:
Runtime error
Runtime error
File size: 2,773 Bytes
e5f6996 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import numpy
import glob
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain_community.document_loaders import UnstructuredFileLoader
def retrieve_sources():
    """Collect blog-article URLs from the Fimo Health Long-Covid category page.

    Scrapes the category page, keeps every anchor whose href contains
    "/gesundheitsblog/", and prefixes it with the site's base URL.

    Returns:
        list[str]: Absolute article URLs; empty if the page could not be
        fetched (a German error message is printed in that case).
    """
    base_url = 'https://www.fimohealth.com'
    url = 'https://www.fimohealth.com/categories/long-covid/'
    # Initialize up front: previously `urls` was only bound inside the
    # success branch, so a non-200 response raised NameError at `return`.
    urls = []
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            # <a> tags without an href attribute yield None -- guard before
            # the substring test, which would otherwise raise TypeError.
            if href and "/gesundheitsblog/" in href:
                urls.append(base_url + href)
    else:
        print('Fehler beim Abrufen der Webseite:', response.status_code)
    return urls
def retrieve_content(url):
    """Fetch a page and return the text of its <main> elements, with the
    navigation header preceding the "Zurück" (back) label stripped off.

    Args:
        url (str): Absolute URL of the article page to download.

    Returns:
        str | None: Cleaned article text, or None if the HTTP request did
        not return status 200 (an error message is printed in that case).
    """
    def clean_article(text):
        # Drop everything up to and including the "Zurück" nav label.
        index = text.find("Zurück")
        if index == -1:
            # Marker absent: the old code sliced from -1 + len("Zurück") = 5,
            # silently discarding the first five characters. Keep the full
            # text instead.
            return text.strip()
        return text[index + len("Zurück"):].strip()

    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Concatenate the text of every <main> element on the page.
        page_content = "".join(element.text for element in soup.find_all('main'))
        return clean_article(page_content)
    else:
        print('Failed to retrieve the webpage')
        # Explicit None keeps the original failure contract visible.
        return None
urls = retrieve_sources()
# manually added
urls.append("https://www.fimohealth.com/patienten")
urls.append("https://www.fimohealth.com/patienten/long-covid")

documents = []
for url in urls:
    content = retrieve_content(url)
    # retrieve_content returns None when the HTTP request fails; skip those
    # pages rather than creating a Document with page_content=None, which
    # would break downstream consumers.
    if content:
        documents.append(Document(page_content=content, metadata={"source": url}))

# Get all the filenames from the docs folder
files = glob.glob("./docs/*.txt")
# Load files into readable documents
for file in files:
    loader = UnstructuredFileLoader(file)
    documents.append(loader.load()[0])
|