Spaces:

timFimo
/

VolkerChat

Runtime error

App Files Files

VolkerChat / documents.py

timfe

added more articles, improved citations

e5f6996 over 1 year ago

raw

history blame

2.77 kB

	import numpy
	import glob
	import requests
	from bs4 import BeautifulSoup
	from langchain.docstore.document import Document
	from langchain_community.document_loaders import UnstructuredFileLoader



	def retrieve_sources():
	    """Collect the blog-article URLs linked from the Long-COVID category page.

	    Fetches the category overview page, scans every ``<a>`` tag and keeps
	    the links whose ``href`` contains ``/gesundheitsblog/``.

	    Returns:
	        A list of absolute article URLs in page order. The list is empty
	        when the page does not answer with status 200 (an error line is
	        printed in that case).

	    Raises:
	        requests.exceptions.RequestException: If the request itself fails,
	            e.g. on a connection error or when the timeout expires.
	    """
	    # Function-scope import keeps the block self-contained (stdlib only).
	    from urllib.parse import urljoin

	    # Category overview page that lists the articles to index.
	    url = 'https://www.fimohealth.com/categories/long-covid/'

	    # Bound before the request so the failure path returns an empty list
	    # instead of raising UnboundLocalError at the final return.
	    urls = []

	    # A timeout keeps an unresponsive server from hanging the caller.
	    response = requests.get(url, timeout=30)

	    # Only a 200 response carries a page worth parsing.
	    if response.status_code != 200:
	        print('Fehler beim Abrufen der Webseite:', response.status_code)
	        return urls

	    soup = BeautifulSoup(response.text, 'html.parser')
	    for link in soup.find_all('a'):
	        href = link.get('href')
	        # Anchors without an href attribute yield None; skip them rather
	        # than failing on the substring test.
	        if href and "/gesundheitsblog/" in href:
	            # urljoin resolves root-relative hrefs against the site and
	            # leaves already-absolute hrefs untouched.
	            urls.append(urljoin(url, href))

	    return urls

	def retrieve_content(url):
	    """Download a page and return the cleaned text of its ``<main>`` elements.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The concatenated text of every ``<main>`` element, reduced to what
	        follows the first "Zurück" marker and stripped of surrounding
	        whitespace. When the marker is absent the full stripped text is
	        returned. Returns ``None`` when the response status is not 200
	        (a message is printed in that case).

	    Raises:
	        requests.exceptions.RequestException: If the request itself fails,
	            e.g. on a connection error or when the timeout expires.
	    """
	    def clean_article(text):
	        """Return the part of *text* after the first "Zurück", stripped."""
	        # partition reports whether the marker was found; the previous
	        # find()-based slice treated "not found" (-1) as a real offset and
	        # cut off the first five characters of the text.
	        # NOTE(review): "Zurück" is presumably the back link rendered above
	        # the article body - confirm against the page markup.
	        _, marker, remainder = text.partition("Zurück")
	        if not marker:
	            return text.strip()
	        return remainder.strip()

	    # A timeout keeps an unresponsive server from hanging the caller.
	    response = requests.get(url, timeout=30)

	    if response.status_code != 200:
	        print('Failed to retrieve the webpage')
	        # Explicit None so the failure contract is visible to callers.
	        return None

	    soup = BeautifulSoup(response.content, 'html.parser')

	    # Join the text of all <main> elements in document order.
	    page_content = "".join(element.text for element in soup.find_all('main'))

	    return clean_article(page_content)

	# Build the module-level ``documents`` list: scraped web articles first,
	# then the local text files from ./docs.
	urls = retrieve_sources()
	# Pages added by hand because the category page does not link to them.
	urls.append("https://www.fimohealth.com/patienten")
	urls.append("https://www.fimohealth.com/patienten/long-covid")

	documents = []
	for url in urls:
	    content = retrieve_content(url)
	    # retrieve_content returns None when the fetch fails, and Document
	    # expects a string page_content; skip such sources (and empty pages,
	    # which carry nothing to index) instead of aborting the whole import.
	    if not content:
	        print('Skipping source without content:', url)
	        continue
	    documents.append(Document(page_content=content, metadata={"source": url}))

	# Get all the filenames from the docs folder
	files = glob.glob("./docs/*.txt")

	# Load files into readable documents; load() returns a list and only its
	# first document is kept.
	for file in files:
	    loader = UnstructuredFileLoader(file)
	    documents.append(loader.load()[0])