Spaces:

timFimo
/

VolkerChat

Runtime error

App Files Files

VolkerChat / documents.py

timfe

fixed minor document

5c2f552 over 1 year ago

raw

history blame

4.55 kB

	import numpy
	import glob
	import requests
	from bs4 import BeautifulSoup
	from langchain.docstore.document import Document
	from langchain_community.document_loaders import UnstructuredFileLoader
	import json
	import pandas as pd


	def retrieve_sources():
	    """Collect the blog article URLs linked from the Long-COVID category page.

	    Fetches the category page, scans every ``<a>`` tag and keeps the links
	    whose ``href`` contains ``/gesundheitsblog/``. Each kept ``href`` is
	    appended to the site base URL (this assumes the page uses site-relative
	    links -- confirm against the live markup).

	    Returns:
	        A list of article URLs. The list is empty when the page did not
	        answer with status 200 or when no matching link was found.
	    """
	    base_url = 'https://www.fimohealth.com'
	    url = 'https://www.fimohealth.com/categories/long-covid/'

	    # Created before the request so the failure branch can still return a
	    # list instead of referencing an unbound local.
	    urls = []

	    # A timeout keeps a stalled server from blocking the caller forever.
	    response = requests.get(url, timeout=30)

	    # Anything other than 200 is reported and treated as "no sources".
	    if response.status_code != 200:
	        print('Fehler beim Abrufen der Webseite:', response.status_code)
	        return urls

	    soup = BeautifulSoup(response.text, 'html.parser')

	    for link in soup.find_all('a'):
	        href = link.get('href')
	        # Anchors without an href yield None, which cannot be searched.
	        if href and "/gesundheitsblog/" in href:
	            urls.append(base_url + href)

	    return urls

	def retrieve_content(url):
	    """Download a page and return the cleaned text of its ``<main>`` elements.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The concatenated text of all ``<main>`` elements, with everything up
	        to and including the first "Zurück" removed and surrounding
	        whitespace stripped. ``None`` when the response status is not 200.
	    """
	    def clean_article(text):
	        # Keep only what follows the first "Zurück" marker. If the marker is
	        # missing the text is kept whole; slicing from a failed find() (-1)
	        # would silently cut off the first characters of the article.
	        _, marker, remainder = text.partition("Zurück")
	        return (remainder if marker else text).strip()

	    # A timeout keeps a stalled server from blocking the caller forever.
	    response = requests.get(url, timeout=30)

	    if response.status_code != 200:
	        print('Failed to retrieve the webpage')
	        return None

	    soup = BeautifulSoup(response.content, 'html.parser')

	    # Join the text of every <main> element in document order.
	    page_content = "".join(element.text for element in soup.find_all('main'))

	    return clean_article(page_content)

	def create_documents():
	    """Build langchain documents from the scraped pages and local text files.

	    The web part uses the URLs from ``retrieve_sources()`` plus two manually
	    added pages; each page becomes one ``Document`` whose ``source``
	    metadata is its URL. The file part loads every ``./docs/*.txt`` file
	    with ``UnstructuredFileLoader`` and keeps the first document it yields.

	    Returns:
	        The list of all created documents, web pages first, then files.
	    """
	    urls = retrieve_sources()
	    # Pages added manually in addition to the scraped ones.
	    urls.append("https://www.fimohealth.com/patienten")
	    urls.append("https://www.fimohealth.com/patienten/long-covid")

	    documents = []
	    for url in urls:
	        content = retrieve_content(url)
	        if content is None:
	            # retrieve_content returns None when the download failed; there
	            # is no text to build a document from, so skip this page.
	            print(f"Skipping '{url}': no content could be retrieved.")
	            continue
	        documents.append(Document(page_content=content, metadata={"source": url}))

	    # Load every text file from the docs folder as a document.
	    for file in glob.glob("./docs/*.txt"):
	        loader = UnstructuredFileLoader(file)
	        documents.append(loader.load()[0])

	    # Hand the result back so callers can store or index it.
	    return documents

	def create_faq_documents():
	    """Turn the FAQ CSV into langchain documents and save them as JSON.

	    Reads ``./docs/faq.csv``, builds one ``Document`` per row from the
	    ``Title``, ``Text - de`` and ``Bereich`` columns, and writes the
	    documents' attribute dictionaries to ``./docs/faq_docs.json``.
	    """
	    faq_table = pd.read_csv('./docs/faq.csv', sep=",")

	    # One document per row: title and German text form the content, the
	    # "Bereich" column labels the source.
	    documents = [
	        Document(
	            page_content=f"{row['Title']} \n {row['Text - de']}",
	            metadata={"source": f"FAQ - {row['Bereich']}"},
	        )
	        for _, row in faq_table.iterrows()
	    ]

	    # Each document is serialised through its attribute dictionary.
	    payload = []
	    for entry in documents:
	        payload.append(entry.__dict__)

	    file_path = './docs/faq_docs.json'
	    with open(file_path, "w") as out:
	        json.dump(payload, out, indent=4)

	    print(f"All {len(documents)} langchain documents have been saved to '{file_path}'.")

	def store_documents(documents, path="./docs/langchain_documents.json"):
	    """Write the attribute dictionaries of ``documents`` to one JSON file.

	    Args:
	        documents: Objects whose ``__dict__`` is JSON-serialisable.
	        path: Destination file; it is overwritten if it already exists.
	    """
	    # Collect the dictionaries before opening the file, so a bad entry
	    # fails before anything is written.
	    serialisable = []
	    for document in documents:
	        serialisable.append(document.__dict__)

	    with open(path, "w") as handle:
	        json.dump(serialisable, handle, indent=4)

	    print(f"All {len(documents)} langchain documents have been saved to '{path}'.")

	def read_documents_from_file(file_path="./docs/langchain_documents.json"):
	    """Load documents from a JSON file of attribute dictionaries.

	    Args:
	        file_path: JSON file holding a list of dictionaries, each passed as
	            keyword arguments to ``Document``.

	    Returns:
	        The list of reconstructed documents. An empty list when the file is
	        missing or any other error occurs; a message is printed either way.
	    """
	    try:
	        with open(file_path, "r") as handle:
	            raw_entries = json.load(handle)
	        # Rebuild each document from its stored attribute dictionary.
	        loaded = [Document(**entry) for entry in raw_entries]
	        print(f"Successfully read {len(loaded)} documents from '{file_path}'.")
	        return loaded
	    except FileNotFoundError:
	        print(f"File '{file_path}' not found.")
	    except Exception as error:
	        # Best effort: any other failure is reported, not raised.
	        print(f"Error reading documents from '{file_path}': {error}")
	    return []