# VolkerChat / documents.py
# NOTE: Hugging Face file-viewer chrome converted to a comment header so the
# file parses as Python. Original page metadata: commit 982eb2a by timfe,
# message "changed documents to section split", raw size 6.63 kB.
import numpy
import glob
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain_community.document_loaders import UnstructuredFileLoader
import json
import pandas as pd
import re
from langchain_text_splitters import HTMLHeaderTextSplitter
def retrieve_sources():
    """Collect blog-article URLs from the Fimo Health Long-COVID category page.

    Returns:
        list[str]: Absolute URLs of every linked "/gesundheitsblog/" article.
        Empty when the page could not be fetched.
    """
    base_url = 'https://www.fimohealth.com'
    url = 'https://www.fimohealth.com/categories/long-covid/'
    # Initialized up front: the original defined `urls` only inside the
    # success branch, so a non-200 response raised NameError at `return`.
    urls = []
    response = requests.get(url)
    if response.status_code == 200:
        # Parse the page and keep only anchors that point into the blog.
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            # Anchors without an href returned None, which made the
            # original's `in` test raise TypeError — skip them instead.
            if href and "/gesundheitsblog/" in href:
                urls.append(base_url + href)
    else:
        print('Fehler beim Abrufen der Webseite:', response.status_code)
    return urls
def html_to_chunks():
    """Fetch each blog article and split it into header-delimited chunks.

    Returns:
        list: LangChain document chunks produced by HTMLHeaderTextSplitter;
        each chunk's metadata carries the article URL under "source".
    """

    def clean_article(text):
        # Drop everything up to and including the "Zurück" back-link, then
        # escape ":in" gender suffixes as "\*in" (markdown-safe asterisk).
        # The original applied re.sub to the *untrimmed* text, silently
        # discarding the "Zurück" trim — fixed here.
        index = text.find("Zurück")
        if index != -1:  # guard: find() == -1 would mis-slice from char 5
            text = text[index + len("Zurück"):].strip()
        return re.sub(r':in', r'\*in', text)

    # Split on every heading level used by the blog template.
    headers_to_split_on = [
        ("h1", "Header 1"),
        ("h2", "Header 2"),
        ("h3", "Header 3"),
        ("h4", "Header 4"),
        ("h5", "Header 5"),
    ]
    # Hoisted out of the loop: the splitter is stateless across articles.
    html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    docs = []
    for url in retrieve_sources():
        response = requests.get(url)
        # NOTE: the original also tried response.content.decode() against a
        # list of encodings but never used the result; BeautifulSoup detects
        # the encoding itself from the raw bytes, so that dead code is gone.
        soup = BeautifulSoup(response.content, 'html.parser')
        sections = soup.find_all('section', {"class": "section-blog-template-article"})
        if not sections:
            # Layout changed or article removed — skip instead of IndexError.
            continue
        html_string = clean_article(str(sections[0]))
        for chunk in html_splitter.split_text(html_string):
            chunk.metadata["source"] = url
            docs.append(chunk)
    return docs
def retrieve_content(url):
    """Fetch *url* and return the cleaned text of its <main> elements.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        str | None: Cleaned page text, or None when the request fails
        (the failure is also reported via print).
    """

    def clean_article(text):
        # Drop everything up to and including the "Zurück" back-link.
        # The original applied re.sub to the untrimmed input, discarding
        # this trim — fixed here.
        index = text.find("Zurück")
        if index != -1:  # guard: find() == -1 would mis-slice from char 5
            text = text[index + len("Zurück"):].strip()
        # Escape ":in" gender suffixes as "\*in"; the raw replacement
        # string avoids the original's invalid-escape DeprecationWarning.
        return re.sub(r':in', r'\*in', text)

    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Concatenate the text of all <main> elements on the page.
        page_content = "".join(element.text for element in soup.find_all('main'))
        return clean_article(page_content)
    print('Failed to retrieve the webpage')
    return None
def create_documents():
    """Build Documents from all blog/patient pages plus local ./docs/*.txt files.

    Returns:
        list[Document]: one document per successfully fetched URL and per
        local text file.

    Raises:
        ValueError: when no documents could be collected. (The original
        `return TypeError` handed callers the exception *class* instead of
        signalling an error.)
    """
    urls = retrieve_sources()
    # Pages not linked from the category overview, added manually.
    urls.append("https://www.fimohealth.com/patienten")
    urls.append("https://www.fimohealth.com/patienten/long-covid")

    documents = []
    for url in urls:
        content = retrieve_content(url)
        # retrieve_content returns None on HTTP failure; skip those pages
        # instead of creating a Document with page_content=None.
        if content:
            documents.append(Document(page_content=content, metadata={"source": url}))

    # Load every local text file into a readable document.
    for file in glob.glob("./docs/*.txt"):
        loader = UnstructuredFileLoader(file)
        documents.append(loader.load()[0])

    if not documents:
        raise ValueError("No documents could be created.")
    return documents
def create_faq_documents():
    """Convert ./docs/faq.csv rows into Documents and save them as JSON.

    Each CSV row becomes a Document combining the 'Title' and 'Text - de'
    columns; the 'Bereich' column labels the source. The result is written
    to ./docs/faq_docs.json.
    """
    df = pd.read_csv('./docs/faq.csv', sep=",")
    documents = [
        Document(
            page_content=f"{row['Title']} \n {row['Text - de']}",
            metadata={"source": f"FAQ - {row['Bereich']}"},
        )
        for _, row in df.iterrows()
    ]
    # Delegate serialization to the shared helper instead of duplicating
    # its JSON-dump logic (the original repeated it verbatim).
    store_documents(documents, path='./docs/faq_docs.json')
def store_documents(documents, path="./docs/langchain_documents.json"):
    """Serialize documents to a JSON file as a list of their ``__dict__``s.

    Args:
        documents: objects whose ``__dict__`` is JSON-serializable
            (e.g. langchain Document instances).
        path: destination file path.
    """
    document_dicts = [doc.__dict__ for doc in documents]
    # Explicit UTF-8 plus ensure_ascii=False keeps the German umlauts in
    # the content human-readable and makes the output independent of the
    # platform's locale default encoding.
    with open(path, "w", encoding="utf-8") as file:
        json.dump(document_dicts, file, indent=4, ensure_ascii=False)
    print(f"All {len(documents)} langchain documents have been saved to '{path}'.")
def read_documents_from_file(file_path="./docs/langchain_documents.json"):
    """Load Documents previously written by store_documents().

    Args:
        file_path: JSON file containing a list of document dicts.

    Returns:
        list[Document]: the deserialized documents, or an empty list when
        the file is missing or unreadable (errors are reported via print).
    """
    try:
        # encoding matches the UTF-8 writer side (store_documents).
        with open(file_path, "r", encoding="utf-8") as file:
            document_dicts = json.load(file)
        documents = [Document(**doc_dict) for doc_dict in document_dicts]
        print(f"Successfully read {len(documents)} documents from '{file_path}'.")
        return documents
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return []
    except Exception as e:
        # Broad catch is deliberate: callers treat any read failure
        # (bad JSON, bad schema) as "no documents available".
        print(f"Error reading documents from '{file_path}': {e}")
        return []