import glob
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain_community.document_loaders import UnstructuredFileLoader
import json
import pandas as pd
import re
from langchain_text_splitters import HTMLHeaderTextSplitter
def retrieve_sources():
    # Base URL of the site and the blog category page to scrape
    base_url = 'https://www.fimohealth.com'
    url = 'https://www.fimohealth.com/categories/long-covid/'
    # Send the request to the page
    response = requests.get(url)
    urls = []
    # Make sure the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')
        # Collect all <a> tags that link to blog articles
        for link in soup.find_all('a'):
            href = link.get('href')
            if href and "/gesundheitsblog/" in href:
                urls.append(base_url + href)
    else:
        print('Failed to retrieve the page:', response.status_code)
    return urls
def clean_article(text):
    # Keep only the part of the text after the "Zurück" back-link, if present
    index = text.find("Zurück")
    if index != -1:
        text = text[index + len("Zurück"):].strip()
    # Replace ":in" with an escaped "*in" (gender-neutral spelling, asterisk escaped for Markdown)
    return re.sub(r':in', r'\*in', text)


def html_to_chunks():
    urls = retrieve_sources()
    docs = []
    for url in urls:
        response = requests.get(url)
        # Try decoding with different encodings until one succeeds
        content = response.text
        for encoding in ('utf-8', 'latin-1', 'ISO-8859-1'):
            try:
                content = response.content.decode(encoding)
                break
            except UnicodeDecodeError:
                continue
        # Parse the decoded content and extract the blog article section
        soup = BeautifulSoup(content, 'html.parser')
        html_string = str(soup.find_all('section', {"class": "section-blog-template-article"})[0])
        html_string = clean_article(html_string)
        # Split the article HTML into chunks along its headers
        headers_to_split_on = [
            ("h1", "Header 1"),
            ("h2", "Header 2"),
            ("h3", "Header 3"),
            ("h4", "Header 4"),
            ("h5", "Header 5"),
        ]
        html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        chunks = html_splitter.split_text(html_string)
        for chunk in chunks:
            chunk.metadata["source"] = url
            docs.append(chunk)
    return docs
def retrieve_content(url):
    # Send a GET request to the webpage
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the webpage
        soup = BeautifulSoup(response.content, 'html.parser')
        # Concatenate the text content of all <main> elements
        page_content = ""
        for element in soup.find_all('main'):
            page_content += element.text
        return clean_article(page_content)
    else:
        print('Failed to retrieve the webpage:', response.status_code)
        return None
def create_documents():
    urls = retrieve_sources()
    # Manually added pages that are not linked from the blog overview
    urls.append("https://www.fimohealth.com/patienten")
    urls.append("https://www.fimohealth.com/patienten/long-covid")
    documents = []
    for url in urls:
        content = retrieve_content(url)
        if content:
            documents.append(Document(page_content=content, metadata={"source": url}))
    # Load all text files from the docs folder into readable documents
    for file in glob.glob("./docs/*.txt"):
        loader = UnstructuredFileLoader(file)
        documents.append(loader.load()[0])
    if len(documents) > 0:
        return documents
    raise ValueError("No documents could be created.")
def create_faq_documents():
    documents = []
    df = pd.read_csv('./docs/faq.csv', sep=",")
    for _, row in df.iterrows():
        documents.append(Document(
            page_content=f"{row['Title']} \n {row['Text - de']}",
            metadata={"source": f"FAQ - {row['Bereich']}"},
        ))
    # Convert each Document object to a dictionary and write them all to a single JSON file
    document_dicts = [doc.__dict__ for doc in documents]
    file_path = './docs/faq_docs.json'
    with open(file_path, "w") as file:
        json.dump(document_dicts, file, indent=4)
    print(f"All {len(documents)} langchain documents have been saved to '{file_path}'.")
def store_documents(documents, path="./docs/langchain_documents.json"):
    # Convert each Document object to a dictionary
    document_dicts = [doc.__dict__ for doc in documents]
    # Write all documents to a single JSON file
    with open(path, "w") as file:
        json.dump(document_dicts, file, indent=4)
    print(f"All {len(documents)} langchain documents have been saved to '{path}'.")
def read_documents_from_file(file_path="./docs/langchain_documents.json"):
    documents = []
    try:
        with open(file_path, "r") as file:
            document_dicts = json.load(file)
        for doc_dict in document_dicts:
            documents.append(Document(**doc_dict))
        print(f"Successfully read {len(documents)} documents from '{file_path}'.")
        return documents
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return []
    except Exception as e:
        print(f"Error reading documents from '{file_path}': {e}")
        return []
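

# Hedged usage sketch (not part of the original module): one possible way to run the
# scraping and persistence pipeline end to end. It assumes the ./docs folder exists and
# that the fimohealth.com pages are reachable; adjust paths as needed.
if __name__ == "__main__":
    # Scrape the blog and manually curated pages, then persist everything as JSON.
    scraped_docs = create_documents()
    store_documents(scraped_docs)
    # Build the FAQ documents from ./docs/faq.csv as well.
    create_faq_documents()
    # Reload the stored documents to verify the JSON round trip.
    reloaded = read_documents_from_file()
    print(f"Round-trip check: {len(reloaded)} documents loaded.")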