timfe committed
Commit 982eb2a · 1 Parent(s): 065afe6

changed documents to section split

app.py CHANGED
@@ -7,28 +7,20 @@ from langchain_community.vectorstores import Chroma
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
-from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
 from langchain.callbacks import get_openai_callback
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnableParallel
 from langchain import VectorDBQAWithSourcesChain
 from langchain.chains import RetrievalQA
 import json
-from documents import read_documents_from_file, create_faq_documents
+from documents import read_documents_from_file, create_documents, store_documents, create_faq_documents, html_to_chunks

+#store_documents(html_to_chunks(), path="./docs/langchain_semantic_documents.json")
+#store_documents(create_documents())
 #create_faq_documents()
 OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]

-# Get all the filenames from the docs folder
-# files = glob.glob("./docs/*.txt")
-
-# Load files into readable documents
-# docs = []
-# for file in files:
-#     loader = UnstructuredFileLoader(file)
-#     docs.append(loader.load()[0])
-
-# Config

 #vectorstore = Chroma(persist_directory=directory, embedding_function=OpenAIEmbeddings())
 st.set_page_config(initial_sidebar_state="collapsed")
@@ -43,7 +35,7 @@ if data_source == 'FAQ':
     def_chunk_overlap = 0
     directory = "./chroma_db"
 elif data_source == 'Blog articles':
-    docs=read_documents_from_file()
+    docs=read_documents_from_file("./docs/langchain_semantic_documents.json")
     def_model = "gpt-3.5-turbo"
     def_temperature = 0.0
     def_k = 3
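With this change the blog articles are read from a pre-chunked JSON file instead of being re-scraped at startup; the commented-out `store_documents(html_to_chunks(), ...)` line at the top of the file is the one-off step that produced it. The bodies of `store_documents` and `read_documents_from_file` are not shown in this diff, so the following is only a minimal sketch of the JSON round-trip they plausibly implement (names, signatures, and file layout are assumptions):

```python
# Hypothetical sketch only -- the real helpers live in documents.py
# and are not part of this diff.
import json
from langchain.docstore.document import Document

def store_documents(docs, path="./docs/langchain_documents.json"):
    # Serialize each Document as a plain dict of text plus metadata.
    with open(path, "w", encoding="utf-8") as f:
        json.dump([{"page_content": d.page_content,
                    "metadata": d.metadata} for d in docs],
                  f, ensure_ascii=False)

def read_documents_from_file(path="./docs/langchain_documents.json"):
    # Rebuild Document objects from the stored dicts.
    with open(path, encoding="utf-8") as f:
        return [Document(**d) for d in json.load(f)]
```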
@@ -63,7 +55,16 @@ with st.sidebar:
     if st.toggle("Splitting", value=True, disabled=disabled):
         chunk_size = st.number_input("Chunk size", value=def_chunk_size, step=250, placeholder=def_chunk_size, disabled=disabled)  # Size in tokens of the chunks into which the files are split; also determines how many tokens are fed into the context.
         chunk_overlap = st.number_input("Chunk overlap", value=def_chunk_overlap, step=10, placeholder=def_chunk_overlap, disabled=disabled)
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            separators=[
+                "\n\n",
+                "\n",
+                " ",
+                ". "
+            ]
+        )
         splits = text_splitter.split_documents(docs)
         vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
         if chunk_size != def_chunk_size | chunk_overlap != def_chunk_overlap:
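The commit replaces the splitter's default configuration with an explicit separator hierarchy. `RecursiveCharacterTextSplitter` tries the separators in list order and uses the first one found in the text, so paragraph breaks are preferred over line breaks over spaces; note that because `" "` precedes `". "`, the sentence separator only comes into play for chunks that contain no spaces. A standalone sketch of the behavior (the example text is made up):

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Same separator hierarchy as in the hunk above, with a small chunk
# size so the splitting is visible on a short made-up text.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=40,
    chunk_overlap=0,
    separators=["\n\n", "\n", " ", ". "],
)

text = "First paragraph.\n\nSecond paragraph, long enough to be split on spaces."
for chunk in splitter.split_text(text):
    print(repr(chunk))
```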
@@ -105,7 +106,7 @@ else:
 ###
 Du gibst keine Ratschläge zur Diagnose, Behandlung oder Therapie.
 Wenn du die Antwort nicht weißt oder du keinen Kontext hast, sage dass du es nicht weißt.
-Wenn du allgemeine unspezifische Fragen gestellt bekommst, antworte, dass du die Frage nicht verstehst frage nach einer präziseren Fragestellung.
+Wenn du allgemeine unspezifische Fragen gestellt bekommst, antworte, dass du die Frage nicht verstehst und frage nach einer präziseren Fragestellung.
 Antworte immer in ganzen Sätzen und verwende korrekte Grammatik und Rechtschreibung. Antworte nur auf Deutsch.
 Antworte kurz mit maximal fünf Sätzen außer es wird von dir eine ausführlichere Antwort verlangt.
 Verwende zur Beantwortung der Frage nur den vorhandenen Kontext.
@@ -190,10 +191,16 @@ if st.session_state.clicked:
         response_stream = response_generator("Dazu kann ich dir leider keine Antwort geben. Bitte versuche eine andere Frage.")
         st.chat_message("assistant").write_stream(response_stream)
     with st.expander("Kontext ansehen"):
-        for citation in response["context"]:
-            st.write(str(citation.page_content))
-            st.write(str(citation.metadata['source']))
-            st.write(str("---")*20)
+        if len(response['context'][0].page_content) > 50:
+            for i, citation in enumerate(response["context"]):
+                print(citation.metadata)
+                st.write(f"[{i+1}] ", str(citation.page_content))
+                st.write(str(citation.metadata['source']))
+                section = ""
+                for chapter in list(citation.metadata.values())[:-1]:
+                    section += f"{chapter} "
+                st.write(f"Abschnitt: '{section}'")
+                st.write(str("---")*20)
     with st.sidebar:
         sidebar_c = st.container()
         sidebar_c.success(cb)
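The expanded citation view prefixes each chunk with its index and reconstructs the section heading from the chunk metadata; the length guard on the first chunk apparently suppresses the expander's content when only near-empty chunks were retrieved. The `[:-1]` slice relies on `"source"` being the last key inserted into the metadata, which is what `html_to_chunks` (below in documents.py) does after splitting. A small illustration with made-up metadata:

```python
# Made-up metadata in the shape HTMLHeaderTextSplitter produces, with
# the "source" key appended last by html_to_chunks:
citation_metadata = {
    "Header 1": "Artikeltitel",
    "Header 2": "Unterkapitel",
    "source": "https://example.com/blog/artikel",  # hypothetical URL
}

# All values except the trailing "source" form the section breadcrumb:
section = ""
for chapter in list(citation_metadata.values())[:-1]:
    section += f"{chapter} "
print(f"Abschnitt: '{section}'")  # Abschnitt: 'Artikeltitel Unterkapitel '
```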
@@ -204,7 +211,6 @@ if prompt := st.chat_input():
     st.chat_message("user").write(prompt)
     with get_openai_callback() as cb:
         response = rag_chain.invoke(prompt)
-    print(response)
     if response['context'] != []:
         response_stream = response_generator(response['answer'])
         st.chat_message("assistant").write_stream(response_stream)
@@ -212,10 +218,16 @@ if prompt := st.chat_input():
         response_stream = response_generator("Dazu kann ich dir leider keine Antwort geben. Bitte versuche eine andere Frage.")
         st.chat_message("assistant").write_stream(response_stream)
     with st.expander("Kontext ansehen"):
-        for citation in response["context"]:
-            st.write(str(citation.page_content))
-            st.write(str(citation.metadata['source']))
-            st.write(str("---")*20)
+        if len(response['context'][0].page_content) > 50:
+            for i, citation in enumerate(response["context"]):
+                print(citation.metadata)
+                st.write(f"[{i+1}] ", str(citation.page_content))
+                st.write(str(citation.metadata['source']))
+                section = ""
+                for chapter in list(citation.metadata.values())[:-1]:
+                    section += f"{chapter} "
+                st.write(f"Abschnitt: '{section}'")
+                st.write(str("---")*20)
     with st.sidebar:
         sidebar_c = st.container()
         sidebar_c.success(cb)
docs/langchain_documents.json CHANGED
The diff for this file is too large to render. See raw diff
 
docs/langchain_semantic_documents.json ADDED
The diff for this file is too large to render. See raw diff
 
documents.py CHANGED
@@ -6,6 +6,8 @@ from langchain.docstore.document import Document
 from langchain_community.document_loaders import UnstructuredFileLoader
 import json
 import pandas as pd
+import re
+from langchain_text_splitters import HTMLHeaderTextSplitter


 def retrieve_sources():
@@ -36,6 +38,58 @@ def retrieve_sources():
 
     return urls
 
+def html_to_chunks():
+    urls = retrieve_sources()
+    docs = []
+    for url in urls:
+        # Fetch each article page
+        response = requests.get(url)
+
+        # Try decoding with different encodings until one succeeds
+        encodings_to_try = ['utf-8', 'latin-1', 'ISO-8859-1']
+        for encoding in encodings_to_try:
+            try:
+                content = response.content.decode(encoding)
+                break
+            except UnicodeDecodeError:
+                continue
+
+        # Parse the page and keep only the article section
+        soup = BeautifulSoup(response.content, 'html.parser')
+        html_string = str(soup.find_all('section', {"class": "section-blog-template-article"})[0])
+
+        def clean_article(text):
+            # Find the index of the word "Zurück"
+            index = text.find("Zurück")
+
+            # Extract the substring that comes after "Zurück"
+            substring = text[index + len("Zurück"):].strip()
+
+            # Replace the gender colon ":in" with an escaped "*in"
+            substring = re.sub(r':in', r'\*in', substring)
+
+            return substring
+
+        html_string = clean_article(html_string)
+
+        headers_to_split_on = [
+            ("h1", "Header 1"),
+            ("h2", "Header 2"),
+            ("h3", "Header 3"),
+            ("h4", "Header 4"),
+            ("h5", "Header 5"),
+        ]
+
+        html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+        chunks = html_splitter.split_text(html_string)
+        for chunk in chunks:
+            chunk.metadata["source"] = url
+            docs.append(chunk)
+    return docs
+
 def retrieve_content(url):
     def clean_article(text):
         # Find the index of the word "Zurück"
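The new `html_to_chunks` splits each article along its HTML heading levels instead of by character count, so every chunk carries the headings it sits under as metadata. A self-contained sketch of that splitting step (the HTML fragment is made up; the real input is the scraped "section-blog-template-article" section):

```python
from langchain_text_splitters import HTMLHeaderTextSplitter

# Made-up article fragment standing in for the scraped section.
html_string = """
<h1>Artikel</h1>
<p>Einleitung.</p>
<h2>Erstes Kapitel</h2>
<p>Inhalt des ersten Kapitels.</p>
"""

splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
)
for doc in splitter.split_text(html_string):
    # Each chunk's metadata maps header names to the headings above it.
    print(doc.metadata, "->", doc.page_content)
```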
@@ -43,6 +97,10 @@ def retrieve_content(url):
 
         # Extract the substring that comes after "Zurück"
         substring = text[index + len("Zurück"):].strip()
+
+        # Replace the gender colon ":in" with an escaped "*in"
+        substring = re.sub(r':in', r'\*in', substring)
+
         return substring
     # Send a GET request to the webpage
     response = requests.get(url)
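The added substitution rewrites the German gender colon (":in") as an escaped asterisk form, presumably so that downstream markdown rendering does not read `*in` as emphasis. In a `re.sub` replacement template `\*` is not a recognized escape, so the backslash survives into the output:

```python
import re

# ":in" -> "\*in": the backslash is kept in the result string.
print(re.sub(r':in', r'\*in', "Ärzt:innen und Patient:innen"))
# Ärzt\*innen und Patient\*innen
```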
@@ -81,6 +139,11 @@ def create_documents():
     for file in files:
         loader = UnstructuredFileLoader(file)
         documents.append(loader.load()[0])
+
+    if len(documents) > 0:
+        return documents
+    else:
+        raise TypeError
 
 def create_faq_documents():
     documents = []