timfe committed
Commit 982eb2a · 1 Parent(s): 065afe6

changed documents to section split

app.py CHANGED
@@ -7,28 +7,20 @@ from langchain_community.vectorstores import Chroma
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
-from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
 from langchain.callbacks import get_openai_callback
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.runnables import RunnableParallel
 from langchain import VectorDBQAWithSourcesChain
 from langchain.chains import RetrievalQA
 import json
-from documents import read_documents_from_file, create_faq_documents
+from documents import read_documents_from_file, create_documents, store_documents, create_faq_documents, html_to_chunks

+#store_documents(html_to_chunks(), path="./docs/langchain_semantic_documents.json")
+#store_documents(create_documents())
 #create_faq_documents()
 OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]

-# Get all the filenames from the docs folder
-# files = glob.glob("./docs/*.txt")
-
-# Load files into readable documents
-# docs = []
-# for file in files:
-#     loader = UnstructuredFileLoader(file)
-#     docs.append(loader.load()[0])
-
-# Config

 #vectorstore = Chroma(persist_directory=directory, embedding_function=OpenAIEmbeddings())
 st.set_page_config(initial_sidebar_state="collapsed")
@@ -43,7 +35,7 @@ if data_source == 'FAQ':
     def_chunk_overlap = 0
     directory = "./chroma_db"
 elif data_source == 'Blog articles':
-    docs=read_documents_from_file()
+    docs=read_documents_from_file("./docs/langchain_semantic_documents.json")
     def_model = "gpt-3.5-turbo"
     def_temperature = 0.0
     def_k = 3
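With this change the blog articles are read from a pre-chunked JSON file instead of being re-scraped at startup; the commented-out `store_documents(html_to_chunks(), ...)` line at the top of the file is the one-off step that produced it. The bodies of `store_documents` and `read_documents_from_file` are not shown in this diff, so the following is only a minimal sketch of the JSON round-trip they plausibly implement (names, signatures, and file layout are assumptions):

```python
# Hypothetical sketch only -- the real helpers live in documents.py
# and are not part of this diff.
import json
from langchain.docstore.document import Document

def store_documents(docs, path="./docs/langchain_documents.json"):
    # Serialize each Document as a plain dict of text plus metadata.
    with open(path, "w", encoding="utf-8") as f:
        json.dump([{"page_content": d.page_content,
                    "metadata": d.metadata} for d in docs],
                  f, ensure_ascii=False)

def read_documents_from_file(path="./docs/langchain_documents.json"):
    # Rebuild Document objects from the stored dicts.
    with open(path, encoding="utf-8") as f:
        return [Document(**d) for d in json.load(f)]
```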
@@ -63,7 +55,16 @@ with st.sidebar:
     if st.toggle("Splitting", value=True, disabled=disabled):
         chunk_size = st.number_input("Chunk size", value=def_chunk_size, step=250, placeholder=def_chunk_size, disabled=disabled)  # Size in tokens of the chunks into which the files are split; also determines how many tokens are fed into the context.
         chunk_overlap = st.number_input("Chunk overlap", value=def_chunk_overlap, step=10, placeholder=def_chunk_overlap, disabled=disabled)
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            separators=[
+                "\n\n",
+                "\n",
+                " ",
+                ". "
+            ]
+        )
         splits = text_splitter.split_documents(docs)
         vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
         if chunk_size != def_chunk_size | chunk_overlap != def_chunk_overlap:
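The commit replaces the splitter's default configuration with an explicit separator hierarchy. `RecursiveCharacterTextSplitter` tries the separators in list order and uses the first one found in the text, so paragraph breaks are preferred over line breaks over spaces; note that because `" "` precedes `". "`, the sentence separator only comes into play for chunks that contain no spaces. A standalone sketch of the behavior (the example text is made up):

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Same separator hierarchy as in the hunk above, with a small chunk
# size so the splitting is visible on a short made-up text.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=40,
    chunk_overlap=0,
    separators=["\n\n", "\n", " ", ". "],
)

text = "First paragraph.\n\nSecond paragraph, long enough to be split on spaces."
for chunk in splitter.split_text(text):
    print(repr(chunk))
```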
@@ -105,7 +106,7 @@ else:
 ###
 Du gibst keine Ratschläge zur Diagnose, Behandlung oder Therapie.
 Wenn du die Antwort nicht weißt oder du keinen Kontext hast, sage dass du es nicht weißt.
-Wenn du allgemeine unspezifische Fragen gestellt bekommst, antworte, dass du die Frage nicht verstehst frage nach einer präziseren Fragestellung.
+Wenn du allgemeine unspezifische Fragen gestellt bekommst, antworte, dass du die Frage nicht verstehst und frage nach einer präziseren Fragestellung.
 Antworte immer in ganzen Sätzen und verwende korrekte Grammatik und Rechtschreibung. Antworte nur auf Deutsch.
 Antworte kurz mit maximal fünf Sätzen außer es wird von dir eine ausführlichere Antwort verlangt.
 Verwende zur Beantwortung der Frage nur den vorhandenen Kontext.
@@ -190,10 +191,16 @@ if st.session_state.clicked:
         response_stream = response_generator("Dazu kann ich dir leider keine Antwort geben. Bitte versuche eine andere Frage.")
         st.chat_message("assistant").write_stream(response_stream)
     with st.expander("Kontext ansehen"):
-        for citation in response["context"]:
-            st.write(str(citation.page_content))
-            st.write(str(citation.metadata['source']))
-            st.write(str("---")*20)
+        if len(response['context'][0].page_content) > 50:
+            for i, citation in enumerate(response["context"]):
+                print(citation.metadata)
+                st.write(f"[{i+1}] ", str(citation.page_content))
+                st.write(str(citation.metadata['source']))
+                section = ""
+                for chapter in list(citation.metadata.values())[:-1]:
+                    section += f"{chapter} "
+                st.write(f"Abschnitt: '{section}'")
+                st.write(str("---")*20)
     with st.sidebar:
         sidebar_c = st.container()
         sidebar_c.success(cb)
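The expanded citation view prefixes each chunk with its index and reconstructs the section heading from the chunk metadata; the length guard on the first chunk apparently suppresses the expander's content when only near-empty chunks were retrieved. The `[:-1]` slice relies on `"source"` being the last key inserted into the metadata, which is what `html_to_chunks` (below in documents.py) does after splitting. A small illustration with made-up metadata:

```python
# Made-up metadata in the shape HTMLHeaderTextSplitter produces, with
# the "source" key appended last by html_to_chunks:
citation_metadata = {
    "Header 1": "Artikeltitel",
    "Header 2": "Unterkapitel",
    "source": "https://example.com/blog/artikel",  # hypothetical URL
}

# All values except the trailing "source" form the section breadcrumb:
section = ""
for chapter in list(citation_metadata.values())[:-1]:
    section += f"{chapter} "
print(f"Abschnitt: '{section}'")  # Abschnitt: 'Artikeltitel Unterkapitel '
```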
@@ -204,7 +211,6 @@ if prompt := st.chat_input():
     st.chat_message("user").write(prompt)
     with get_openai_callback() as cb:
         response = rag_chain.invoke(prompt)
-    print(response)
     if response['context'] != []:
         response_stream = response_generator(response['answer'])
         st.chat_message("assistant").write_stream(response_stream)
@@ -212,10 +218,16 @@ if prompt := st.chat_input():
         response_stream = response_generator("Dazu kann ich dir leider keine Antwort geben. Bitte versuche eine andere Frage.")
         st.chat_message("assistant").write_stream(response_stream)
     with st.expander("Kontext ansehen"):
-        for citation in response["context"]:
-            st.write(str(citation.page_content))
-            st.write(str(citation.metadata['source']))
-            st.write(str("---")*20)
+        if len(response['context'][0].page_content) > 50:
+            for i, citation in enumerate(response["context"]):
+                print(citation.metadata)
+                st.write(f"[{i+1}] ", str(citation.page_content))
+                st.write(str(citation.metadata['source']))
+                section = ""
+                for chapter in list(citation.metadata.values())[:-1]:
+                    section += f"{chapter} "
+                st.write(f"Abschnitt: '{section}'")
+                st.write(str("---")*20)
     with st.sidebar:
         sidebar_c = st.container()
         sidebar_c.success(cb)
docs/langchain_documents.json CHANGED
The diff for this file is too large to render. See raw diff
 
docs/langchain_semantic_documents.json ADDED
The diff for this file is too large to render. See raw diff
 
documents.py CHANGED
@@ -6,6 +6,8 @@ from langchain.docstore.document import Document
 from langchain_community.document_loaders import UnstructuredFileLoader
 import json
 import pandas as pd
+import re
+from langchain_text_splitters import HTMLHeaderTextSplitter


 def retrieve_sources():
@@ -36,6 +38,58 @@ def retrieve_sources():
 
     return urls
 
+def html_to_chunks():
+    urls = retrieve_sources()
+    docs = []
+    for url in urls:
+        # Fetch each article page
+        response = requests.get(url)
+
+        # Try decoding with different encodings until one succeeds
+        encodings_to_try = ['utf-8', 'latin-1', 'ISO-8859-1']
+        for encoding in encodings_to_try:
+            try:
+                content = response.content.decode(encoding)
+                break
+            except UnicodeDecodeError:
+                continue
+
+        # Parse the page and keep only the article section
+        soup = BeautifulSoup(response.content, 'html.parser')
+        html_string = str(soup.find_all('section', {"class": "section-blog-template-article"})[0])
+
+        def clean_article(text):
+            # Find the index of the word "Zurück"
+            index = text.find("Zurück")
+
+            # Extract the substring that comes after "Zurück"
+            substring = text[index + len("Zurück"):].strip()
+
+            # Replace the gender colon ":in" with an escaped "*in"
+            substring = re.sub(r':in', r'\*in', substring)
+
+            return substring
+
+        html_string = clean_article(html_string)
+
+        headers_to_split_on = [
+            ("h1", "Header 1"),
+            ("h2", "Header 2"),
+            ("h3", "Header 3"),
+            ("h4", "Header 4"),
+            ("h5", "Header 5"),
+        ]
+
+        html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+        chunks = html_splitter.split_text(html_string)
+        for chunk in chunks:
+            chunk.metadata["source"] = url
+            docs.append(chunk)
+    return docs
+
 def retrieve_content(url):
     def clean_article(text):
         # Find the index of the word "Zurück"
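The new `html_to_chunks` splits each article along its HTML heading levels instead of by character count, so every chunk carries the headings it sits under as metadata. A self-contained sketch of that splitting step (the HTML fragment is made up; the real input is the scraped "section-blog-template-article" section):

```python
from langchain_text_splitters import HTMLHeaderTextSplitter

# Made-up article fragment standing in for the scraped section.
html_string = """
<h1>Artikel</h1>
<p>Einleitung.</p>
<h2>Erstes Kapitel</h2>
<p>Inhalt des ersten Kapitels.</p>
"""

splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
)
for doc in splitter.split_text(html_string):
    # Each chunk's metadata maps header names to the headings above it.
    print(doc.metadata, "->", doc.page_content)
```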
@@ -43,6 +97,10 @@ def retrieve_content(url):
 
         # Extract the substring that comes after "Zurück"
         substring = text[index + len("Zurück"):].strip()
+
+        # Replace the gender colon ":in" with an escaped "*in"
+        substring = re.sub(r':in', r'\*in', substring)
+
         return substring
     # Send a GET request to the webpage
     response = requests.get(url)
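The added substitution rewrites the German gender colon (":in") as an escaped asterisk form, presumably so that downstream markdown rendering does not read `*in` as emphasis. In a `re.sub` replacement template `\*` is not a recognized escape, so the backslash survives into the output:

```python
import re

# ":in" -> "\*in": the backslash is kept in the result string.
print(re.sub(r':in', r'\*in', "Ärzt:innen und Patient:innen"))
# Ärzt\*innen und Patient\*innen
```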
@@ -81,6 +139,11 @@ def create_documents():
     for file in files:
         loader = UnstructuredFileLoader(file)
         documents.append(loader.load()[0])
+
+    if len(documents) > 0:
+        return documents
+    else:
+        raise TypeError
 
 def create_faq_documents():
     documents = []