Asankhaya Sharma commited on
Commit
4e00df7
·
1 Parent(s): a197a48

initial commit

Browse files
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app/Dockerfile
FROM python:3.11-slim

WORKDIR /app

# System packages: compiler toolchain for building wheels, curl for the
# HEALTHCHECK below, git for VCS-based requirements.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the sources so that this layer is
# reused from the build cache whenever only application code changes.
COPY requirements.txt /app/requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

COPY . /app

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

# Mount .streamlit folder to load config.toml and secrets.toml
VOLUME [ "/root/.streamlit" ]

ENTRYPOINT ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"]
brain.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import streamlit as st
3
+
4
+
5
def brain(supabase):
    """Render the list of stored documents with a delete button for each.

    Queries the ``documents`` table for the ``file_name`` and ``file_size``
    metadata of every row, collapses identical (name, size) pairs, shows
    summary metrics, and lists the documents from largest to smallest.

    Parameters
    ----------
    supabase
        The supabase client used to query (and, on click, delete from)
        the ``documents`` table.
    """

    def size_of(doc):
        # `metadata->>file_size` is null for rows stored without that key;
        # treat those as 0 bytes instead of crashing in int(None).
        try:
            return int(doc['size'])
        except (TypeError, ValueError):
            return 0

    ## List all documents
    response = supabase.table("documents").select(
        "name:metadata->>file_name, size:metadata->>file_size", count="exact"
    ).execute()

    documents = response.data  # Access the data from the response

    # Rows sharing the same (name, size) pair are collapsed into one entry:
    # dict -> tuple of items -> set (dedupe) -> dict again.
    unique_data = [dict(t) for t in set(tuple(d.items()) for d in documents)]

    # Sort the list of documents by size in decreasing order
    unique_data.sort(key=size_of, reverse=True)

    # Display some metrics at the top of the page
    metric_docs, metric_size = st.columns(2)
    metric_docs.metric(label="Total Documents", value=len(unique_data))
    metric_size.metric(
        label="Total Size (bytes)",
        value=sum(size_of(doc) for doc in unique_data),
    )

    for document in unique_data:
        # The (name, size) pair is unique after deduplication, so using both
        # in the key avoids duplicate widget keys when two documents share a
        # name but differ in size.
        button_key = f"delete_{document['name']}_{document['size']}"

        # Display the document name, size and the delete button on the same line
        name_col, button_col, _spacer = st.columns([3, 1, 1])
        name_col.markdown(f"**{document['name']}** ({size_of(document)} bytes)")

        if button_col.button('❌', key=button_key):
            delete_document(supabase, document['name'])
32
+
33
def delete_document(supabase, document_name):
    """Delete every row whose ``file_name`` metadata equals `document_name`.

    Writes a confirmation to the page when the delete call returned at least
    one row, and a failure notice otherwise.
    """
    query = supabase.table("documents").delete().match(
        {"metadata->>file_name": document_name}
    )
    result = query.execute()

    # The response data holds the affected rows; an empty list means nothing matched.
    nothing_deleted = len(result.data) == 0
    if nothing_deleted:
        st.write(f"❌ {document_name} was not deleted.")
        return
    st.write(f"✂️ {document_name} was deleted.")
components_keys.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """Store streamlit component keys"""
2
+
3
class ComponentsKeys:
    """Namespace for the `key` values given to Streamlit widgets."""

    # Key shared by the file uploader widget and the uploaded-file manager.
    FILE_UPLOADER = "file_uploader"
explorer.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
def view_document(supabase):
    """List a short preview button for every row of the ``documents`` table.

    The feature is a placeholder: clicking a button currently does nothing.

    Parameters
    ----------
    supabase
        The supabase client used to read the ``content`` column.
    """
    # Get the documents from the database
    response = supabase.table("documents").select("content").execute()
    st.write("**This feature is in active development**")

    # Display a list of elements from the documents
    for index, document in enumerate(response.data):
        preview = document['content'][:50].replace("\n", " ")
        # Two chunks can share the same first 50 characters; without an
        # explicit key Streamlit would see two identical widgets and raise a
        # duplicate-widget error.
        # TODO: display the content of the document when its button is clicked.
        st.button(preview, key=f"view_document_{index}")
files.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import (
3
+ Any,
4
+ Union,
5
+ )
6
+ import zipfile
7
+ import streamlit as st
8
+ from streamlit.runtime.uploaded_file_manager import (
9
+ UploadedFile,
10
+ UploadedFileRec,
11
+ UploadedFileManager,
12
+ )
13
+ from streamlit.runtime.scriptrunner import get_script_run_ctx
14
+ from supabase.client import Client
15
+ from langchain.vectorstores.supabase import SupabaseVectorStore
16
+ from components_keys import ComponentsKeys
17
+ from loaders.audio import process_audio
18
+ from loaders.txt import process_txt
19
+ from loaders.csv import process_csv
20
+ from loaders.markdown import process_markdown
21
+ from loaders.pdf import process_pdf
22
+ from loaders.html import (
23
+ create_html_file,
24
+ delete_tempfile,
25
+ get_html,
26
+ process_html,
27
+ )
28
+ from loaders.powerpoint import process_powerpoint
29
+ from loaders.docx import process_docx
30
+ from utils import compute_sha1_from_content
31
+
32
+
33
# Script-run context and uploaded-file manager, created once at import time;
# both are used when wrapping non-Streamlit files into `UploadedFile` objects.
ctx = get_script_run_ctx()
manager = UploadedFileManager()

# Maps a file extension to the loader function that ingests that file type.
file_processors = {
    ".txt": process_txt,
    ".csv": process_csv,
    ".md": process_markdown,
    ".markdown": process_markdown,
    # Every audio/video container goes through the same audio loader.
    **dict.fromkeys(
        (".m4a", ".mp3", ".webm", ".mp4", ".mpga", ".wav", ".mpeg"),
        process_audio,
    ),
    ".pdf": process_pdf,
    ".html": process_html,
    ".pptx": process_powerpoint,
    ".docx": process_docx,
}
52
+
53
def file_uploader(supabase, vector_store):
    """Render the upload widget and ingest the selected file(s) on demand.

    Multiple files and ``.zip`` archives are only offered when
    ``st.secrets.self_hosted`` is ``"true"``: a zip can hold several files,
    which would circumvent the one-file-at-a-time limit of the demo.
    """
    multi_upload = st.secrets.self_hosted == "true"
    allowed_extensions = list(file_processors.keys())
    if multi_upload:
        allowed_extensions += [".zip"]

    files = st.file_uploader(
        "**Upload a file**",
        accept_multiple_files=multi_upload,
        type=allowed_extensions,
        key=ComponentsKeys.FILE_UPLOADER,
    )
    if st.secrets.self_hosted == "false":
        st.markdown("**In demo mode, the max file size is 1MB**")

    if not st.button("Add to Database"):
        return

    if isinstance(files, UploadedFile):
        # Single file upload
        filter_file(files, supabase, vector_store)
    elif isinstance(files, list):
        # Multiple files upload
        for uploaded in files:
            filter_file(uploaded, supabase, vector_store)
78
+
79
def file_already_exists(supabase, file):
    """Return True when a row with this file's content SHA-1 is already stored."""
    digest = compute_sha1_from_content(file.getvalue())
    lookup = (
        supabase.table("documents")
        .select("id")
        .eq("metadata->>file_sha1", digest)
    )
    matches = lookup.execute().data
    return len(matches) > 0
83
+
84
def file_to_uploaded_file(file: Any) -> Union[None, UploadedFile]:
    """Wrap a file-like object in a streamlit `UploadedFile`.

    Files extracted from a zip archive are not `UploadedFile` instances;
    wrapping them lets the rest of the app handle them exactly like files
    coming from the file uploader widget.

    Parameters
    ----------
    file : Any
        A file-like object exposing ``name`` and ``read()``.

    Returns
    -------
    Union[None, UploadedFile]
        The wrapped file, or `None` when no script run context is available.
    """
    if ctx is None:
        print("script context not found, skipping uploading file:", file.name)
        return None

    name = file.name
    extension = os.path.splitext(name)[-1]
    payload = file.read()

    # `None` is passed as the record ID so the file manager assigns one.
    # Reference: https://github.com/streamlit/streamlit/blob/9a6ce804b7977bdc1f18906d1672c45f9a9b3398/lib/streamlit/runtime/uploaded_file_manager.py#LL98C6-L98C6
    record = manager.add_file(
        ctx.session_id,
        ComponentsKeys.FILE_UPLOADER,
        UploadedFileRec(None, name, extension, payload),
    )
    return UploadedFile(record)
118
+
119
def filter_zip_file(
    file: UploadedFile,
    supabase: Client,
    vector_store: SupabaseVectorStore,
) -> None:
    """Unzip the zip file then filter each unzipped file.

    Directory entries inside the archive are skipped: they carry no content
    and would otherwise be reported as empty files.

    Parameters
    ----------
    file : UploadedFile
        The uploaded file from the file uploader.
    supabase : Client
        The supabase client.
    vector_store : SupabaseVectorStore
        The vector store in the database.
    """

    with zipfile.ZipFile(file, "r") as z:
        for entry in z.infolist():
            if entry.is_dir():
                continue
            with z.open(entry, "r") as f:
                filter_file(f, supabase, vector_store)
141
+
142
def filter_file(file, supabase, vector_store):
    """Validate one file and hand it to the loader matching its extension.

    Parameters
    ----------
    file
        An `UploadedFile`, or any file-like object with ``name`` and
        ``read()`` (it is wrapped into an `UploadedFile` first).
    supabase
        The supabase client, used for the duplicate check and, in demo
        mode, passed to the loader as ``stats_db``.
    vector_store
        The vector store the loader adds documents to.

    Returns
    -------
    bool
        True when the file (or zip archive) was handed to a loader, False
        when it was rejected or could not be wrapped.
    """
    # Streamlit file uploads are of type `UploadedFile` which has the
    # necessary methods and attributes for this app to work.
    if not isinstance(file, UploadedFile):
        original_name = getattr(file, "name", "file")
        file = file_to_uploaded_file(file)
        if file is None:
            # The conversion returns None when no script run context exists;
            # bail out instead of failing on `file.name` below.
            st.write(f"❌ {original_name} could not be processed.")
            return False

    file_extension = os.path.splitext(file.name)[-1]
    if file_extension == ".zip":
        filter_zip_file(file, supabase, vector_store)
        return True

    if file_already_exists(supabase, file):
        st.write(f"😎 {file.name} is already in the database.")
        return False

    if file.size < 1:
        st.write(f"💨 {file.name} is empty.")
        return False

    processor = file_processors.get(file_extension)
    if processor is None:
        st.write(f"❌ {file.name} is not a valid file type.")
        return False

    # Usage statistics are only recorded in demo (non self-hosted) mode.
    stats_db = supabase if st.secrets.self_hosted == "false" else None
    # NOTE(review): the loaders in this repo signal an over-size rejection
    # only via st.error, so the success line below is still written in that
    # case — confirm whether loaders should report failure to this caller.
    processor(vector_store, file, stats_db=stats_db)
    st.write(f"✅ {file.name} ")
    return True
171
+
172
def url_uploader(supabase, vector_store):
    """Render the URL form and ingest the page behind the submitted URL.

    The page is downloaded, written to a temporary ``.html`` file, passed
    through `filter_file`, and the temporary file is removed afterwards.
    Nothing is fetched when the session is flagged as over its usage limit.
    """
    url = st.text_area("**Add an url**",placeholder="https://www.quivr.app")
    button = st.button("Add the URL to the database")

    if not button:
        return

    if st.session_state["overused"]:
        st.write("You have reached your daily limit. Please come back later or self host the solution.")
        return

    html = get_html(url)
    if not html:
        st.write(f"❌ Failed to access to {url} .")
        return

    st.write(f"Getting content ... {url} ")
    try:
        file, temp_file_path = create_html_file(url, html)
    except UnicodeEncodeError as e:
        # Retrying the identical call would only raise the same error again,
        # so report the problem and stop.
        st.write(f"❌ Error encoding character: {e}")
        return

    ret = filter_file(file, supabase, vector_store)
    delete_tempfile(temp_file_path, url, ret)
loaders/__init__.py ADDED
File without changes
loaders/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (144 Bytes). View file
 
loaders/__pycache__/audio.cpython-310.pyc ADDED
Binary file (2.39 kB). View file
 
loaders/__pycache__/common.cpython-310.pyc ADDED
Binary file (1.69 kB). View file
 
loaders/__pycache__/csv.cpython-310.pyc ADDED
Binary file (425 Bytes). View file
 
loaders/__pycache__/docx.cpython-310.pyc ADDED
Binary file (422 Bytes). View file
 
loaders/__pycache__/html.cpython-310.pyc ADDED
Binary file (1.97 kB). View file
 
loaders/__pycache__/markdown.cpython-310.pyc ADDED
Binary file (440 Bytes). View file
 
loaders/__pycache__/pdf.cpython-310.pyc ADDED
Binary file (416 Bytes). View file
 
loaders/__pycache__/powerpoint.cpython-310.pyc ADDED
Binary file (448 Bytes). View file
 
loaders/__pycache__/txt.cpython-310.pyc ADDED
Binary file (415 Bytes). View file
 
loaders/audio.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ from io import BytesIO
4
+ import time
5
+ import openai
6
+ import streamlit as st
7
+ from langchain.document_loaders import TextLoader
8
+ from langchain.embeddings.openai import OpenAIEmbeddings
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from utils import compute_sha1_from_content
11
+ from langchain.schema import Document
12
+ from stats import add_usage
13
+
14
+
15
+
16
def _transcribe_audio(api_key, audio_file, stats_db):
    """Send an uploaded audio file to the Whisper API and return the result.

    The audio bytes are copied into a temporary file carrying the original
    extension before being sent. In demo mode (``self_hosted == "false"``)
    a usage row is recorded first.

    NOTE(review): this calls ``openai.Audio.translate``, not ``transcribe`` —
    confirm that the translation endpoint is the intended one.
    """
    openai.api_key = api_key

    audio_payload = audio_file.read()
    # Keep the original extension on the temporary file.
    suffix = os.path.splitext(audio_file.name)[-1]

    with tempfile.NamedTemporaryFile(delete=True, suffix=suffix) as tmp:
        tmp.write(audio_payload)
        tmp.seek(0)  # rewind so the API client reads from the start

        if st.secrets.self_hosted == "false":
            add_usage(stats_db, "embedding", "audio", metadata={"file_name": audio_file.name,"file_type": suffix})

        return openai.Audio.translate("whisper-1", tmp)
37
+
38
def process_audio(vector_store, file_name, stats_db):
    """Transcribe an uploaded audio file and index the transcript.

    Parameters
    ----------
    vector_store
        The vector store the transcript chunks are added to.
    file_name
        The uploaded audio file (despite its name, a file object with
        ``size``, ``name`` and ``read()``).
    stats_db
        The supabase client used to record usage in demo mode.

    Returns
    -------
    The vector store on success, or `None` when the file is rejected for
    exceeding the demo size limit.
    """
    if st.secrets.self_hosted == "false":
        # The enforced demo limit is 10,000,000 bytes; the message now
        # states that limit instead of the 1MB used for other file types.
        if file_name.size > 10000000:
            st.error("File size is too large. Please upload a file smaller than 10MB.")
            return

    dateshort = time.strftime("%Y%m%d-%H%M%S")
    file_meta_name = f"audiotranscript_{dateshort}.txt"
    openai_api_key = st.secrets["openai_api_key"]
    transcript = _transcribe_audio(openai_api_key, file_name, stats_db)

    # SHA-1 and size are both computed from the transcript, not the audio.
    transcript_bytes = transcript.text.encode("utf-8")
    file_sha = compute_sha1_from_content(transcript_bytes)
    file_size = len(transcript_bytes)

    ## Load chunk size and overlap from sidebar
    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_text(transcript.text)

    docs_with_metadata = [
        Document(
            page_content=text,
            metadata={"file_sha1": file_sha,"file_size": file_size, "file_name": file_meta_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort},
        )
        for text in texts
    ]

    if st.secrets.self_hosted == "false":
        add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
    vector_store.add_documents(docs_with_metadata)
    return vector_store
loaders/common.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import time
3
+ import os
4
+ from utils import compute_sha1_from_file
5
+ from langchain.schema import Document
6
+ import streamlit as st
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from stats import add_usage
9
+
10
def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
    """Load an uploaded file, split it into chunks and add them to the store.

    Parameters
    ----------
    vector_store
        The vector store the chunks are added to.
    file
        The uploaded file; must expose ``name``, ``size`` and ``getvalue()``.
    loader_class
        A loader class constructed with a file path and exposing ``load()``.
    file_suffix
        Extension given to the temporary copy of the file (e.g. ``".pdf"``).
    stats_db
        Optional supabase client; when given, a usage row is recorded.

    Returns
    -------
    None. In demo mode an over-size file is rejected with `st.error`.
    """
    file_name = file.name
    file_size = file.size
    if st.secrets.self_hosted == "false":
        if file_size > 1000000:
            st.error("File size is too large. Please upload a file smaller than 1MB or self host.")
            return

    dateshort = time.strftime("%Y%m%d")

    # The loaders take a path, so the upload is copied to a temporary file.
    # It is closed before being read back and removed in `finally`, so a
    # failing loader can no longer leave the file behind.
    with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
        tmp_file.write(file.getvalue())
        tmp_path = tmp_file.name
    try:
        documents = loader_class(tmp_path).load()
        file_sha1 = compute_sha1_from_file(tmp_path)
    finally:
        os.remove(tmp_path)

    chunk_size = st.session_state['chunk_size']
    chunk_overlap = st.session_state['chunk_overlap']

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    documents = text_splitter.split_documents(documents)

    # Attach the file-level metadata (sha1, size, name, chunking, date) to each chunk
    docs_with_metadata = [
        Document(
            page_content=doc.page_content,
            metadata={"file_sha1": file_sha1,"file_size":file_size ,"file_name": file_name, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap, "date": dateshort},
        )
        for doc in documents
    ]

    vector_store.add_documents(docs_with_metadata)
    if stats_db:
        add_usage(stats_db, "embedding", "file", metadata={"file_name": file_name,"file_type": file_suffix, "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})
loaders/csv.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .common import process_file
2
+ from langchain.document_loaders.csv_loader import CSVLoader
3
+
4
def process_csv(vector_store, file,stats_db):
    """Ingest a CSV upload by delegating to `process_file` with `CSVLoader`."""
    outcome = process_file(
        vector_store,
        file,
        loader_class=CSVLoader,
        file_suffix=".csv",
        stats_db=stats_db,
    )
    return outcome
loaders/docx.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .common import process_file
2
+ from langchain.document_loaders import Docx2txtLoader
3
+
4
def process_docx(vector_store, file, stats_db):
    """Ingest a .docx upload by delegating to `process_file` with `Docx2txtLoader`."""
    outcome = process_file(
        vector_store,
        file,
        loader_class=Docx2txtLoader,
        file_suffix=".docx",
        stats_db=stats_db,
    )
    return outcome
loaders/html.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .common import process_file
2
+ from langchain.document_loaders import UnstructuredHTMLLoader
3
+ import requests
4
+ import re
5
+ import unicodedata
6
+ import tempfile
7
+ import os
8
+ import streamlit as st
9
+ from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile
10
+
11
def process_html(vector_store, file, stats_db):
    """Ingest an HTML upload by delegating to `process_file` with `UnstructuredHTMLLoader`."""
    outcome = process_file(
        vector_store,
        file,
        loader_class=UnstructuredHTMLLoader,
        file_suffix=".html",
        stats_db=stats_db,
    )
    return outcome
13
+
14
+
15
def get_html(url, timeout=10):
    """Download `url` and return its body text, or `None` on any failure.

    Parameters
    ----------
    url
        The address to fetch.
    timeout
        Seconds to wait for the server before giving up. Without a timeout
        a stalled server would block the app indefinitely.

    Returns
    -------
    The response text when the status code is 200; `None` for any other
    status code or when the request itself fails (invalid URL, connection
    error, timeout).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Callers already treat None as "could not access the URL".
        return None
    if response.status_code != 200:
        return None
    return response.text
21
+
22
def create_html_file(url, content):
    """Write downloaded HTML to a temp file and wrap it as an `UploadedFile`.

    Parameters
    ----------
    url
        The source URL; its slug becomes the file name.
    content
        The HTML text to store.

    Returns
    -------
    tuple
        ``(uploaded_file, temp_file_path)`` — the caller is responsible for
        deleting the temporary file.
    """
    file_name = slugify(url) + ".html"
    temp_file_path = os.path.join(tempfile.gettempdir(), file_name)

    # Encode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding, and reuse the same bytes for the
    # record instead of re-opening the file without closing it.
    data = content.encode("utf-8")
    with open(temp_file_path, 'wb') as temp_file:
        temp_file.write(data)

    record = UploadedFileRec(id=None, name=file_name, type='text/html', data=data)
    uploaded_file = UploadedFile(record)

    return uploaded_file, temp_file_path
32
+
33
def delete_tempfile(temp_file_path, url, ret):
    """Remove the temporary HTML file and report the outcome for `url`.

    A message is written to the page only when `ret` is truthy: a success
    note after the file was removed, an error note when removal failed.
    """
    report = bool(ret)
    try:
        os.remove(temp_file_path)
        if report:
            st.write(f"✅ Content saved... {url} ")
    except OSError as error:
        print(f"Error while deleting the temporary file: {str(error)}")
        if report:
            st.write(f"❌ Error while saving content... {url} ")
42
+
43
def slugify(text):
    """Turn `text` into a lowercase ASCII slug.

    Accents are stripped via NFKD normalization, every character that is not
    a word character, whitespace or hyphen is removed, and each run of
    whitespace/hyphens is collapsed into a single hyphen.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')
    cleaned = re.sub(r'[^\w\s-]', '', ascii_only).strip().lower()
    return re.sub(r'[-\s]+', '-', cleaned)
loaders/markdown.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .common import process_file
2
+ from langchain.document_loaders import UnstructuredMarkdownLoader
3
+
4
def process_markdown(vector_store, file, stats_db):
    """Ingest a Markdown upload by delegating to `process_file` with `UnstructuredMarkdownLoader`."""
    outcome = process_file(
        vector_store,
        file,
        loader_class=UnstructuredMarkdownLoader,
        file_suffix=".md",
        stats_db=stats_db,
    )
    return outcome
loaders/pdf.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from .common import process_file
2
+ from langchain.document_loaders import PyPDFLoader
3
+
4
+
5
def process_pdf(vector_store, file, stats_db):
    """Ingest a PDF upload by delegating to `process_file` with `PyPDFLoader`."""
    outcome = process_file(
        vector_store,
        file,
        loader_class=PyPDFLoader,
        file_suffix=".pdf",
        stats_db=stats_db,
    )
    return outcome
loaders/powerpoint.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .common import process_file
2
+ from langchain.document_loaders import UnstructuredPowerPointLoader
3
+
4
def process_powerpoint(vector_store, file, stats_db):
    """Ingest a .pptx upload by delegating to `process_file` with `UnstructuredPowerPointLoader`."""
    outcome = process_file(
        vector_store,
        file,
        loader_class=UnstructuredPowerPointLoader,
        file_suffix=".pptx",
        stats_db=stats_db,
    )
    return outcome
loaders/txt.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .common import process_file
2
+ from langchain.document_loaders import TextLoader
3
+
4
def process_txt(vector_store, file,stats_db):
    """Ingest a plain-text upload by delegating to `process_file` with `TextLoader`."""
    outcome = process_file(
        vector_store,
        file,
        loader_class=TextLoader,
        file_suffix=".txt",
        stats_db=stats_db,
    )
    return outcome
main.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+ import os
3
+ import tempfile
4
+
5
+ import streamlit as st
6
+ from files import file_uploader, url_uploader
7
+ from question import chat_with_doc
8
+ from brain import brain
9
+ from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
10
+ from langchain.vectorstores import SupabaseVectorStore
11
+ from supabase import Client, create_client
12
+ from explorer import view_document
13
+ from stats import get_usage_today
14
+
15
# Connection settings, API keys and deployment mode come from Streamlit secrets.
supabase_url = st.secrets.supabase_url
supabase_key = st.secrets.supabase_service_key
openai_api_key = st.secrets.openai_api_key
anthropic_api_key = st.secrets.anthropic_api_key
hf_api_key = st.secrets.hf_api_key
self_hosted = st.secrets.self_hosted

supabase: Client = create_client(supabase_url, supabase_key)

# Embeddings are computed through the Hugging Face Inference API.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=hf_api_key,
    model_name="BAAI/bge-large-en-v1.5",
)

vector_store = SupabaseVectorStore(
    supabase,
    embeddings,
    query_name='match_documents',
    table_name="documents",
)

# "llama-2" is always offered; the other models depend on which keys are set.
models = ["llama-2"]
if openai_api_key:
    models.extend(["gpt-3.5-turbo", "gpt-4"])
if anthropic_api_key:
    models.extend([
        "claude-v1",
        "claude-v1.3",
        "claude-instant-v1-100k",
        "claude-instant-v1.1-100k",
    ])
41
+
42
# Page-level settings
st.set_page_config(
    page_title="meraKB",
    layout="wide",
    initial_sidebar_state="expanded",
)

st.title("🧠 meraKB - Your digital brain 🧠")
st.markdown("Store your knowledge in a vector store and chat with it.")
if self_hosted == "false":
    st.markdown('**📢 Note: In the public demo, access to functionality is restricted. You can only use the GPT-3.5-turbo model and upload files up to 1Mb. To use more models and upload larger files, consider self-hosting meraKB.**')

st.markdown("---\n\n")

# In demo mode, compare today's usage with the configured limit and flag the
# session as "overused" once the limit is exceeded.
st.session_state["overused"] = False
if self_hosted == "false":
    usage = get_usage_today(supabase)
    limit_exceeded = usage > st.secrets.usage_limit
    if limit_exceeded:
        usage_banner = f"<span style='color:red'>You have used {usage} tokens today, which is more than your daily limit of {st.secrets.usage_limit} tokens. Please come back later or consider self-hosting.</span>"
    else:
        usage_banner = f"<span style='color:blue'>Usage today: {usage} tokens out of {st.secrets.usage_limit}</span>"
    st.markdown(usage_banner, unsafe_allow_html=True)
    if limit_exceeded:
        st.session_state["overused"] = True
    st.write("---")
67
+
68
+
69
+
70
+
71
# Seed the session state with defaults, leaving values already set untouched.
for state_key, default_value in (
    ('model', "llama-2"),
    ('temperature', 0.1),
    ('chunk_size', 500),
    ('chunk_overlap', 0),
    ('max_tokens', 500),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = default_value
82
+
83
# Let the user pick which of the four pages to show.
user_choice = st.radio(
    "Choose an action",
    ('Add Knowledge', 'Chat with your Brain', 'Forget', "Explore"),
)

st.markdown("---\n\n")

if user_choice == 'Add Knowledge':
    # Chunking options are only relevant while adding knowledge.
    st.sidebar.title("Configuration")
    st.sidebar.markdown(
        "Choose your chunk size and overlap for adding knowledge.")
    st.session_state['chunk_size'] = st.sidebar.slider(
        "Select Chunk Size",
        min_value=100,
        max_value=1000,
        value=st.session_state['chunk_size'],
        step=50,
    )
    st.session_state['chunk_overlap'] = st.sidebar.slider(
        "Select Chunk Overlap",
        min_value=0,
        max_value=100,
        value=st.session_state['chunk_overlap'],
        step=10,
    )

    # File uploader on the left, URL uploader on the right.
    upload_col, url_col = st.columns(2)
    with upload_col:
        file_uploader(supabase, vector_store)
    with url_col:
        url_uploader(supabase, vector_store)
elif user_choice == 'Chat with your Brain':
    # Model options are only relevant while asking questions.
    st.sidebar.title("Configuration")
    st.sidebar.markdown(
        "Choose your model and temperature for asking questions.")
    if self_hosted != "false":
        current_index = models.index(st.session_state['model'])
        st.session_state['model'] = st.sidebar.selectbox(
            "Select Model", models, index=current_index)
    else:
        # The demo is pinned to a single model.
        st.sidebar.write("**Model**: gpt-3.5-turbo")
        st.sidebar.write("**Self Host to unlock more models such as claude-v1 and GPT4**")
        st.session_state['model'] = "gpt-3.5-turbo"
    st.session_state['temperature'] = st.sidebar.slider(
        "Select Temperature",
        min_value=0.1,
        max_value=1.0,
        value=st.session_state['temperature'],
        step=0.1,
    )
    if st.secrets.self_hosted != "false":
        st.session_state['max_tokens'] = st.sidebar.slider(
            "Select Max Tokens",
            min_value=500,
            max_value=4000,
            value=st.session_state['max_tokens'],
            step=500,
        )
    else:
        st.session_state['max_tokens'] = 500

    chat_with_doc(st.session_state['model'], vector_store, stats_db=supabase)
elif user_choice == 'Forget':
    st.sidebar.title("Configuration")

    brain(supabase)
elif user_choice == 'Explore':
    st.sidebar.title("Configuration")
    view_document(supabase)

st.markdown("---\n\n")
question.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import anthropic
2
+ import streamlit as st
3
+ from streamlit.logger import get_logger
4
+ from langchain.chains import ConversationalRetrievalChain
5
+ from langchain.memory import ConversationBufferMemory
6
+ from langchain.llms import OpenAI
7
+ from langchain.llms import HuggingFaceEndpoint
8
+ from langchain.chat_models import ChatAnthropic
9
+ from langchain.vectorstores import SupabaseVectorStore
10
+ from stats import add_usage
11
+
12
# Conversation memory handed to every retrieval chain built in this module.
# NOTE(review): defined at module level, so it is presumably shared by all
# sessions served by the same process — confirm this is intended.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)

# API keys for the supported model providers, read from Streamlit secrets.
openai_api_key = st.secrets.openai_api_key
anthropic_api_key = st.secrets.anthropic_api_key
hf_api_key = st.secrets.hf_api_key

logger = get_logger(__name__)
18
+
19
+
20
def count_tokens(question, model):
    """Return a short summary of the question's length.

    The summary always holds the whitespace-separated word count; for models
    whose name starts with ``"claude"`` the Anthropic token count is appended.
    """
    parts = [f'Words: {len(question.split())}']
    if model.startswith("claude"):
        parts.append(f'Tokens: {anthropic.count_tokens(question)}')
    return ' | '.join(parts)
25
+
26
+
27
def _build_qa_chain(model, vector_store):
    """Return a retrieval chain for `model`, or None when no backend applies."""
    temperature = st.session_state['temperature']
    max_tokens = st.session_state['max_tokens']

    if model.startswith("gpt"):
        logger.info('Using OpenAI model %s', model)
        llm = OpenAI(
            model_name=st.session_state['model'], openai_api_key=openai_api_key, temperature=temperature, max_tokens=max_tokens)
        return ConversationalRetrievalChain.from_llm(
            llm, vector_store.as_retriever(), memory=memory, verbose=True)

    if anthropic_api_key and model.startswith("claude"):
        logger.info('Using Anthropics model %s', model)
        llm = ChatAnthropic(
            model=st.session_state['model'], anthropic_api_key=anthropic_api_key, temperature=temperature, max_tokens_to_sample=max_tokens)
        return ConversationalRetrievalChain.from_llm(
            llm, vector_store.as_retriever(), memory=memory, verbose=True, max_tokens_limit=102400)

    if hf_api_key and model.startswith("llama"):
        logger.info('Using Llama model %s', model)
        endpoint_url = ("https://api-inference.huggingface.co/models/meta-llama/Llama-2-70b-chat-hf")
        model_kwargs = {"temperature" : temperature,
                        "max_new_tokens" : max_tokens,
                        "return_full_text" : False}
        hf = HuggingFaceEndpoint(
            endpoint_url=endpoint_url,
            task="text-generation",
            huggingfacehub_api_token=hf_api_key,
            model_kwargs=model_kwargs
        )
        return ConversationalRetrievalChain.from_llm(hf, retriever=vector_store.as_retriever(), memory=memory, verbose=True)

    return None


def chat_with_doc(model, vector_store: SupabaseVectorStore, stats_db):
    """Render the chat page: question box, action buttons and chat history.

    Parameters
    ----------
    model
        Name of the model to answer with; its prefix (``gpt``, ``claude``,
        ``llama``) selects the backend.
    vector_store : SupabaseVectorStore
        The store the retriever is built from.
    stats_db
        The supabase client used to record one usage row per question.
    """
    if 'chat_history' not in st.session_state:
        st.session_state['chat_history'] = []

    question = st.text_area("## Ask a question")
    columns = st.columns(3)
    with columns[0]:
        button = st.button("Ask")
    with columns[1]:
        count_button = st.button("Count Tokens", type='secondary')
    with columns[2]:
        clear_history = st.button("Clear History", type='secondary')

    if clear_history:
        # Clear memory in Langchain
        memory.clear()
        st.session_state['chat_history'] = []
        st.experimental_rerun()

    if button:
        if st.session_state["overused"]:
            st.error("You have used all your free credits. Please try again later or self host.")
        else:
            qa = _build_qa_chain(model, vector_store)
            if qa is None:
                # No branch matched (unknown model name, or the matching API
                # key is missing); calling None would raise a TypeError.
                st.error(f"The model {model} is not available. Please check that its API key is configured.")
            else:
                add_usage(stats_db, "chat", "prompt" + question, {"model": model, "temperature": st.session_state['temperature']})
                st.session_state['chat_history'].append(("You", question))

                # Generate model's response and add it to chat history
                model_response = qa({"question": question})
                logger.info('Result: %s', model_response)

                st.session_state['chat_history'].append(("meraKB", model_response["answer"]))

                # Display chat history
                st.empty()
                for speaker, text in st.session_state['chat_history']:
                    st.markdown(f"**{speaker}:** {text}")

    if count_button:
        st.write(count_tokens(question, model))
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ Markdown==3.4.3
3
+ openai==0.27.6
4
+ pdf2image==1.16.3
5
+ pypdf==3.8.1
6
+ streamlit==1.22.0
7
+ StrEnum==0.4.10
8
+ supabase==1.0.3
9
+ tiktoken==0.4.0
10
+ unstructured==0.6.5
11
+ anthropic==0.2.8
12
+ fastapi==0.95.2
13
+ python-multipart==0.0.6
14
+ uvicorn==0.22.0
15
+ docx2txt
sidebar.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
def sidebar(supabase):
    """Show the number of stored documents in the sidebar."""
    st.sidebar.title("Database Information")
    doc_total = number_of_documents(supabase)
    st.sidebar.markdown(f"**Docs in DB:** {doc_total}")
8
+
9
def number_of_documents(supabase):
    """Return the exact row count of the ``documents`` table."""
    count_query = supabase.table("documents").select("id", count="exact")
    return count_query.execute().count
stats.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime, timedelta
2
+
3
+ # -- Create a table called "stats"
4
+ # create table
5
+ # stats (
6
+ # -- A column called "time" with data type "timestamp"
7
+ # time timestamp,
8
+ # -- A column called "details" with data type "text"
9
+ # chat boolean,
10
+ # embedding boolean,
11
+ # details text,
12
+ # metadata jsonb,
13
+ # -- An "integer" primary key column called "id" that is generated always as identity
14
+ # id integer primary key generated always as identity
15
+ # );
16
+
17
+
18
def get_usage_today(supabase):
    """Return the number of rows added to the ``stats`` table in the last 24 hours.

    The cutoff is sent as an ISO-8601 string, the same format `add_usage`
    writes into the ``time`` column, rather than relying on the client to
    stringify a `datetime` object.
    """
    cutoff = (datetime.now() - timedelta(hours=24)).isoformat()
    response = (
        supabase.table("stats")
        .select("id", count="exact")
        .gte("time", cutoff)
        .execute()
    )
    return response.count
22
+
23
def add_usage(supabase, type, details, metadata):
    """Insert one usage row into the ``stats`` table.

    `type` selects which boolean column is set: ``"chat"`` or
    ``"embedding"``. `details` and `metadata` are stored as given, and the
    row is stamped with the current local time in ISO format.
    """
    row = {
        "time": datetime.now().isoformat(),
        "chat": type == "chat",
        "embedding": type == "embedding",
        "details": details,
        "metadata": metadata,
    }
    supabase.table("stats").insert(row).execute()
utils.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+
3
def compute_sha1_from_file(file_path):
    """Return the hexadecimal SHA-1 digest of the file at `file_path`.

    The file is hashed in fixed-size chunks so large files are never held in
    memory in full.
    """
    digest = hashlib.sha1()
    with open(file_path, "rb") as file:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: file.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
8
+
9
def compute_sha1_from_content(content):
    """Return the hexadecimal SHA-1 digest of the bytes in `content`."""
    hasher = hashlib.sha1()
    hasher.update(content)
    return hasher.hexdigest()