garyg-ai committed on
Commit
b2c7f9f
·
1 Parent(s): b7c7777
Files changed (5) hide show
  1. .chainlit/config.toml +84 -0
  2. .gitignore +160 -0
  3. app.py +235 -0
  4. chainlit.md +14 -0
  5. requirements.txt +12 -0
.chainlit/config.toml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ # Whether to enable telemetry (default: true). No personal data is collected.
3
+ enable_telemetry = true
4
+
5
+ # List of environment variables to be provided by each user to use the app.
6
+ user_env = []
7
+
8
+ # Duration (in seconds) during which the session is saved when the connection is lost
9
+ session_timeout = 3600
10
+
11
+ # Enable third parties caching (e.g LangChain cache)
12
+ cache = false
13
+
14
+ # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
15
+ # follow_symlink = false
16
+
17
+ [features]
18
+ # Show the prompt playground
19
+ prompt_playground = true
20
+
21
+ # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
22
+ unsafe_allow_html = false
23
+
24
+ # Process and display mathematical expressions. This can clash with "$" characters in messages.
25
+ latex = false
26
+
27
+ # Authorize users to upload files with messages
28
+ multi_modal = true
29
+
30
+ # Allows user to use speech to text
31
+ [features.speech_to_text]
32
+ enabled = false
33
+ # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
34
+ # language = "en-US"
35
+
36
+ [UI]
37
+ # Name of the app and chatbot.
38
+ name = "Chatbot"
39
+
40
+ # Show the readme while the conversation is empty.
41
+ show_readme_as_default = true
42
+
43
+ # Description of the app and chatbot. This is used for HTML tags.
44
+ # description = ""
45
+
46
+ # Large content is collapsed by default for a cleaner UI.
47
+ default_collapse_content = true
48
+
49
+ # The default value for the expand messages settings.
50
+ default_expand_messages = false
51
+
52
+ # Hide the chain of thought details from the user in the UI.
53
+ hide_cot = false
54
+
55
+ # Link to your github repo. This will add a github button in the UI's header.
56
+ # github = ""
57
+
58
+ # Specify a CSS file that can be used to customize the user interface.
59
+ # The CSS file can be served from the public directory or via an external link.
60
+ # custom_css = "/public/test.css"
61
+
62
+ # Override default MUI light theme. (Check theme.ts)
63
+ [UI.theme.light]
64
+ #background = "#FAFAFA"
65
+ #paper = "#FFFFFF"
66
+
67
+ [UI.theme.light.primary]
68
+ #main = "#F80061"
69
+ #dark = "#980039"
70
+ #light = "#FFE7EB"
71
+
72
+ # Override default MUI dark theme. (Check theme.ts)
73
+ [UI.theme.dark]
74
+ #background = "#FAFAFA"
75
+ #paper = "#FFFFFF"
76
+
77
+ [UI.theme.dark.primary]
78
+ #main = "#F80061"
79
+ #dark = "#980039"
80
+ #light = "#FFE7EB"
81
+
82
+
83
+ [meta]
84
+ generated_by = "0.7.700"
.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+ *wandb*
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
app.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import openai
3
+
4
+ import chainlit as cl # importing chainlit for our app
5
+ from chainlit.prompt import Prompt, PromptMessage # importing prompt tools
6
+
7
+ import os
8
+ import getpass
9
+
10
+
11
+ from dotenv import load_dotenv
12
+
13
# Load variables from a local .env file (if any) into the process
# environment, then pin the Pinecone environment name read further down.
load_dotenv()

os.environ.update(PINECONE_ENV="gcp-starter")
16
+
17
+
18
import arxiv

arxiv_client = arxiv.Client()
paper_urls = []


def _fetch_paper_urls(query="Retrieval Augmented Generation", max_results=5):
    """Return the PDF URLs of the arXiv papers most relevant to *query*.

    This is the one-off corpus discovery step. It is deliberately NOT called
    at import time, so starting the app issues no arXiv request.

    Args:
        query: Free-text arXiv search query.
        max_results: Maximum number of papers to return.

    Returns:
        A list of PDF URL strings, ordered by relevance.
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    return [result.pdf_url for result in arxiv_client.results(search)]
37
+
38
+
39
+
40
+
41
from langchain.document_loaders import PyPDFLoader

docs = []


def _load_papers(urls):
    """Download and parse every PDF in *urls*.

    Not called at import time; it belongs to the one-off ingestion workflow.

    Args:
        urls: Iterable of PDF URLs (or local paths) accepted by PyPDFLoader.

    Returns:
        A list with one entry per URL, each entry being whatever
        ``PyPDFLoader(url).load()`` returns for that paper.
    """
    return [PyPDFLoader(url).load() for url in urls]
52
+
53
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split on a 1000-unit budget (measured with len) with 100 units of overlap
# between consecutive chunks.
_splitter_settings = dict(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
text_splitter = RecursiveCharacterTextSplitter(**_splitter_settings)
61
+
62
+
63
+
64
+
65
+
66
+
67
import pinecone
from pinecone.core.client.configuration import Configuration as OpenApiConfiguration

# Credentials come from the environment; a missing PINECONE_API_KEY raises
# KeyError here, at import time.
YOUR_API_KEY = os.environ["PINECONE_API_KEY"]
YOUR_ENV = os.environ["PINECONE_ENV"]

index_name = 'arxiv-paper-index2'

pinecone.init(api_key=YOUR_API_KEY, environment=YOUR_ENV)

# Create the index on first run only. It uses cosine similarity over
# 1536-dimensional vectors.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(name=index_name, metric='cosine', dimension=1536)

index = pinecone.GRPCIndex(index_name)
91
+
92
+
93
+
94
+
95
+
96
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore

# Embeddings are cached on disk under ./cache/, namespaced by the embedding
# model's name.
core_embeddings_model = OpenAIEmbeddings()
store = LocalFileStore("./cache/")

embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=core_embeddings_model,
    document_embedding_cache=store,
    namespace=core_embeddings_model.model,
)
109
+
110
+
111
+
112
from tqdm.auto import tqdm
from uuid import uuid4

BATCH_LIMIT = 100

texts = []
metadatas = []


def _upsert_batch(batch_texts, batch_metadatas):
    """Embed *batch_texts* and upsert them into the module-level ``index``."""
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embedder.embed_documents(batch_texts)
    index.upsert(vectors=zip(ids, embeds, batch_metadatas))


def _ingest_documents(documents, batch_limit=BATCH_LIMIT):
    """Chunk, embed and upsert previously loaded papers.

    This is the one-off ingestion step. It is NOT called at import time, so
    starting the app never writes to the index.

    Args:
        documents: Iterable of papers, each an iterable of page documents
            exposing ``page_content`` and ``metadata["source"]`` /
            ``metadata["page"]``.
        batch_limit: Flush to the index once at least this many chunks are
            pending.
    """
    pending_texts = []
    pending_metadatas = []

    for pages in tqdm(documents):
        for page in pages:
            metadata = {
                'source_document': page.metadata["source"],
                'page_number': page.metadata["page"],
            }
            record_texts = text_splitter.split_text(page.page_content)

            pending_texts.extend(record_texts)
            # The chunk text is stored in the metadata so it can be read back
            # from the index at query time.
            pending_metadatas.extend(
                {"chunk": j, "text": text, **metadata}
                for j, text in enumerate(record_texts)
            )

            if len(pending_texts) >= batch_limit:
                _upsert_batch(pending_texts, pending_metadatas)
                pending_texts = []
                pending_metadatas = []

    # Flush whatever is left after the last full batch.
    if pending_texts:
        _upsert_batch(pending_texts, pending_metadatas)
147
+
148
from langchain.vectorstores import Pinecone

# Metadata key handed to the vector store as its text field.
text_field = "text"

# Rebinds ``index`` to a standard (non-gRPC) handle for querying.
index = pinecone.Index(index_name)

vectorstore = Pinecone(
    index,
    embedder.embed_query,
    text_field
)

# Manual smoke test, kept disabled so importing the app issues no query:
#
#     query = "What is dense vector retrieval?"
#     vectorstore.similarity_search(query, k=3)
171
+
172
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# temperature=0 is passed explicitly to the chat model.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Prompt with two placeholders, {context} and {question}.
system_template = """Answer the following question with the provided context only in the voice of hulk hogan. If you aren't able to get the answer from that, then please don't answer the question.

### CONTEXT
{context}

###QUESTION
{question}
"""

prompt = ChatPromptTemplate.from_template(system_template)

retriever = vectorstore.as_retriever()
197
+
198
+
199
from operator import itemgetter
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.schema import format_document
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts.prompt import PromptTemplate

import langchain
from langchain.cache import InMemoryCache
from langchain.globals import set_llm_cache

_pick_question = itemgetter("question")
_pick_context = itemgetter("context")

# Stage 1: look up documents for the question and keep the question itself.
_gather_inputs = {
    "context": _pick_question | retriever,
    "question": _pick_question,
}

# Stage 2: pass everything through, re-assigning "context" from itself.
_carry_context = RunnablePassthrough.assign(context=_pick_context)

# Stage 3: produce the model response and return the context alongside it.
_respond = {
    "response": prompt | llm,
    "context": _pick_context,
}

# The chain takes {"question": ...} and yields {"response": ..., "context": ...}.
retrieval_augmented_qa_chain = _gather_inputs | _carry_context | _respond

set_llm_cache(InMemoryCache())
223
+
224
+
225
+
226
+
227
@cl.on_chat_start
async def on_chat_start():
    """Chainlit chat-start hook; only writes a startup marker to stdout."""
    print("starting up")
230
+
231
@cl.on_message
async def on_message(message: cl.Message):
    """Answer an incoming chat message with the retrieval-augmented chain.

    The chain yields a dict with ``response`` (the chat model's message) and
    ``context`` (the retrieved documents). Only the text of ``response`` is
    sent to the user, because the dict itself is not a valid message body.

    Args:
        message: The user's message; its ``content`` is used as the question.
    """
    # Use the async entry point so the retrieval and LLM calls do not block
    # the event loop shared by every chat session.
    result = await retrieval_augmented_qa_chain.ainvoke(
        {"question": message.content}
    )
    await cl.Message(content=result["response"].content).send()
235
+
chainlit.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Welcome to Chainlit! 🚀🤖
2
+
3
+ Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
4
+
5
+ ## Useful Links 🔗
6
+
7
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
8
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
9
+
10
+ We can't wait to see what you create with Chainlit! Happy coding! 💻😊
11
+
12
+ ## Welcome screen
13
+
14
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ chainlit==0.7.700
2
+ cohere==4.37
3
+ openai==1.3.5
4
+ tiktoken==0.5.1
5
+ python-dotenv==1.0.0
6
+ openai
7
+ langchain
8
+ arxiv
9
+ ipywidgets
10
+ wandb
11
+ pypdf
12
+ pinecone-client[grpc]==2.2.4