Spaces:

garyg-ai
/

raqa-arxiv-app

Runtime error

App Files Files Community

garyg-ai committed on Dec 12, 2023

Commit

b2c7f9f

1 Parent(s): b7c7777

adding

Browse files

Files changed (5) hide show

.chainlit/config.toml +84 -0
.gitignore +160 -0
app.py +235 -0
chainlit.md +14 -0
requirements.txt +12 -0

.chainlit/config.toml ADDED Viewed

	@@ -0,0 +1,84 @@

+[project]
+# Whether to enable telemetry (default: true). No personal data is collected.
+enable_telemetry = true
+# List of environment variables to be provided by each user to use the app.
+user_env = []
+# Duration (in seconds) during which the session is saved when the connection is lost
+session_timeout = 3600
+# Enable third parties caching (e.g LangChain cache)
+cache = false
+# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
+# follow_symlink = false
+[features]
+# Show the prompt playground
+prompt_playground = true
+# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
+unsafe_allow_html = false
+# Process and display mathematical expressions. This can clash with "$" characters in messages.
+latex = false
+# Authorize users to upload files with messages
+multi_modal = true
+# Allows user to use speech to text
+[features.speech_to_text]
+    enabled = false
+    # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
+    # language = "en-US"
+[UI]
+# Name of the app and chatbot.
+name = "Chatbot"
+# Show the readme while the conversation is empty.
+show_readme_as_default = true
+# Description of the app and chatbot. This is used for HTML tags.
+# description = ""
+# Large size content are by default collapsed for a cleaner ui
+default_collapse_content = true
+# The default value for the expand messages settings.
+default_expand_messages = false
+# Hide the chain of thought details from the user in the UI.
+hide_cot = false
+# Link to your github repo. This will add a github button in the UI's header.
+# github = ""
+# Specify a CSS file that can be used to customize the user interface.
+# The CSS file can be served from the public directory or via an external link.
+# custom_css = "/public/test.css"
+# Override default MUI light theme. (Check theme.ts)
+[UI.theme.light]
+    #background = "#FAFAFA"
+    #paper = "#FFFFFF"
+    [UI.theme.light.primary]
+        #main = "#F80061"
+        #dark = "#980039"
+        #light = "#FFE7EB"
+# Override default MUI dark theme. (Check theme.ts)
+[UI.theme.dark]
+    #background = "#FAFAFA"
+    #paper = "#FFFFFF"
+    [UI.theme.dark.primary]
+        #main = "#F80061"
+        #dark = "#980039"
+        #light = "#FFE7EB"
+[meta]
+generated_by = "0.7.700"

.gitignore ADDED Viewed

	@@ -0,0 +1,160 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+*wandb*
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

app.py ADDED Viewed

	@@ -0,0 +1,235 @@

+import asyncio
+import openai
+import chainlit as cl  # importing chainlit for our app
+from chainlit.prompt import Prompt, PromptMessage  # importing prompt tools
+import os
+import getpass
+from dotenv import load_dotenv
+load_dotenv()
+os.environ["PINECONE_ENV"] = "gcp-starter"
+import arxiv
+arxiv_client = arxiv.Client()
+paper_urls = []
+''''
+search = arxiv.Search(
+  query = "Retrieval Augmented Generation",
+  max_results = 5,
+  sort_by = arxiv.SortCriterion.Relevance
+)
+for result in arxiv_client.results(search):
+  paper_urls.append(result.pdf_url)
+print(paper_urls)
+'''
+from langchain.document_loaders import PyPDFLoader
+docs = []
+''''
+for paper_url in paper_urls:
+  loader = PyPDFLoader(paper_url)
+  docs.append(loader.load())
+print(docs[0][6])
+'''
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+# Splitter configured for chunks of at most 1000 units with 100 units of
+# overlap; chunk length is measured with the builtin len().
+# In this file it is only referenced from the disabled ingestion snippet
+# (the string-literal block further down that calls split_text).
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size = 1000,
+    chunk_overlap = 100,
+    length_function = len
+)
+import pinecone
+from pinecone.core.client.configuration import Configuration as OpenApiConfiguration
+# Pinecone connection setup, executed at import time.
+# Both lookups raise KeyError when the variable is missing. PINECONE_ENV is
+# assigned unconditionally near the top of this file; PINECONE_API_KEY must
+# come from the process environment or the .env file read by load_dotenv().
+YOUR_API_KEY = os.environ["PINECONE_API_KEY"]
+YOUR_ENV = os.environ["PINECONE_ENV"]
+index_name = 'arxiv-paper-index2'
+pinecone.init(
+    api_key=YOUR_API_KEY,
+    environment=YOUR_ENV
+)
+# Create the index only when no index with this name is listed yet.
+if index_name not in pinecone.list_indexes():
+    # New index using cosine similarity over 1536-dimensional vectors.
+    # NOTE(review): 1536 presumably matches the output size of the default
+    # OpenAIEmbeddings model used below -- confirm before changing models.
+    pinecone.create_index(
+        name=index_name,
+        metric='cosine',
+        dimension=1536
+    )
+# gRPC handle to the index; this name is rebound to pinecone.Index(...) later
+# in the file, before the vector store is built.
+index = pinecone.GRPCIndex(index_name)
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.embeddings import CacheBackedEmbeddings
+from langchain.storage import LocalFileStore
+# Embedding model wrapped in a cache whose byte store lives under ./cache/.
+# The cache namespace is the embedding model's name (core_embeddings_model.model).
+# NOTE(review): the vector store below is given embedder.embed_query; whether
+# query embeddings (as opposed to document embeddings) are served from this
+# cache depends on the installed LangChain version -- confirm.
+store = LocalFileStore("./cache/")
+core_embeddings_model = OpenAIEmbeddings()
+embedder = CacheBackedEmbeddings.from_bytes_store(
+    core_embeddings_model,
+    store,
+    namespace=core_embeddings_model.model
+)
+from tqdm.auto import tqdm
+from uuid import uuid4
+BATCH_LIMIT = 100
+texts = []
+metadatas = []
+''''
+for i in tqdm(range(len(docs))):
+  for doc in docs[i]:
+    metadata = {
+        'source_document' : doc.metadata["source"],
+        'page_number' : doc.metadata["page"]
+    }
+    record_texts = text_splitter.split_text(doc.page_content)
+    record_metadatas = [{
+        "chunk": j, "text": text, **metadata
+    } for j, text in enumerate(record_texts)]
+    texts.extend(record_texts)
+    metadatas.extend(record_metadatas)
+    if len(texts) >= BATCH_LIMIT:
+        ids = [str(uuid4()) for _ in range(len(texts))]
+        embeds = embedder.embed_documents(texts)
+        index.upsert(vectors=zip(ids, embeds, metadatas))
+        texts = []
+        metadatas = []
+if len(texts) > 0:
+    ids = [str(uuid4()) for _ in range(len(texts))]
+    embeds = embedder.embed_documents(texts)
+    index.upsert(vectors=zip(ids, embeds, metadatas))
+'''
+from langchain.vectorstores import Pinecone
+# LangChain vector store over the Pinecone index.
+# "text" is the metadata key handed to the store as its text field; it is the
+# same key the disabled ingestion snippet above writes each chunk's text under.
+text_field = "text"
+# Rebinds `index`: the GRPCIndex handle created earlier is replaced by a
+# pinecone.Index handle for the same index name.
+index = pinecone.Index(index_name)
+# Positional arguments: index handle, query-embedding callable, text field key.
+vectorstore = Pinecone(
+    index,
+    embedder.embed_query,
+    text_field
+)
+''''
+query = "What is dense vector retrieval?"
+'''
+''''
+vectorstore.similarity_search(
+    query,
+    k=3
+)
+'''
+from langchain.chat_models import ChatOpenAI
+# Chat model used to generate answers: gpt-3.5-turbo with temperature 0.
+llm = ChatOpenAI(
+    model="gpt-3.5-turbo",
+    temperature=0
+)
+from langchain.prompts import ChatPromptTemplate
+# Prompt text with two placeholders, {context} and {question}. It instructs
+# the model to answer only from the supplied context and to decline otherwise.
+system_template = """Answer the following question with the provided context only in the voice of hulk hogan. If you aren't able to get the answer from that, then please don't answer the question.
+### CONTEXT
+{context}
+###QUESTION
+{question}
+"""
+# Retriever built from the vector store with its default settings.
+retriever = vectorstore.as_retriever()
+# NOTE(review): ChatPromptTemplate is already imported a few lines above;
+# this second import is redundant.
+from langchain.prompts import ChatPromptTemplate
+# Chat prompt built from the template string above.
+prompt = ChatPromptTemplate.from_template(system_template)
+from operator import itemgetter
+from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
+from langchain.schema import format_document
+from langchain.schema.output_parser import StrOutputParser
+from langchain.prompts.prompt import PromptTemplate
+# Retrieval-augmented QA pipeline. Input: a dict with a "question" key.
+# Output: a dict with "response" (the result of prompt | llm) and "context"
+# (whatever the retriever returned for the question).
+retrieval_augmented_qa_chain = (
+    # Step 1: feed the question to the retriever to produce "context", and
+    # pass the question through unchanged.
+    {"context": itemgetter("question") | retriever,
+     "question": itemgetter("question")
+    }
+    # Step 2: re-assigns "context" to its own current value, so the mapping
+    # leaves this step with the same keys and values it entered with.
+    | RunnablePassthrough.assign(
+        context=itemgetter("context")
+      )
+    # Step 3: render the prompt and call the model for "response", and keep
+    # the retrieved context alongside it.
+    | {
+         "response": prompt  | llm,
+         "context": itemgetter("context"),
+      }
+)
+import langchain
+from langchain.cache import InMemoryCache
+from langchain.globals import set_llm_cache
+set_llm_cache(InMemoryCache())
+@cl.on_chat_start
+async def on_chat_start():
+    print("starting up")
+@cl.on_message
+async def on_message(message: cl.Message):
+    await (cl.Message(content=retrieval_augmented_qa_chain.invoke({"question":message.content})).send())

chainlit.md ADDED Viewed

	@@ -0,0 +1,14 @@

+# Welcome to Chainlit! 🚀🤖
+Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
+## Useful Links 🔗
+- **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
+- **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
+We can't wait to see what you create with Chainlit! Happy coding! 💻😊
+## Welcome screen
+To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+chainlit==0.7.700
+cohere==4.37
+openai==1.3.5
+tiktoken==0.5.1
+python-dotenv==1.0.0
+openai
+langchain
+arxiv
+ipywidgets
+wandb
+pypdf
+pinecone-client[grpc]