Spaces:

jeevanions
/

LCELRag

Paused

App Files Files Community

jeevan commited on Aug 31, 2024

Commit

4a0c158

0 Parent(s):

Locally working lcel rag

Browse files

Files changed (7) hide show

.chainlit/config.toml +84 -0
.gitignore +8 -0
Chunking.py +68 -0
Dockerfile +9 -0
app.py +125 -0
chainlit.md +3 -0
requirements.txt +15 -0

.chainlit/config.toml ADDED Viewed

	@@ -0,0 +1,84 @@

+[project]
+# Whether to enable telemetry (default: true). No personal data is collected.
+enable_telemetry = true
+# List of environment variables to be provided by each user to use the app.
+user_env = []
+# Duration (in seconds) during which the session is saved when the connection is lost
+session_timeout = 3600
+# Enable third parties caching (e.g LangChain cache)
+cache = false
+# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
+# follow_symlink = false
+[features]
+# Show the prompt playground
+prompt_playground = true
+# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
+unsafe_allow_html = false
+# Process and display mathematical expressions. This can clash with "$" characters in messages.
+latex = false
+# Authorize users to upload files with messages
+multi_modal = true
+# Allows user to use speech to text
+[features.speech_to_text]
+    enabled = false
+    # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
+    # language = "en-US"
+[UI]
+# Name of the app and chatbot.
+name = "Chatbot"
+# Show the readme while the conversation is empty.
+show_readme_as_default = true
+# Description of the app and chatbot. This is used for HTML tags.
+# description = ""
+# Large size content are by default collapsed for a cleaner ui
+default_collapse_content = true
+# The default value for the expand messages settings.
+default_expand_messages = false
+# Hide the chain of thought details from the user in the UI.
+hide_cot = false
+# Link to your github repo. This will add a github button in the UI's header.
+# github = ""
+# Specify a CSS file that can be used to customize the user interface.
+# The CSS file can be served from the public directory or via an external link.
+# custom_css = "/public/test.css"
+# Override default MUI light theme. (Check theme.ts)
+[UI.theme.light]
+    #background = "#FAFAFA"
+    #paper = "#FFFFFF"
+    [UI.theme.light.primary]
+        #main = "#F80061"
+        #dark = "#980039"
+        #light = "#FFE7EB"
+# Override default MUI dark theme. (Check theme.ts)
+[UI.theme.dark]
+    #background = "#FAFAFA"
+    #paper = "#FFFFFF"
+    [UI.theme.dark.primary]
+        #main = "#F80061"
+        #dark = "#980039"
+        #light = "#FFE7EB"
+[meta]
+generated_by = "0.7.700"

.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+venv/*
+.env
+# Ignore byte-compiled files for every module and Python version
+__pycache__/
+.vscode/launch.json
+.vscode/settings.json

Chunking.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from enum import Enum
+from langchain_community.document_loaders import PyPDFLoader,TextLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter,NLTKTextSplitter,SpacyTextSplitter
+# Separator list handed to RecursiveCharacterTextSplitter in
+# TextLoaderAndSplitterWrapper.__init__. Entries run from paragraph/line breaks
+# down to spaces and punctuation, including zero-width, fullwidth and
+# ideographic variants.
+# NOTE(review): the trailing "" is presumably the splitter's last-resort
+# separator - confirm against the installed LangChain version.
+separators=[
+        "\n\n",
+        "\n",
+        " ",
+        ".",
+        ",",
+        "\u200b",  # Zero-width space
+        "\uff0c",  # Fullwidth comma
+        "\u3001",  # Ideographic comma
+        "\uff0e",  # Fullwidth full stop
+        "\u3002",  # Ideographic full stop
+        "",
+    ]
+class ChunkingStrategy(Enum):
+    """Names of the text-splitting strategies accepted by TextLoaderAndSplitterWrapper."""
+    RECURSIVE_CHARACTER_CHAR_SPLITTER = "recursive_character_char_splitter"
+    NLTK_TEXT_SPLITTER = "nltk_text_splitter"
+    SPACY_TEXT_SPLITTER = "spacy_text_splitter"
+class TextLoaderAndSplitterWrapper:
+    def __init__(self, strategy: ChunkingStrategy, file_path:str):
+        # Defaults
+        self.splitter = None
+        self.documents = []
+        # Determine with splitter strategy to use from parameter
+        if strategy == ChunkingStrategy.RECURSIVE_CHARACTER_CHAR_SPLITTER:
+            self.splitter = RecursiveCharacterTextSplitter(separators=separators)
+        elif strategy == ChunkingStrategy.NLTK_TEXT_SPLITTER:
+            self.splitter = NLTKTextSplitter()
+        elif strategy == ChunkingStrategy.SPACY_TEXT_SPLITTER:
+            self.splitter = SpacyTextSplitter()
+        else:
+            raise ValueError(f"Unknown strategy: {strategy}")
+        # Load the document and chunk it
+        self.file_path = file_path
+    def load_documents(self):
+        if self.file_path.endswith(".pdf"):
+            # Use PDF loader
+            pdf_loader = PyPDFLoader(self.file_path)
+            self.documents =  pdf_loader.load_and_split(text_splitter=self.splitter) #  Defaults to RecursiveCharacterTextSplitter.
+            return self.documents
+        elif self.file_path.endswith(".txt"):
+            # Use Text loader
+            text_loader = TextLoader(self.file_path)
+            self.documents = text_loader.load_and_split(text_splitter=self.splitter)
+            return self.documents
+        else:
+            raise ValueError(f"Unknown file type: {self.file_path}")
+    def split(self, text: str):
+        return self.splitter.split(text)
+    def join(self, chunks: list):
+        return self.splitter.join(chunks)
+    def __str__(self):
+        return f"TextLoaderAndSplitterWrapper(splitter={self.splitter})"
+    def __repr__(self):
+        return str(self)

Dockerfile ADDED Viewed

	@@ -0,0 +1,9 @@

+FROM python:3.9
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Install dependencies before copying the sources so this layer is reused
+# from the build cache when only application code changes.
+COPY --chown=user requirements.txt $HOME/app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+COPY --chown=user . $HOME/app
+CMD ["chainlit", "run", "app.py", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import os
+from typing import List
+from operator import itemgetter
+from Chunking import ChunkingStrategy, TextLoaderAndSplitterWrapper
+from langchain.schema.runnable import RunnablePassthrough
+from langchain_openai import ChatOpenAI
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_community.vectorstores import Qdrant
+import chainlit as cl
+from chainlit.types import AskFileResponse
+from chainlit.cli import run_chainlit
+import tempfile
+# Read the key eagerly: a missing OPENAI_API_KEY raises KeyError as soon as
+# this module is imported.
+OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
+# OpenAI chat model name.
+GPT_MODEL = "gpt-4o-mini"
+# Utility functions
+def save_file(file: AskFileResponse,file_ext:str) -> str:
+    if file_ext == "application/pdf":
+        file_ext = ".pdf"
+    elif file_ext == "text/plain":
+        file_ext = ".txt"
+    else:
+        raise ValueError(f"Unknown file type: {file_ext}")
+    with tempfile.NamedTemporaryFile(
+        mode="wb", delete=False, suffix=file_ext
+    ) as temp_file:
+        temp_file_path = temp_file.name
+        temp_file.write(file.content)
+    return temp_file_path
+# Prepare the components that will form the chain
+## Step 1: Create a prompt template
+base_rag_prompt_template = """\
+You are a helpful assistant that can answer questions related to the provided context. Repond I don't have that information if outside context.
+Context:
+{context}
+Question:
+{question}
+"""
+base_rag_prompt = ChatPromptTemplate.from_template(base_rag_prompt_template)
+## Step 2: Create Embeddings model instance for creating embeddings
+embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
+## Step 2: Create the OpenAI chat model
+base_llm = ChatOpenAI(model="gpt-4o-mini", tags=["base_llm"])
+@cl.on_chat_start
+async def on_chat_start():
+    msg = cl.Message(content="Welcome to the Chat with Files app powered by LCEL and OpenAI - RAG!")
+    await msg.send()
+    files = None
+    documents = None
+    # Wait for the user to upload a file
+    while files == None:
+        files = await cl.AskFileMessage(
+            content="Please upload a text or a pdf file to begin!",
+            accept=["text/plain", "application/pdf"],
+            max_size_mb=10,
+            max_files=1,
+            timeout=180,
+        ).send()
+    ## Load file and split into chunks
+    msg = cl.Message(content=f"Processing `{files[0].name}`...")
+    await msg.send()
+    current_file_path = save_file(files[0], files[0].type)
+    loader_splitter = TextLoaderAndSplitterWrapper(ChunkingStrategy.RECURSIVE_CHARACTER_CHAR_SPLITTER, current_file_path)
+    documents = loader_splitter.load_documents()
+    ## Vectorising the documents
+    qdrant_vectorstore = Qdrant.from_documents(
+        documents=documents,
+        embedding=embedding_model,
+        location=":memory:"
+    )
+    qdrant_retriever = qdrant_vectorstore.as_retriever()
+    # create the chain on new chart session
+    retrieval_augmented_qa_chain = (
+        # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
+        # "question" : populated by getting the value of the "question" key
+        # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
+        {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
+        # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
+        #              by getting the value of the "context" key from the previous step
+        | RunnablePassthrough.assign(context=itemgetter("context"))
+        # "response" : the "context" and "question" values are used to format our prompt object and then piped
+        #              into the LLM and stored in a key called "response"
+        # "context"  : populated by getting the value of the "context" key from the previous step
+        | {"response": base_rag_prompt | base_llm, "context": itemgetter("context")}
+    )
+    # Let the user know that the system is ready
+    msg = cl.Message(content=f"Processing `{files[0].name}` done. You can now ask questions!")
+    await msg.send()
+    cl.user_session.set("chain", retrieval_augmented_qa_chain)
+@cl.on_message
+async def main(message: cl.Message):
+    chain = cl.user_session.get("chain")
+    msg = cl.Message(content="")
+    response = chain.invoke({"question": message.content})
+    msg.content= response["response"].content
+    await msg.send()
+    cl.user_session.set("chain", chain)
+# Entry point for running this file directly as a script.
+# NOTE(review): run_chainlit comes from chainlit.cli and presumably starts the
+# same server as `chainlit run app.py` - confirm.
+if __name__ == "__main__":
+    run_chainlit(__file__)

chainlit.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # Welcome to Chat with Your Text File
2	+
3	+ With this application, you can chat with an uploaded text file that is smaller than 2MB!

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+langsmith
+# Imported directly by Chunking.py (langchain.text_splitter) and app.py (langchain.schema.runnable)
+langchain
+langchain_core
+langchain_openai
+langchain_community
+langchain-text-splitters
+langchain-qdrant
+qdrant-client
+openai
+tiktoken
+cohere
+lxml
+pymupdf
+pypdf
+chainlit==0.7.700