Spaces:

jeevanions
/

PythonicRAG

Sleeping

File size: 4,846 Bytes

234eac0
249d2c8
637aeec
234eac0
 
249d2c8
 
234eac0
 
 
 
 
 
249d2c8
234eac0
 
249d2c8
234eac0
249d2c8
 
 
 
 
234eac0
 
 
 
 
 
 
 
 
 
 
249d2c8
 
234eac0
249d2c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234eac0
249d2c8
234eac0
 
249d2c8
637aeec
249d2c8
234eac0
637aeec
234eac0
 
 
249d2c8
 
637aeec
234eac0
 
249d2c8
637aeec
 
 
 
 
 
 
 
 
249d2c8
 
 
 
 
 
 
 
234eac0
 
 
249d2c8
 
 
 
 
 
234eac0
 
 
 
249d2c8
234eac0
249d2c8
 
 
 
234eac0
 
249d2c8
 
 
637aeec
249d2c8
637aeec
249d2c8
 
 
234eac0
 
 
249d2c8
234eac0
249d2c8
 
 
234eac0
249d2c8
234eac0
 
 
249d2c8
 
234eac0
249d2c8
234eac0
249d2c8
 
234eac0
 
 
 
 
249d2c8
 
 
 
 
234eac0
 
 
 
249d2c8
234eac0
 
249d2c8

import os
from openai import AsyncOpenAI
from RagPipeline import RetrievalAugmentedQAPipeline
from typing import List
from chainlit.types import AskFileResponse
from chainlit.cli import run_chainlit
from aimakerspace.text_utils import CharacterTextSplitter, PdfFileLoader, TextFileLoader
from aimakerspace.openai_utils.prompts import (
    UserRolePrompt,
    SystemRolePrompt,
    AssistantRolePrompt,
)
from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.vectordatabase import VectorDatabase, VectorDatabaseOptions
from aimakerspace.openai_utils.chatmodel import ChatOpenAI
import chainlit as cl
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Instrument the OpenAI client
# cl.instrument_openai()

##### Prompt Templates #####
# System instruction sent with every question. The backslash right after the
# opening quotes keeps the string from starting with a newline.
# NOTE(review): "a users question" is missing an apostrophe; it is left as is
# because the text is part of the prompt sent at runtime.
system_template = """\
Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""

# User-turn template with two placeholders, {context} and {question}.
# Presumably these are filled in by the pipeline when it formats the prompt;
# the formatting call is not visible in this file.
user_prompt_template = """\
Context:
{context}

Question:
{question}
"""

# Wrap the raw templates in the prompt helper classes; both objects are later
# passed to RetrievalAugmentedQAPipeline in on_chat_start.
system_role_prompt = SystemRolePrompt(system_template)
user_role_prompt = UserRolePrompt(user_prompt_template)

### Text Chunking ###

# Module-level splitter shared by process_text_file. Only the separator list
# is customised; chunk size and overlap are not passed, so the library
# defaults apply.
# The separators run from coarse (paragraph break) to fine (empty string) and
# include several CJK / fullwidth punctuation marks. Presumably the splitter
# tries them in the order given -- confirm against the LangChain docs.
# text_splitter = CharacterTextSplitter()
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
)

def process_text_file(file: AskFileResponse) -> List[str]:
    """Split an uploaded plain-text file into chunks.

    The upload's bytes are written to a temporary ``.txt`` file because
    ``TextFileLoader`` is constructed from a path. The temporary file is
    removed once the documents have been loaded and split, so repeated
    uploads do not accumulate files on disk.

    Args:
        file: The uploaded file; its ``content`` attribute is written to the
            temporary file in binary mode.

    Returns:
        The chunks produced by ``text_splitter.split_text`` for each loaded
        document, concatenated in document order.
    """
    import contextlib
    import tempfile

    # delete=False so the loader can reopen the file by name after we close
    # it; cleanup is therefore done explicitly in the ``finally`` below.
    temp_file = tempfile.NamedTemporaryFile(
        mode="wb", delete=False, suffix=".txt"
    )
    temp_file_path = temp_file.name
    try:
        with temp_file:
            temp_file.write(file.content)

        documents = TextFileLoader(temp_file_path).load_documents()
        texts: List[str] = []
        for doc in documents:
            texts.extend(text_splitter.split_text(doc))
        return texts
    finally:
        # Best-effort cleanup: a failed delete must not mask the real result
        # or the real exception.
        with contextlib.suppress(OSError):
            os.remove(temp_file_path)

def process_pdf_file(file: AskFileResponse) -> List[str]:
    """Extract text pieces from an uploaded PDF file.

    The upload's bytes are written to a temporary ``.pdf`` file because
    ``PdfFileLoader`` is constructed from a path. The temporary file is
    removed after loading, so repeated uploads do not accumulate files on
    disk.

    Args:
        file: The uploaded file; its ``content`` attribute is written to the
            temporary file in binary mode.

    Returns:
        The items returned by ``PdfFileLoader.load_documents`` as a list. No
        further splitting is applied here (the original author's note says
        the loader already splits the text, by page).
    """
    import contextlib
    import tempfile

    # delete=False so the loader can reopen the file by name after we close
    # it; cleanup is therefore done explicitly in the ``finally`` below.
    temp_file = tempfile.NamedTemporaryFile(
        mode="wb", delete=False, suffix=".pdf"
    )
    temp_file_path = temp_file.name
    try:
        with temp_file:
            temp_file.write(file.content)

        # Materialise the result before the file is removed, in case the
        # loader hands back something lazy.
        return list(PdfFileLoader(temp_file_path).load_documents())
    finally:
        # Best-effort cleanup: a failed delete must not mask the real result
        # or the real exception.
        with contextlib.suppress(OSError):
            os.remove(temp_file_path)

async def send_new_message(content, elemets=None):
    """Build a Chainlit message, send it, and return the message object.

    Args:
        content: Passed to ``cl.Message`` as its first positional argument.
        elemets: Forwarded to ``cl.Message`` as ``elements``. The parameter
            name is misspelled but kept for existing keyword callers.

    Returns:
        The ``cl.Message`` instance after ``send()`` has been awaited.
    """
    outgoing = cl.Message(content, elements=elemets)
    await outgoing.send()
    return outgoing


@cl.on_chat_start
async def on_chat_start():
    """Set up a chat session: collect files, index them, store the pipeline.

    Steps:
      1. Greet the user.
      2. Ask for up to four text/PDF uploads, re-asking while the result is
         ``None``.
      3. Chunk every upload with ``process_pdf_file`` / ``process_text_file``.
      4. Build a vector database from the chunks.
      5. Create a ``RetrievalAugmentedQAPipeline`` and store it in the user
         session under the key ``"chain"`` for the message handler.
    """
    print("On Chat Start")
    await cl.Message(content="Welcome to the Chat with Files app!").send()
    print("After First message")

    files = None

    # Keep prompting until the upload request returns something other than
    # None.
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload a text file to begin!",
            accept=["text/plain", "application/pdf"],
            max_size_mb=10,
            max_files=4,
            timeout=180,
        ).send()

    # Name every uploaded file in the status messages. With one file the text
    # is the same as before; with several, all of them are listed rather than
    # only the last.
    file_names = "`, `".join(file.name for file in files)

    # Tell the user work has started before the chunking begins.
    await cl.Message(content=f"Processing `{file_names}`...").send()

    texts: List[str] = []
    for file in files:
        # A file has exactly one MIME type, so the branches are exclusive.
        if file.type == "application/pdf":
            texts += process_pdf_file(file)
        elif file.type == "text/plain":
            texts += process_text_file(file)

    print(f"Processing {len(texts)} text chunks")

    # Embed the chunks and build the vector store.
    vector_db_options = VectorDatabaseOptions.QDRANT
    embedding_model = EmbeddingModel(
        embeddings_model_name="text-embedding-3-small", dimensions=1000
    )
    vector_db = VectorDatabase(vector_db_options, embedding_model)
    vector_db = await vector_db.abuild_from_list(texts)

    chat_openai = ChatOpenAI()

    # Wire prompts, retriever and chat model into the QA pipeline.
    retrieval_augmented_qa_pipeline = RetrievalAugmentedQAPipeline(
        system_role_prompt,
        user_role_prompt,
        vector_db_retriever=vector_db,
        llm=chat_openai,
    )

    # Let the user know that the system is ready.
    await cl.Message(
        content=f"Processing `{file_names}` done. You can now ask questions!"
    ).send()

    cl.user_session.set("chain", retrieval_augmented_qa_pipeline)


@cl.on_message
async def main(message: cl.Message):
    """Answer an incoming user message with the session's QA pipeline.

    Sends an "on message" notice, fetches the pipeline stored under the
    session key ``"chain"``, runs it on the message text, and streams the
    ``'response'`` entry of the result into a new reply message token by
    token. The pipeline is written back to the session afterwards.

    Args:
        message: The incoming Chainlit message; only ``content`` is read.
    """
    await cl.Message(content="on message").send()

    pipeline: RetrievalAugmentedQAPipeline = cl.user_session.get("chain")

    # The empty reply is created before the pipeline runs and filled as
    # tokens arrive.
    reply = cl.Message(content="")
    pipeline_output = await pipeline.arun_pipeline(message.content)

    token_stream = pipeline_output.get('response')
    async for token in token_stream:
        await reply.stream_token(token)

    await reply.send()
    cl.user_session.set("chain", pipeline)


# When executed directly as a script, hand this file to run_chainlit
# (imported from chainlit.cli above).
if __name__ == "__main__":
    run_chainlit(__file__)