Commit 197a291
Parent(s): 050c8b5
init chat
Files changed:
- app.py +91 -62
- build_rag.py +24 -0
- requirements.txt +4 -1
- src/__init__.py +0 -0
- src/chat.py +94 -0
- src/prompts.py +31 -0
- src/rag.py +166 -0
- templates/template_html.j2 +92 -0
app.py
CHANGED
@@ -1,63 +1,92 @@
+import logging
+from pathlib import Path
+
 import gradio as gr
+import os
+from jinja2 import Environment, FileSystemLoader
+
+from src.chat import Chat
+from src.rag import FaissDB, AICompletion, define_query
+from src.prompts import *
+
+chat_model = AICompletion()
+chat = Chat(system_prompt=SYSTEM_PROMPT)
+faiss_index = FaissDB(emb_model=os.environ["OPENAI_EMBEDDINGS_MODEL"])
+faiss_index.load_index(os.environ["PATH_TO_INDEX"])
+
+proj_dir = Path(__file__).parent
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+env = Environment(loader=FileSystemLoader(proj_dir / 'templates'))
+template_html = env.get_template('template_html.j2')
+
+
+def add_text(text, history):
+    history = [] if history is None else history
+    # Use a list (not a tuple) so bot() can assign to history[-1][1] while streaming.
+    history = history + [[text, None]]
+    # Return one value per wired output: the locked textbox and the updated history.
+    return gr.Textbox(value="", interactive=False), history
+
+
+def turn_on_activity():
+    return gr.Textbox(interactive=True)
+
+
+def bot(history):
+    user_query = history[-1][0]
+
+    if not user_query:
+        # gr.Warning is not an exception and cannot be raised; gr.Error aborts with a visible message.
+        raise gr.Error("Please submit a non-empty string")
+
+    retrieve_query = define_query(user_query, chat_model)
+    documents = faiss_index.similarity_search(retrieve_query) if retrieve_query else ''
+    # USER_PROMPT is a format string, so it must be filled with .format(), not called.
+    user_prompt = USER_PROMPT.format(user_query, documents)
+
+    prompt_html = template_html.render(documents=documents, query=user_query)
+    stream = chat.stream(user_prompt)
+
+    history[-1][1] = ""
+    for character in stream:
+        history[-1][1] = character
+        yield history, prompt_html
+
+
+with gr.Blocks() as demo:
+    chatbot = gr.Chatbot(
+        [],
+        elem_id="chatbot",
+        avatar_images=('https://aui.atlassian.com/aui/8.8/docs/images/avatar-person.svg',
+                       'https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg'),
+        bubble_full_width=False,
+        show_copy_button=True,
+        show_share_button=True,
+    )
+
+    with gr.Row():
+        txt = gr.Textbox(
+            scale=4,
+            show_label=False,
+            placeholder="Enter text",
+            container=False,
+        )
+        txt_btn = gr.Button(value="Submit text", scale=1)
+
+    prompt_html = gr.HTML()
+
+    txt_msg = txt_btn.click(
+        add_text, [txt, chatbot], [txt, chatbot], queue=False
+    ).then(
+        bot, [chatbot], [chatbot, prompt_html]
+    )
+
+    txt_msg.then(turn_on_activity, None, [txt], queue=False)
+
+    txt_msg = txt.submit(
+        add_text, [txt, chatbot], [txt, chatbot], queue=False
+    ).then(
+        bot, [chatbot], [chatbot, prompt_html]
+    )
+
+    txt_msg.then(turn_on_activity, None, [txt], queue=False)
+
+demo.queue()
+demo.launch(debug=True)
build_rag.py
ADDED
@@ -0,0 +1,24 @@
+from src.rag import CustomAgglomerativeSplitter, FaissDB
+import argparse
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+
+
+def main(path_to_dataset: str, path_to_index: str):
+    splitter = CustomAgglomerativeSplitter(emb_model=os.getenv("OPENAI_EMBEDDINGS_MODEL"))
+    documents = splitter.read_and_split(path_to_dataset)
+
+    faiss_db = FaissDB(emb_model=os.getenv("OPENAI_EMBEDDINGS_MODEL"))
+    faiss_db.init_index(documents)
+    faiss_db.save_index(path_to_index)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--path_to_dataset", type=str, required=True)
+    parser.add_argument("--path_to_index", type=str, required=True)
+    args = parser.parse_args()
+
+    main(args.path_to_dataset, args.path_to_index)
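The script is driven by the two argparse flags above (e.g. `python build_rag.py --path_to_dataset docs --path_to_index faiss_index`). A minimal programmatic sketch of the same step, assuming OPENAI_API_KEY and OPENAI_EMBEDDINGS_MODEL are set in .env and that the paths below (illustrative, not from the commit) point at real locations:

```python
# Illustrative smoke test: build the FAISS index from a folder of PDFs.
# nltk's 'punkt' tokenizer data must be available for sentence splitting.
from build_rag import main  # importing build_rag also runs load_dotenv()

main(path_to_dataset="docs", path_to_index="faiss_index")
```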
requirements.txt
CHANGED
@@ -4,4 +4,7 @@ langchain-community==0.2.7
 langchain-openai==0.1.15
 nltk==3.8.1
 textract==1.6.5
-faiss-cpu==1.8.0.post1
+faiss-cpu==1.8.0.post1
+numpy==1.26.4
+python-dotenv==1.0.1
+langchain_groq==0.1.6
src/__init__.py
ADDED
File without changes
src/chat.py
ADDED
@@ -0,0 +1,94 @@
+import os
+
+import gradio as gr
+from langchain_community.llms import OpenAI
+from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_core.chat_history import BaseChatMessageHistory
+from langchain_community.chat_message_histories import ChatMessageHistory
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_openai.chat_models import ChatOpenAI
+
+
+GENERATE_ARGS = {
+    'temperature': max(float(os.getenv("TEMPERATURE", 0.3)), 1e-2),
+    'max_tokens': int(os.getenv("MAX_NEW_TOKENS", 512)),
+}
+
+GENERATE_KWARGS = {
+    'top_p': float(os.getenv("TOP_P", 0.6)),
+    'frequency_penalty': max(-2, min(float(os.getenv("FREQ_PENALTY", 0)), 2))
+}
+
+
+class Chat:
+
+    def __init__(self, system_prompt: str):
+
+        base = ChatOpenAI
+        model = os.getenv("OPENAI_MODEL")
+
+        self.assistant_model = base(
+            model=model,
+            streaming=True,
+            **GENERATE_ARGS,
+            model_kwargs=GENERATE_KWARGS
+        )
+
+        self.store = {}
+
+        self.prompt = ChatPromptTemplate.from_messages([
+            ("system", system_prompt),
+            MessagesPlaceholder(variable_name="history"),
+            ("human", "{input}")
+        ])
+        self.runnable = self.prompt | self.assistant_model
+
+        self.chat_model = RunnableWithMessageHistory(
+            self.runnable,
+            self.get_session_history,
+            input_messages_key="input",
+            history_messages_key="history",
+        )
+
+    def format_prompt(self, system_prompt: str, user_prompt: str):
+        messages = [
+            SystemMessage(
+                content=system_prompt
+            ),
+            HumanMessage(
+                content=user_prompt
+            ),
+        ]
+
+        return messages
+
+    def get_session_history(self, session_id: str | int) -> BaseChatMessageHistory:
+        if session_id not in self.store:
+            self.store[session_id] = ChatMessageHistory()
+        return self.store[session_id]
+
+    def stream(self, user_prompt: str, session_id: str | int = 0):
+        try:
+            stream_answer = self.chat_model.stream(
+                {"input": user_prompt},
+                config={"configurable": {"session_id": session_id}},
+            )
+            output = ""
+            for response in stream_answer:
+                if isinstance(self.assistant_model, OpenAI):
+                    if response.choices[0].delta.content:
+                        output += response.choices[0].delta.content
+                        yield output
+                else:
+                    output += response.content
+                    yield output
+
+        except Exception as e:
+            if "Too Many Requests" in str(e):
+                raise gr.Error(f"Too many requests: {str(e)}")
+            elif "Authorization header is invalid" in str(e):
+                raise gr.Error("Authentication error: API token was either not provided or incorrect")
+            else:
+                raise gr.Error(f"Unhandled Exception: {str(e)}")
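Chat.stream yields the cumulative answer so far on each chunk, which is why app.py assigns (rather than appends) each yielded value to the last history entry. A minimal usage sketch, assuming OPENAI_API_KEY and OPENAI_MODEL are set; the question string is illustrative:

```python
from src.chat import Chat
from src.prompts import SYSTEM_PROMPT

chat = Chat(system_prompt=SYSTEM_PROMPT)

answer = ""
# Each yielded value is the full answer so far; session_id defaults to 0,
# so repeated calls share one message history.
for partial in chat.stream("What is Faiss?"):
    answer = partial
print(answer)
```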
src/prompts.py
ADDED
@@ -0,0 +1,31 @@
+DEFINE_QUERY_PROMPT = """
+Prompt:
+You must identify whether the user's query is about a specific topic or is a follow-up question.
+If the user asks about a specific topic, you must extract this topic and return it.
+If the question is a follow-up query that does not mention any specific topic, you must return "Unrelated."
+
+Example 1 (Extract topic):
+User: Could you please explain what is Faiss. Thanks!
+Your response: What is Faiss?
+In this case your response must include only the topic name without any additional information or comments.
+
+Example 2 (Follow-up or Unrelated):
+User: Could you clarify the third point you mentioned earlier?
+Your response: Unrelated.
+In this case your response must be "Unrelated." without any additional information or comments.
+"""
+
+
+SYSTEM_PROMPT = """
+Your task is to answer the user's questions. You must provide clear and concise answers to the user's queries.
+If the user provides any documents in the 'Documents' section, your answer must be based on the information from these documents.
+If this section is empty, it means that the user is asking follow-up questions or questions that are not related to any specific topic.
+"""
+
+
+USER_PROMPT = """
+User query:
+{0}
+Documents:
+{1}
+"""
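USER_PROMPT is a plain format string with positional placeholders, which is why app.py fills it with str.format rather than calling it. A one-line sketch (query and document text below are illustrative):

```python
from src.prompts import USER_PROMPT

prompt = USER_PROMPT.format("What is Faiss?", "Faiss is a library for similarity search.")
```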
src/rag.py
ADDED
@@ -0,0 +1,166 @@
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain.docstore.document import Document
+import nltk
+import os
+import numpy as np
+import textract
+from collections import defaultdict
+from langchain_community.vectorstores import FAISS
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
+from src.prompts import DEFINE_QUERY_PROMPT
+from typing import Optional
+
+
+class AgglomerativeClustering:
+    def __init__(self, n_clusters: int = 16):
+        self.n_clusters = n_clusters
+        self.inf = 1e16
+        self.sample_size = 0
+        self._distances = None
+
+    def _init_clusters(self, X: np.array):
+        # Pairwise distance matrix; the diagonal is masked so a point never merges with itself.
+        distances = self.distance(XA=X, XB=X) + np.eye(self.sample_size) * self.inf
+        clusters = [[i] for i in range(self.sample_size)]
+        return distances, clusters
+
+    def _average(self, clusters, min_cluster, max_cluster):
+        # Size-weighted average of the two merged clusters' distance rows.
+        return (self._distances[min_cluster] * len(clusters[min_cluster]) + self._distances[max_cluster] * len(
+            clusters[max_cluster])) / (len(clusters[min_cluster]) + len(clusters[max_cluster]))
+
+    def _get_params(self, counter):
+        min_distance = np.argmin(self._distances)
+        param_1 = min_distance // counter
+        param_2 = min_distance % counter
+        return min(param_1, param_2), max(param_1, param_2)
+
+    def _merge_clusters(self, clusters, min_cluster, max_cluster):
+        self._distances[:, min_cluster] = self._distances[min_cluster, :]
+        self._distances = np.delete(self._distances, max_cluster, axis=0)
+        self._distances = np.delete(self._distances, max_cluster, axis=1)
+        self._distances[min_cluster][min_cluster] = np.inf
+        clusters[min_cluster].extend(clusters[max_cluster])
+        clusters.pop(max_cluster)
+
+    def _get_labels(self, clusters):
+        result = [0] * self.sample_size
+        for cluster in range(len(clusters)):
+            for point in clusters[cluster]:
+                result[point] = cluster
+        return result
+
+    def fit_predict(self, X: np.array) -> np.array:
+        self.sample_size = X.shape[0]
+        self._distances, clusters = self._init_clusters(X)
+
+        while len(clusters) > self.n_clusters:
+            min_cluster, max_cluster = self._get_params(len(clusters))
+            # Only merge clusters that are adjacent in the original sentence order,
+            # so every cluster stays a contiguous span of text.
+            if max(clusters[min_cluster]) + 1 == min(clusters[max_cluster]):
+                self._distances[min_cluster] = self._average(clusters=clusters, min_cluster=min_cluster,
+                                                             max_cluster=max_cluster)
+                self._merge_clusters(clusters=clusters, min_cluster=min_cluster, max_cluster=max_cluster)
+            else:
+                self._distances[min_cluster, max_cluster] = self.inf
+                self._distances[max_cluster, min_cluster] = self.inf
+
+        return np.array(self._get_labels(clusters))
+
+    @staticmethod
+    def distance(XA, XB):
+        return np.sqrt(((XA[:, np.newaxis] - XB[np.newaxis, :]) ** 2).sum(axis=2))
+
+
+class CustomAgglomerativeSplitter:
+    def __init__(self, emb_model: str):
+        self._embeddings_model = OpenAIEmbeddings(model=emb_model)
+
+    @staticmethod
+    def read_pdfs(path: str) -> tuple[list, list]:
+        files = os.listdir(path)
+        pages = []
+        file_names = []
+        for file in files:
+            page = textract.process(f"{path}/{file}", method='pdfminer').decode('utf-8').replace('\n', ' ')
+            text = nltk.sent_tokenize(page)
+            pages.append(text)
+            file_names.append(file)
+        return pages, file_names
+
+    def get_embeddings(self, pages: list) -> list[np.array]:
+        return [np.array(self._embeddings_model.embed_documents(texts)) for texts in pages]
+
+    @staticmethod
+    def split_list_by_indexes(data: list, indexes: list) -> list:
+        result_dict = defaultdict(list)
+        for element, index in zip(data, indexes):
+            result_dict[index].append(element)
+        return list(result_dict.values())
+
+    @staticmethod
+    def balance_pages(pages: list, max_tokens: int = 256) -> list:
+        balanced_pages = []
+        for page in pages:
+            str_page = ' '.join(page)
+            if len(str_page.split()) > max_tokens:
+                n_of_pages = int(np.ceil(len(str_page.split()) / max_tokens))
+                result = [' '.join(list(res)) for res in np.array_split(page, n_of_pages)]
+                balanced_pages.extend(result)
+            else:
+                balanced_pages.append(' '.join(page))
+        return balanced_pages
+
+    def cluster_pages(self, pages: list, embeddings: list, file_names: list, mean_n_of_sentences: int = 5) -> list:
+        documents = []
+        for page_number, page in enumerate(pages):
+            sentence_embeddings = embeddings[page_number]
+            n_clusters = len(page) // mean_n_of_sentences
+            model = AgglomerativeClustering(n_clusters=n_clusters)
+            labels = model.fit_predict(sentence_embeddings)
+            page_docs = self.split_list_by_indexes(page, labels)
+            page_docs = self.balance_pages(page_docs)
+            documents.extend([
+                Document(page_content=text, metadata={"file_name": file_names[page_number]}) for text in page_docs
+            ])
+        return documents
+
+    def read_and_split(self, path: str) -> list:
+        pages, file_names = self.read_pdfs(path)
+        embeddings = self.get_embeddings(pages)
+        return self.cluster_pages(pages, embeddings, file_names)
+
+
+class FaissDB:
+    def __init__(self, emb_model):
+        self._embeddings_model = OpenAIEmbeddings(model=emb_model)
+        self.index = None
+
+    def init_index(self, documents: list[Document]):
+        self.index = FAISS.from_documents(documents, self._embeddings_model)
+
+    def save_index(self, path: str):
+        self.index.save_local(path)
+
+    def load_index(self, path: str):
+        self.index = FAISS.load_local(path, self._embeddings_model, allow_dangerous_deserialization=True)
+
+    def similarity_search(self, query: str, k: int = 5):
+        if self.index is None:
+            raise ValueError("Index is not initialized")
+        return self.index.similarity_search(query, k)
+
+
+class AICompletion:
+    def __init__(self, chat_model: str = "gpt-4o", temperature: float = 0.0):
+        self.human = "{text}"
+        self.model = ChatOpenAI(model=chat_model, temperature=temperature)
+
+    def get_answer(self, system_prompt: str, text: str) -> str | None:
+        prompt = ChatPromptTemplate.from_messages([("system", system_prompt),
+                                                   ("human", self.human)])
+        chain = prompt | self.model
+        return chain.invoke({"text": text}).content
+
+
+def define_query(query: str, chat_model: AICompletion) -> Optional[str]:
+    result = chat_model.get_answer(DEFINE_QUERY_PROMPT, query)
+    return result if result != "Unrelated." else None
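A minimal sketch of the retrieval path these classes implement, assuming the FAISS index was already built by build_rag.py and the same env vars as app.py are set (the question string is illustrative):

```python
import os

from src.rag import AICompletion, FaissDB, define_query

db = FaissDB(emb_model=os.environ["OPENAI_EMBEDDINGS_MODEL"])
db.load_index(os.environ["PATH_TO_INDEX"])

model = AICompletion()
# define_query returns None when the model answers "Unrelated." (follow-up questions),
# in which case app.py skips retrieval entirely.
topic = define_query("Could you please explain what is Faiss?", model)
docs = db.similarity_search(topic, k=5) if topic else []
```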
templates/template_html.j2
ADDED
@@ -0,0 +1,92 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Information Page</title>
+    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap">
+    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&display=swap">
+    <style>
+        * {
+            font-family: "Source Sans Pro";
+        }
+        .instructions > * {
+            color: #111 !important;
+        }
+        details.doc-box * {
+            color: #111 !important;
+        }
+        .dark {
+            background: #111;
+            color: white;
+        }
+        .doc-box {
+            padding: 10px;
+            margin-top: 10px;
+            background-color: #baecc2;
+            border-radius: 6px;
+            color: #111 !important;
+            max-width: 700px;
+            box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
+        }
+        .doc-full {
+            margin: 10px 14px;
+            line-height: 1.6rem;
+        }
+        .instructions {
+            color: #111 !important;
+            background: #b7bdfd;
+            display: block;
+            border-radius: 6px;
+            padding: 6px 10px;
+            line-height: 1.6rem;
+            max-width: 700px;
+            box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
+        }
+        .query {
+            color: #111 !important;
+            background: #ffbcbc;
+            display: block;
+            border-radius: 6px;
+            padding: 6px 10px;
+            line-height: 2rem;
+            max-width: 700px;
+            box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
+        }
+    </style>
+</head>
+<body>
+<div class="prose svelte-1ybaih5" id="component-6">
+    <h2>Prompt</h2>
+    Below is the prompt that is given to the model. <hr>
+    {#<h2>Instructions</h2>#}
+    {#<span class="instructions">{{ instructions }}</span>#}
+    <h2>Context</h2>
+    {# documents are langchain Document objects, so the text lives in .page_content #}
+    {% for doc in documents %}
+    <details class="doc-box">
+        <summary>
+            <b>Doc {{ loop.index }}:</b> <span class="doc-short">{{ doc.page_content[:100] }}...</span>
+        </summary>
+        <div class="doc-full">{{ doc.page_content }}</div>
+    </details>
+    {% endfor %}
+    <h2>Query</h2>
+    <span class="query">{{ query }}</span>
+</div>
+<script>
+    document.addEventListener("DOMContentLoaded", function() {
+        const detailsElements = document.querySelectorAll('.doc-box');
+        detailsElements.forEach(detail => {
+            detail.addEventListener('toggle', function() {
+                const docShort = this.querySelector('.doc-short');
+                if (this.open) {
+                    docShort.style.display = 'none';
+                } else {
+                    docShort.style.display = 'inline';
+                }
+            });
+        });
+    });
+</script>
+</body>
+</html>
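A minimal sketch of rendering this template outside the app, mirroring how app.py calls template_html.render (the Document and query below are illustrative):

```python
from jinja2 import Environment, FileSystemLoader
from langchain.docstore.document import Document

env = Environment(loader=FileSystemLoader("templates"))
template = env.get_template("template_html.j2")
html = template.render(
    documents=[Document(page_content="Faiss is a library for efficient similarity search.")],
    query="What is Faiss?",
)
print(html[:200])  # rendered prompt preview shown in the gr.HTML panel
```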