Spaces:

p-touko
/

openai-knowledgebase-irembo

Sleeping

App Files Files Community

DJOMGA TOUKO Peter Charles commited on Apr 28, 2024

Commit

6183ded

1 Parent(s): f2d8dc5

Initial commit with the embeddings split into 7 parts. The content is crawled automatically from the Irembo support website.

Browse files

Files changed (15) hide show

.streamlit/config.toml +4 -0
app.py +87 -0
app_kb_handler.py +122 -0
openai-web-qa-1-crawl-website.ipynb +0 -0
openai-web-qa-2-process-files.ipynb +0 -0
openai-web-qa-3-tokenize-files.ipynb +0 -0
openai-web-qa-6-utilitaire.ipynb +60 -0
processed/embeddings-1.csv +0 -0
processed/embeddings-2.csv +0 -0
processed/embeddings-3.csv +0 -0
processed/embeddings-4.csv +0 -0
processed/embeddings-5.csv +0 -0
processed/embeddings-6.csv +0 -0
processed/embeddings-7.csv +0 -0
requirements.txt +7 -0

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,4 @@

+[server]
+runOnSave = true
+headless = true
+maxUploadSize = 2000

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import streamlit as st
+from openai import OpenAI
+from app_kb_handler import *
+# Chat model used for the sidebar key check and as the default for askQuestion().
+model = "gpt-3.5-turbo"
+# ------------------------------------------------------------------------------------------------
+# SIDEBAR
+# ------------------------------------------------------------------------------------------------
+st.sidebar.title('OpenAI Knowledge Base of Irembo')
+st.sidebar.write('This chat bot is build with RAG architecture and OpenAI as LLM. All the Knowledge Base have been crawl automatically from the website https://support.irembo.gov.rw/ ')
+def onchange_openai_key():
+    """Callback registered as ``on_change`` of the OpenAI key field.
+
+    It only reports that the field changed. The key itself is a secret and
+    must never be written to stdout, where it would end up in server logs.
+    """
+    print('OpenAI key updated')
+# type='password' masks the key on screen; the value is still stored under
+# st.session_state.openai_key because of key='openai_key'.
+openai_key = st.sidebar.text_input('OpenAI key', type='password', on_change=onchange_openai_key, key='openai_key')
+def submit_openai_key(model=model):
+    """Check the key typed in the sidebar by running a tiny chat completion.
+
+    Registered as the ``on_click`` callback of the Submit button. The outcome
+    (a hint, the model's reply, or the error) is written to the sidebar.
+
+    Args:
+        model: Name of the chat model used for the check.
+    """
+    # Read the key from session state instead of the module-level
+    # ``openai_key``: a callback runs before the script body is executed
+    # again, so the module-level variable can still hold the previous value.
+    key = st.session_state.get('openai_key')
+    if not key:
+        st.sidebar.write('Please provide the key before')
+        return
+    try:
+        client = OpenAI(api_key=key)
+        completion = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": "You are an assistant giving simple and short answer for question of child"},
+                {"role": "user", "content": "count from 0 to 10"}
+            ]
+        )
+    except Exception as exc:
+        # UI boundary: an invalid key or a network failure is reported to the
+        # user instead of escaping from the callback as a traceback.
+        st.sidebar.error(f'Could not validate the key: {exc}')
+        return
+    st.sidebar.write(f'Simple count : {completion.choices[0].message.content}')
+submit_key = st.sidebar.button(label='Submit', on_click=submit_openai_key)
+# ------------------------------------------------------------------------------------------------
+# CHAT TITLE
+# ------------------------------------------------------------------------------------------------
+st.title('OpenAI Knowledge Base of Irembo')
+st.write('Ask any question regarding using Irembo platform to apply for any services.')
+def askQuestion(model=model, question=''):
+    """Answer ``question`` from the knowledge base, or explain that the key is missing.
+
+    Args:
+        model: Chat model name forwarded to ``answer_question``.
+        question: Text typed by the user.
+
+    Returns:
+        The string returned by ``answer_question``, or a fixed notice when no
+        OpenAI key has been entered in the sidebar.
+    """
+    key_missing = openai_key is None or openai_key == ''
+    if key_missing:
+        print('Please provide the key before')
+        return 'LLM API is not defined. Please provide the key before'
+    # Load the embeddings once and keep them in session state for later questions.
+    if "df" not in st.session_state:
+        st.session_state.df = get_embeddings()
+    knowledge_base = st.session_state.df
+    return answer_question(api_key=openai_key, question=f'{question}', df=knowledge_base, model=model)
+# Chat section: keeps the conversation in st.session_state.messages as a list
+# of {"role", "content"} dicts, replays it, then handles one new prompt.
+# Initialize chat history
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+# Display chat messages from history on app rerun
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+# React to user input
+# (the walrus binds the submitted text; the branch is skipped when it is falsy)
+if prompt := st.chat_input("What is up?"):
+    # Everything for this exchange is rendered inside one status container.
+    with st.status('Running', expanded=True) as status:
+        # Display user message in chat message container
+        st.chat_message("user").markdown(prompt)
+        # Add user message to chat history
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        # askQuestion returns a string in every case (answer or notice)
+        response = askQuestion(question=prompt)
+        # Display assistant response in chat message container
+        with st.chat_message("assistant"):
+            st.markdown(response)
+        # Add assistant response to chat history
+        st.session_state.messages.append({"role": "assistant", "content": response})
+        # Mark the status container as finished and keep it expanded
+        status.update(label='Reponse of last question', state="complete", expanded=True)

app_kb_handler.py ADDED Viewed

	@@ -0,0 +1,122 @@

+# Preview the embeddings created
+import pandas as pd
+import numpy as np
+from ast import literal_eval
+# Define function to calculate distances from embeddings and answer question using embeddings search
+from typing import List, Optional
+from scipy import spatial
+# Generate embeddings using OpenAI API
+from openai import OpenAI
+import os
+# Default chat model for answer_question(). It must be a chat-capable model
+# because the answer is requested through client.chat.completions.create;
+# the former default "text-davinci-003" is a legacy completions model.
+GPT_MODEL = "gpt-3.5-turbo"
+def get_embeddings(directory='processed', parts=7):
+    """Load the embedding CSV parts and return them as one DataFrame.
+
+    Reads ``<directory>/embeddings-1.csv`` .. ``<directory>/embeddings-<parts>.csv``,
+    concatenates them in that order with a fresh index, names the three
+    columns ``text``, ``n_tokens`` and ``embedding``, and parses each stored
+    embedding literal into a NumPy array.
+
+    Args:
+        directory: Folder containing the CSV parts.
+        parts: Number of consecutive parts to read, starting at 1.
+
+    Returns:
+        A DataFrame with columns ``text``, ``n_tokens`` and ``embedding``.
+
+    Raises:
+        FileNotFoundError: If one of the parts is missing.
+        ValueError: If ``parts`` is less than 1 (nothing to concatenate) or a
+            file does not have exactly three columns.
+    """
+    frames = [
+        pd.read_csv(f'{directory}/embeddings-{part}.csv')
+        for part in range(1, parts + 1)
+    ]
+    df = pd.concat(frames, axis=0, ignore_index=True)
+    df.columns = ['text', 'n_tokens', 'embedding']
+    # The CSV stores each embedding as its Python literal; literal_eval parses
+    # it safely and np.array makes it usable for distance computations.
+    df['embedding'] = df['embedding'].apply(literal_eval).apply(np.array)
+    return df
+def distances_from_embeddings(
+    query_embedding: List[float],
+    embeddings: List[List[float]],
+    distance_metric="cosine",
+) -> List[List]:
+    """Return the distances between a query embedding and a list of embeddings."""
+    distance_metrics = {
+        "cosine": spatial.distance.cosine,
+        "L1": spatial.distance.cityblock,
+        "L2": spatial.distance.euclidean,
+        "Linf": spatial.distance.chebyshev,
+    }
+    distances = [
+        distance_metrics[distance_metric](query_embedding, embedding)
+        for embedding in embeddings
+    ]
+    return distances
+def create_context(
+    question, df, client, max_len=1800, size="ada",
+):
+    """
+    Build the context for a question from the closest texts in the dataframe.
+
+    The question is embedded with "text-embedding-ada-002", its cosine distance
+    to every row is stored in df['distances'], and texts are collected from
+    nearest to farthest until the token budget max_len would be exceeded.
+    The size argument is not used by this function.
+    """
+    # Embed the question
+    embedding_response = client.embeddings.create(input = [question], model="text-embedding-ada-002")
+    query_vector = embedding_response.data[0].embedding
+    # Distance of every stored embedding to the question (written onto df)
+    df['distances'] = distances_from_embeddings(query_vector, df['embedding'].values, distance_metric='cosine')
+    ranked = df.sort_values('distances', ascending=True)
+    selected = []
+    used_tokens = 0
+    # Walk the rows from closest to farthest
+    for text, n_tokens in zip(ranked['text'], ranked['n_tokens']):
+        # Each text costs its token count plus 4
+        used_tokens += n_tokens + 4
+        # Stop at the first text that would overflow the budget
+        if used_tokens > max_len:
+            break
+        selected.append(text)
+    # Texts are separated by a "###" marker line
+    return "\n\n###\n\n".join(selected)
+def answer_question(
+    df,
+    model=GPT_MODEL,
+    question="Am I allowed to publish model outputs to Twitter, without a human review?",
+    max_len=1800,
+    size="ada",
+    debug=False,
+    max_tokens=150,
+    stop_sequence=None,
+    api_key="fake"
+):
+    """
+    Answer a question based on the most similar context from the dataframe texts
+
+    Args:
+        df: DataFrame of texts and embeddings, passed to create_context.
+        model: Chat model used to produce the answer.
+        question: The user's question.
+        max_len: Token budget for the context, passed to create_context.
+        size: Passed through to create_context.
+        debug: When true, print the context that was built.
+        max_tokens: Accepted for compatibility; not sent to the API.
+        stop_sequence: Accepted for compatibility; not sent to the API.
+        api_key: OpenAI API key used to create the client.
+
+    Returns:
+        The model's answer, or a string starting with "Error processing" when
+        building the context or calling the model fails.
+    """
+    client=OpenAI(api_key=api_key)
+    try:
+        # The embedding lookup is inside the try so that a bad key or a
+        # network failure yields the error string as well, not an exception.
+        context = create_context(
+            question,
+            df,
+            client=client,
+            max_len=max_len,
+            size=size
+        )
+        # If debug, print the context sent to the model
+        if debug:
+            print("Context:\n" + context)
+            print("\n\n")
+        response = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": f"Answer the question based on the context below, in Markdown format, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}"},
+                {"role": "user", "content": f"Question: {question}"}
+            ]
+        )
+        print(response)
+        return response.choices[0].message.content
+    except Exception as e:
+        print(e)
+        # Not every exception has a ``message`` attribute; fall back to str(e)
+        # so the handler itself cannot fail with AttributeError.
+        detail = getattr(e, 'message', None) or str(e)
+        return f'Error processing {e.__cause__}: {detail}'

openai-web-qa-1-crawl-website.ipynb ADDED Viewed

File without changes

openai-web-qa-2-process-files.ipynb ADDED Viewed

File without changes

openai-web-qa-3-tokenize-files.ipynb ADDED Viewed

File without changes

openai-web-qa-6-utilitaire.ipynb ADDED Viewed

	@@ -0,0 +1,60 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "text         1278\n",
+      "n_tokens     1278\n",
+      "embedding    1278\n",
+      "dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "df1=pd.read_csv('processed/embeddings-1.csv')\n",
+    "df2=pd.read_csv('processed/embeddings-2.csv')\n",
+    "df3=pd.read_csv('processed/embeddings-3.csv')\n",
+    "df4=pd.read_csv('processed/embeddings-4.csv')\n",
+    "df5=pd.read_csv('processed/embeddings-5.csv')\n",
+    "df6=pd.read_csv('processed/embeddings-6.csv')\n",
+    "df7=pd.read_csv('processed/embeddings-7.csv')\n",
+    "\n",
+    "df = pd.concat([df1, df2, df3, df4, df5, df6, df7], axis=0, ignore_index=True)\n",
+    "df.columns = ['text', 'n_tokens', 'embedding']\n",
+    "# df['embedding'] = df['embedding'].apply(literal_eval).apply(np.array)\n",
+    "df.head()\n",
+    "print(df.count())\n",
+    "    "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "sample-projects",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

processed/embeddings-1.csv ADDED Viewed