DJOMGA TOUKO Peter Charles committed on
Commit
6183ded
·
1 Parent(s): f2d8dc5

Initial commit with the embeddings split into 7 parts. The content was crawled automatically from the Irembo support website (https://support.irembo.gov.rw/).

.streamlit/config.toml ADDED
@@ -0,0 +1,4 @@
+ [server]
+ runOnSave = true
+ headless = true
+ maxUploadSize = 2000
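
For reference, these Streamlit server settings mean: `runOnSave = true` reruns the app whenever a source file is saved, `headless = true` stops Streamlit from opening a browser window on start, and `maxUploadSize` caps file uploads at 2000 MB.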
app.py ADDED
@@ -0,0 +1,81 @@
+ import streamlit as st
+ from openai import OpenAI
+ from app_kb_handler import *
+
+
+ model = "gpt-3.5-turbo"
+
+ # ------------------------------------------------------------------------------------------------
+ # SIDEBAR
+ # ------------------------------------------------------------------------------------------------
+ st.sidebar.title('OpenAI Knowledge Base of Irembo')
+ st.sidebar.write('This chatbot is built with a RAG architecture and OpenAI as the LLM. The whole knowledge base was crawled automatically from the website https://support.irembo.gov.rw/')
+
+
+ def onchange_openai_key():
+     print(st.session_state.openai_key)
+
+ openai_key = st.sidebar.text_input('OpenAI key', on_change=onchange_openai_key, key='openai_key')
+
+ def submit_openai_key(model=model):
+     if not openai_key:
+         st.sidebar.write('Please provide the key first')
+         return
+     else:
+         client = OpenAI(api_key=openai_key)
+         # Verify the key with a short test completion
+         completion = client.chat.completions.create(
+             model=model,
+             messages=[
+                 {"role": "system", "content": "You are an assistant giving simple and short answers to a child's questions"},
+                 {"role": "user", "content": "count from 0 to 10"}
+             ]
+         )
+         st.sidebar.write(f'Simple count: {completion.choices[0].message.content}')
+
+ submit_key = st.sidebar.button(label='Submit', on_click=submit_openai_key)
+
+
+ # ------------------------------------------------------------------------------------------------
+ # CHAT TITLE
+ # ------------------------------------------------------------------------------------------------
+
+ st.title('OpenAI Knowledge Base of Irembo')
+ st.write('Ask any question about using the Irembo platform to apply for any service.')
+
+ def askQuestion(model=model, question=''):
+     if not openai_key:
+         print('Please provide the key first')
+         return 'LLM API is not defined. Please provide the key first'
+     else:
+         # Load the embeddings once and cache them in the session state
+         if "df" not in st.session_state:
+             st.session_state.df = get_embeddings()
+         return answer_question(api_key=openai_key, question=f'{question}', df=st.session_state.df, model=model)
+
+
+ # Initialize chat history
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ # Display chat messages from history on app rerun
+ for message in st.session_state.messages:
+     with st.chat_message(message["role"]):
+         st.markdown(message["content"])
+
+ # React to user input
+ if prompt := st.chat_input("What is up?"):
+     with st.status('Running', expanded=True) as status:
+         # Display user message in chat message container
+         st.chat_message("user").markdown(prompt)
+         # Add user message to chat history
+         st.session_state.messages.append({"role": "user", "content": prompt})
+
+         response = askQuestion(question=prompt)
+         # Display assistant response in chat message container
+         with st.chat_message("assistant"):
+             st.markdown(response)
+
+         # Add assistant response to chat history
+         st.session_state.messages.append({"role": "assistant", "content": response})
+         status.update(label='Response to the last question', state="complete", expanded=True)
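
One detail worth noting in app.py: `submit_openai_key` verifies the key with a throwaway completion but discards the client it creates, and each question later passes the raw key along instead. A minimal sketch of an alternative, not part of this commit (`get_client` is a hypothetical helper), that builds one client per session and caches it in `st.session_state`:

```python
import streamlit as st
from openai import OpenAI

def get_client():
    """Hypothetical helper: build the OpenAI client once and reuse it across reruns."""
    key = st.session_state.get('openai_key')
    if not key:
        return None  # key not provided yet
    if 'client' not in st.session_state:
        st.session_state.client = OpenAI(api_key=key)
    return st.session_state.client
```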
app_kb_handler.py ADDED
@@ -0,0 +1,116 @@
+ # Load the knowledge-base embeddings and answer questions from them
+ import pandas as pd
+ import numpy as np
+ from ast import literal_eval
+ from typing import List
+ from scipy import spatial
+ from openai import OpenAI
+
+
+ GPT_MODEL = "gpt-3.5-turbo"
+
+
+ def get_embeddings():
+     # The embeddings are split across 7 CSV files; load and concatenate them
+     df1 = pd.read_csv('processed/embeddings-1.csv')
+     df2 = pd.read_csv('processed/embeddings-2.csv')
+     df3 = pd.read_csv('processed/embeddings-3.csv')
+     df4 = pd.read_csv('processed/embeddings-4.csv')
+     df5 = pd.read_csv('processed/embeddings-5.csv')
+     df6 = pd.read_csv('processed/embeddings-6.csv')
+     df7 = pd.read_csv('processed/embeddings-7.csv')
+
+     df = pd.concat([df1, df2, df3, df4, df5, df6, df7], axis=0, ignore_index=True)
+     df.columns = ['text', 'n_tokens', 'embedding']
+     # The vectors are stored as strings; parse them back into numpy arrays
+     df['embedding'] = df['embedding'].apply(literal_eval).apply(np.array)
+     return df
+
+
+ def distances_from_embeddings(
+     query_embedding: List[float],
+     embeddings: List[List[float]],
+     distance_metric="cosine",
+ ) -> List[float]:
+     """Return the distances between a query embedding and a list of embeddings."""
+     distance_metrics = {
+         "cosine": spatial.distance.cosine,
+         "L1": spatial.distance.cityblock,
+         "L2": spatial.distance.euclidean,
+         "Linf": spatial.distance.chebyshev,
+     }
+     distances = [
+         distance_metrics[distance_metric](query_embedding, embedding)
+         for embedding in embeddings
+     ]
+     return distances
+
+
+ def create_context(question, df, client, max_len=1800, size="ada"):
+     """
+     Create a context for a question by finding the most similar texts in the dataframe
+     """
+     # Get the embedding for the question
+     q_embeddings = client.embeddings.create(input=[question], model="text-embedding-ada-002").data[0].embedding
+
+     # Get the distances from the stored embeddings
+     df['distances'] = distances_from_embeddings(q_embeddings, df['embedding'].values, distance_metric='cosine')
+
+     returns = []
+     cur_len = 0
+
+     # Sort by distance and add texts to the context until it gets too long
+     for i, row in df.sort_values('distances', ascending=True).iterrows():
+         # Add the length of the text to the current length
+         cur_len += row['n_tokens'] + 4
+
+         # If the context is too long, stop
+         if cur_len > max_len:
+             break
+
+         # Otherwise add the text to the context
+         returns.append(row['text'])
+
+     # Return the assembled context
+     return "\n\n###\n\n".join(returns)
+
+
+ def answer_question(
+     df,
+     model=GPT_MODEL,
+     question="Am I allowed to publish model outputs to Twitter, without a human review?",
+     max_len=1800,
+     size="ada",
+     debug=False,
+     max_tokens=150,
+     stop_sequence=None,
+     api_key="fake"
+ ):
+     """
+     Answer a question based on the most similar context from the dataframe texts
+     """
+     client = OpenAI(api_key=api_key)
+     context = create_context(
+         question,
+         df,
+         client=client,
+         max_len=max_len,
+         size=size
+     )
+     # If debug, print the retrieved context
+     if debug:
+         print("Context:\n" + context)
+         print("\n\n")
+
+     try:
+         response = client.chat.completions.create(
+             model=model,
+             messages=[
+                 {"role": "system", "content": f"Answer the question based on the context below, in Markdown format, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}"},
+                 {"role": "user", "content": f"Question: {question}"}
+             ]
+         )
+         return response.choices[0].message.content
+     except Exception as e:
+         print(e)
+         return f'Error processing the question: {e}'
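
Since app_kb_handler.py has no Streamlit dependency, it can be exercised on its own. A minimal sketch, assuming the processed/embeddings-*.csv files are present and a real key is available in the `OPENAI_API_KEY` environment variable (the question below is only an illustration):

```python
import os
from app_kb_handler import get_embeddings, answer_question

df = get_embeddings()  # loads and concatenates the 7 embedding splits
answer = answer_question(
    df=df,
    question="How do I apply for a service on Irembo?",  # hypothetical question
    api_key=os.environ["OPENAI_API_KEY"],
)
print(answer)
```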
openai-web-qa-1-crawl-website.ipynb ADDED
File without changes
openai-web-qa-2-process-files.ipynb ADDED
File without changes
openai-web-qa-3-tokenize-files.ipynb ADDED
File without changes
openai-web-qa-6-utilitaire.ipynb ADDED
@@ -0,0 +1,60 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "text 1278\n",
+ "n_tokens 1278\n",
+ "embedding 1278\n",
+ "dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "df1=pd.read_csv('processed/embeddings-1.csv')\n",
+ "df2=pd.read_csv('processed/embeddings-2.csv')\n",
+ "df3=pd.read_csv('processed/embeddings-3.csv')\n",
+ "df4=pd.read_csv('processed/embeddings-4.csv')\n",
+ "df5=pd.read_csv('processed/embeddings-5.csv')\n",
+ "df6=pd.read_csv('processed/embeddings-6.csv')\n",
+ "df7=pd.read_csv('processed/embeddings-7.csv')\n",
+ "\n",
+ "df = pd.concat([df1, df2, df3, df4, df5, df6, df7], axis=0, ignore_index=True)\n",
+ "df.columns = ['text', 'n_tokens', 'embedding']\n",
+ "# df['embedding'] = df['embedding'].apply(literal_eval).apply(np.array)\n",
+ "df.head()\n",
+ "print(df.count())\n",
+ " "
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "sample-projects",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
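
The splitting step that produced the seven CSVs the notebook sanity-checks is not included in this commit. A minimal sketch of one way it could be done, assuming a combined `processed/embeddings.csv` (hypothetical path):

```python
import numpy as np
import pandas as pd

df = pd.read_csv('processed/embeddings.csv')  # hypothetical combined file

# Split the row positions into 7 roughly equal chunks and write one CSV per chunk
for i, idx in enumerate(np.array_split(np.arange(len(df)), 7), start=1):
    df.iloc[idx].to_csv(f'processed/embeddings-{i}.csv', index=False)
```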
processed/embeddings-1.csv ADDED
processed/embeddings-2.csv ADDED
processed/embeddings-3.csv ADDED
processed/embeddings-4.csv ADDED
processed/embeddings-5.csv ADDED
processed/embeddings-6.csv ADDED
processed/embeddings-7.csv ADDED
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ streamlit
+ openai
+ watchdog
+ pandas
+ numpy
+ scipy