naotakigawa committed on
Commit
8b16906
·
1 Parent(s): 1301e19

Upload 6 files

Browse files
Files changed (6) hide show
  1. app.py +156 -175
  2. common.py +161 -51
  3. log.py +5 -0
  4. pages/Chatbot.py +14 -26
  5. pages/ChatbotWebRead.py +20 -21
  6. pages/ImportAllFile.py +76 -0
app.py CHANGED
@@ -1,175 +1,156 @@
1
- import streamlit as st
2
- import os
3
- import pickle
4
- import faiss
5
- import logging
6
-
7
- from multiprocessing import Lock
8
- from multiprocessing.managers import BaseManager
9
- from llama_index.callbacks import CallbackManager, LlamaDebugHandler
10
- from llama_index import VectorStoreIndex, Document,Prompt, SimpleDirectoryReader, ServiceContext, StorageContext, load_index_from_storage
11
- from llama_index.chat_engine import CondenseQuestionChatEngine;
12
- from llama_index.node_parser import SimpleNodeParser
13
- from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
14
- from llama_index.constants import DEFAULT_CHUNK_OVERLAP
15
- from llama_index.response_synthesizers import get_response_synthesizer
16
- from llama_index.vector_stores.faiss import FaissVectorStore
17
- from llama_index.graph_stores import SimpleGraphStore
18
- from llama_index.storage.docstore import SimpleDocumentStore
19
- from llama_index.storage.index_store import SimpleIndexStore
20
- import tiktoken
21
- from streamlit import runtime
22
- from streamlit.runtime.scriptrunner import get_script_run_ctx
23
- import ipaddress
24
- import streamlit_authenticator as stauth
25
- import yaml
26
- from requests_oauthlib import OAuth2Session
27
- from time import time
28
- from dotenv import load_dotenv
29
- from streamlit import net_util
30
-
31
- load_dotenv()
32
-
33
- # 接続元制御
34
- ALLOW_IP_ADDRESS = os.environ["ALLOW_IP_ADDRESS"]
35
-
36
- index_name = "./data/storage"
37
- pkl_name = "./data/stored_documents.pkl"
38
-
39
- custom_prompt = Prompt("""\
40
- 以下はこれまでの会話履歴と、ドキュメントを検索して回答する必要がある、ユーザーからの会話文です。
41
- 会話と新しい会話文に基づいて、検索クエリを作成します。回答は日本語で行います。
42
- 新しい会話文が挨拶の場合、挨拶を返してください。
43
- 新しい会話文が質問の場合、検索した結果の回答を返してください。
44
- 答えがわからない場合は正直にわからないと回答してください。
45
- 会話履歴:
46
- {chat_history}
47
- 新しい会話文:
48
- {question}
49
- Search query:
50
- """)
51
-
52
- chat_history = []
53
-
54
- logging.basicConfig(level=logging.INFO)
55
- logger = logging.getLogger("__name__")
56
- logger.debug("調査用ログ")
57
-
58
- def initialize_index():
59
- logger.info("initialize_index start")
60
- text_splitter = TokenTextSplitter(separator="。", chunk_size=1500
61
- , chunk_overlap=DEFAULT_CHUNK_OVERLAP
62
- , tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode)
63
- node_parser = SimpleNodeParser(text_splitter=text_splitter)
64
- d = 1536
65
- k=2
66
- faiss_index = faiss.IndexFlatL2(d)
67
- # デバッグ用
68
- llama_debug_handler = LlamaDebugHandler()
69
- callback_manager = CallbackManager([llama_debug_handler])
70
- service_context = ServiceContext.from_defaults(node_parser=node_parser,callback_manager=callback_manager)
71
- lock = Lock()
72
- with lock:
73
- if os.path.exists(index_name):
74
- storage_context = StorageContext.from_defaults(
75
- docstore=SimpleDocumentStore.from_persist_dir(persist_dir=index_name),
76
- graph_store=SimpleGraphStore.from_persist_dir(persist_dir=index_name),
77
- vector_store=FaissVectorStore.from_persist_dir(persist_dir=index_name),
78
- index_store=SimpleIndexStore.from_persist_dir(persist_dir=index_name),
79
- )
80
- st.session_state.index = load_index_from_storage(storage_context=storage_context,service_context=service_context)
81
- response_synthesizer = get_response_synthesizer(response_mode='refine')
82
- st.session_state.query_engine = st.session_state.index.as_query_engine(response_synthesizer=response_synthesizer,service_context=service_context)
83
- st.session_state.chat_engine = CondenseQuestionChatEngine.from_defaults(
84
- query_engine=st.session_state.query_engine,
85
- condense_question_prompt=custom_prompt,
86
- chat_history=chat_history,
87
- verbose=True
88
- )
89
- else:
90
- documents = SimpleDirectoryReader("./documents").load_data()
91
- vector_store = FaissVectorStore(faiss_index=faiss_index)
92
- storage_context = StorageContext.from_defaults(vector_store=vector_store)
93
- st.session_state.index = VectorStoreIndex.from_documents(documents, storage_context=storage_context,service_context=service_context)
94
- st.session_state.index.storage_context.persist(persist_dir=index_name)
95
- response_synthesizer = get_response_synthesizer(response_mode='refine')
96
- st.session_state.query_engine = st.session_state.index.as_query_engine(response_synthesizer=response_synthesizer,service_context=service_context)
97
- st.session_state.chat_engine = CondenseQuestionChatEngine.from_defaults(
98
- query_engine=st.session_state.query_engine,
99
- condense_question_prompt=custom_prompt,
100
- chat_history=chat_history,
101
- verbose=True
102
- )
103
- if os.path.exists(pkl_name):
104
- with open(pkl_name, "rb") as f:
105
- st.session_state.stored_docs = pickle.load(f)
106
- else:
107
- st.session_state.stored_docs=list()
108
-
109
- # 接続元IP取得
110
- def get_remote_ip():
111
- ctx = get_script_run_ctx()
112
- session_info = runtime.get_instance().get_client(ctx.session_id)
113
- return session_info.request.remote_ip
114
-
115
- # 接続元IP許可判定
116
- def is_allow_ip_address():
117
- remote_ip = get_remote_ip()
118
- logger.info("remote_ip")
119
- logger.info(remote_ip)
120
- # localhost
121
- if remote_ip == "::1":
122
- return True
123
-
124
- # プライベートIP
125
- ipaddr = ipaddress.IPv4Address(remote_ip)
126
- logger.info("ipaddr")
127
- logger.info(ipaddr)
128
- if ipaddr.is_private:
129
- return True
130
-
131
- # その他(許可リスト判定)
132
- return remote_ip in ALLOW_IP_ADDRESS
133
-
134
- # メイン
135
- def app():
136
- # 初期化
137
- st.session_state["token"] = None
138
- st.session_state["token_expires"] = time()
139
- st.session_state["authorization_state"] = None
140
-
141
- # 接続元IP許可判定
142
- if not is_allow_ip_address():
143
- st.title("HTTP 403 Forbidden")
144
- return
145
-
146
- # 接続元OK
147
- st.title("Azure AD Login with Streamlit")
148
-
149
- with open('config.yaml') as file:
150
- config = yaml.load(file, Loader=yaml.SafeLoader)
151
-
152
- authenticator = stauth.Authenticate(
153
- config['credentials'],
154
- config['cookie']['name'],
155
- config['cookie']['key'],
156
- config['cookie']['expiry_days'],
157
- config['preauthorized'],
158
- )
159
-
160
- name, authentication_status, username = authenticator.login('Login', 'main')
161
-
162
-
163
- if 'authentication_status' not in st.session_state:
164
- st.session_state['authentication_status'] = None
165
-
166
- if st.session_state["authentication_status"]:
167
- authenticator.logout('Logout', 'main')
168
- st.write(f'ログインに成功しました')
169
- initialize_index()
170
- # ここにログイン後の処理を書く。
171
- elif st.session_state["authentication_status"] is False:
172
- st.error('ユーザ名またはパスワードが間違っています')
173
- elif st.session_state["authentication_status"] is None:
174
- st.warning('ユーザ名やパスワードを入力してください')
175
-
 
1
+ import streamlit as st
2
+ import os
3
+ import pickle
4
+ import faiss
5
+ import common
6
+ import glob
7
+ from multiprocessing import Lock
8
+ from multiprocessing.managers import BaseManager
9
+ from pathlib import Path
10
+ from llama_index.callbacks import CallbackManager, LlamaDebugHandler
11
+ from llama_index import Document,VectorStoreIndex, SimpleDirectoryReader, ServiceContext, StorageContext, load_index_from_storage
12
+ from llama_index.node_parser import SimpleNodeParser
13
+ from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
14
+ from llama_index.constants import DEFAULT_CHUNK_OVERLAP
15
+ from llama_index.vector_stores.faiss import FaissVectorStore
16
+ from llama_index.graph_stores import SimpleGraphStore
17
+ from llama_index.storage.docstore import SimpleDocumentStore
18
+ from llama_index.storage.index_store import SimpleIndexStore
19
+ from msal_streamlit_authentication import msal_authentication
20
+ from llama_hub.file.cjk_pdf.base import CJKPDFReader
21
+ from llama_hub.file.pptx.base import PptxReader
22
+ from llama_hub.file.pandas_excel.base import PandasExcelReader
23
+ from llama_hub.file.docx.base import DocxReader
24
+ from llama_index.llms import OpenAI
25
+ import tiktoken
26
+ from llama_index.callbacks import CallbackManager, LlamaDebugHandler
27
+ from dotenv import load_dotenv
28
+
29
+ load_dotenv()
30
+
31
+ # 接続元制御
32
+ ALLOW_IP_ADDRESS = os.environ["ALLOW_IP_ADDRESS"]
33
+
34
+ # Azure AD app registration details
35
+ CLIENT_ID = os.environ["CLIENT_ID"]
36
+ CLIENT_SECRET = os.environ["CLIENT_SECRET"]
37
+ TENANT_ID = os.environ["TENANT_ID"]
38
+
39
+ # Azure API
40
+ AUTHORITY = f"https://login.microsoftonline.com/{TENANT_ID}"
41
+ REDIRECT_URI = os.environ["REDIRECT_URI"]
42
+ SCOPES = ["openid", "profile", "User.Read"]
43
+
44
+ INDEX_NAME = os.environ["INDEX_NAME"]
45
+ PKL_NAME = os.environ["PKL_NAME"]
46
+ st.session_state.llama_debug_handler = LlamaDebugHandler()
47
+ from log import logger
48
+
49
def initialize_index():
    """Load the persisted FAISS-backed index if one exists, otherwise build it
    from the files under ./documents, then (re)create the shared chat engine.

    Side effects: populates st.session_state.index and
    st.session_state.stored_docs, persists the index/pickle to
    INDEX_NAME / PKL_NAME, and calls common.setChatEngine().
    """
    logger.info("initialize_index start")
    llm = OpenAI(model='gpt-3.5-turbo', temperature=0.8, max_tokens=256)
    # Split on the Japanese sentence terminator; token counting follows the
    # gpt-3.5-turbo encoding so chunk_size is measured in model tokens.
    text_splitter = TokenTextSplitter(
        separator="。",
        chunk_size=1500,
        chunk_overlap=DEFAULT_CHUNK_OVERLAP,
        tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode,
    )
    node_parser = SimpleNodeParser(text_splitter=text_splitter)
    d = 1536  # embedding dimension — assumes OpenAI ada-002 embeddings; TODO confirm
    faiss_index = faiss.IndexFlatL2(d)
    # Debug callback shared with the pages via session state.
    callback_manager = CallbackManager([st.session_state.llama_debug_handler])
    service_context = ServiceContext.from_defaults(
        llm=llm, node_parser=node_parser, callback_manager=callback_manager
    )
    lock = Lock()
    with lock:
        if os.path.exists(INDEX_NAME):
            logger.info("start import index")
            storage_context = StorageContext.from_defaults(
                docstore=SimpleDocumentStore.from_persist_dir(persist_dir=INDEX_NAME),
                graph_store=SimpleGraphStore.from_persist_dir(persist_dir=INDEX_NAME),
                vector_store=FaissVectorStore.from_persist_dir(persist_dir=INDEX_NAME),
                index_store=SimpleIndexStore.from_persist_dir(persist_dir=INDEX_NAME),
            )
            st.session_state.index = load_index_from_storage(
                storage_context=storage_context, service_context=service_context
            )
            # Bug fix: the pickle can be missing even when the index directory
            # exists; fall back to an empty list instead of crashing on open().
            if os.path.exists(PKL_NAME):
                with open(PKL_NAME, "rb") as f:
                    st.session_state.stored_docs = pickle.load(f)
            else:
                st.session_state.stored_docs = list()
            common.setChatEngine()
        else:
            logger.info("start create index")
            documents = list()
            files = glob.glob("./documents/*")
            vector_store = FaissVectorStore(faiss_index=faiss_index)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            st.session_state.stored_docs = list()
            for file in files:
                _, extension = os.path.splitext(file)
                logger.info(file)
                if extension == ".txt" or extension == ".md":
                    # Plain-text formats go through the generic directory reader.
                    document = SimpleDirectoryReader(
                        input_files=[file], filename_as_id=True
                    ).load_data()[0]
                else:
                    if extension == ".pdf":
                        loader = CJKPDFReader()
                    elif extension == ".pptx":
                        loader = PptxReader()
                    elif extension == ".xlsx":
                        loader = PandasExcelReader(pandas_config={"header": 0})
                    elif extension == ".docx":
                        loader = DocxReader()
                    else:
                        # Unsupported extension: log and skip this file.
                        logger.error("Can`t read file:" + file)
                        continue
                    document = loader.load_data(file=Path(file))[0]
                # Record the source filename so answers can cite it.
                document.metadata = {'filename': os.path.basename(file)}
                documents.append(document)
                st.session_state.stored_docs.append(os.path.basename(file))
            st.session_state.index = VectorStoreIndex.from_documents(
                documents=documents,
                storage_context=storage_context,
                service_context=service_context,
            )
            st.session_state.index.storage_context.persist(persist_dir=INDEX_NAME)
            with open(PKL_NAME, "wb") as f:
                pickle.dump(st.session_state.stored_docs, f)
            common.setChatEngine()
112
+
113
def logout():
    """Drop the MSAL token from session state so the user counts as logged out."""
    st.session_state["login_token"] = None
115
+
116
# Main entry: authenticate against Azure AD via MSAL; on success, build the
# index and show usage notes.
_auth_config = {
    "clientId": CLIENT_ID,
    "authority": AUTHORITY,
    "redirectUri": REDIRECT_URI,
    "postLogoutRedirectUri": "",
}
_cache_config = {
    "cacheLocation": "sessionStorage",
    "storeAuthStateInCookie": False,
}
st.session_state["login_token"] = msal_authentication(
    auth=_auth_config,
    cache=_cache_config,
    login_request={"scopes": SCOPES},
    logout_request={},
    login_button_text="Login",
    logout_button_text="Logout",
    class_name="css_button_class_selector",
    html_id="html_id_for_button",
)

if st.session_state.login_token:
    initialize_index()
    st.write("ようこそ", st.session_state.login_token["account"]["name"])
    st.write("サイドメニューからファイルインポート又はChatbotへの質問を開始してください。")
    st.markdown("""
## 使い方
- **Chatbot**
初期からインポートされているファイルとImportXXFileでインポートしたファイルの内容に関する質問に対して、GenerativeAIが回答します。
※返答が正常に帰ってこない場合があります。参照ファイルを記載しているので、判断の目安にしてください。

- **ChatbotWebRead**
入力したURLのサイトの情報に関して、GenerativeAIが回答します。
ImportAllFileの内容は登録されていません。

- **ImportAllFile**
テキストファイル,mdファイル,Excel,PDF,PowerPoint,Wordをインポートできます。
""")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
common.py CHANGED
@@ -1,51 +1,161 @@
1
- # common.py
2
- import extra_streamlit_components as stx
3
- import streamlit as st
4
- import logging
5
- import os
6
-
7
- from time import time
8
- from requests_oauthlib import OAuth2Session
9
- from streamlit import runtime
10
- from streamlit.runtime.scriptrunner import get_script_run_ctx
11
- import ipaddress
12
-
13
- logging.basicConfig(level=logging.INFO)
14
- logger = logging.getLogger("__name__")
15
- logger.debug("調査用ログ")
16
-
17
- # 接続元制御
18
- ALLOW_IP_ADDRESS = os.environ["ALLOW_IP_ADDRESS"]
19
-
20
- # 接続元IP取得
21
- def get_remote_ip():
22
- ctx = get_script_run_ctx()
23
- session_info = runtime.get_instance().get_client(ctx.session_id)
24
- return session_info.request.remote_ip
25
-
26
- # 接続元IP許可判定
27
- def is_allow_ip_address():
28
- remote_ip = get_remote_ip()
29
- logger.info("remote_ip")
30
- logger.info(remote_ip)
31
- # localhost
32
- if remote_ip == "::1":
33
- return True
34
-
35
- # プライベートIP
36
- ipaddr = ipaddress.IPv4Address(remote_ip)
37
- logger.info("ipaddr")
38
- logger.info(ipaddr)
39
- if ipaddr.is_private:
40
- return True
41
-
42
- # その他(許可リスト判定)
43
- return remote_ip in ALLOW_IP_ADDRESS
44
-
45
- #ログインの確認
46
- def check_login():
47
- if 'authentication_status' not in st.session_state:
48
- st.session_state['authentication_status'] = None
49
- if st.session_state["authentication_status"] is None or False:
50
- st.warning("**ログインしてください**")
51
- st.stop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import pickle
4
+ import ipaddress
5
+ import tiktoken
6
+
7
+ from pathlib import Path
8
+ from streamlit import runtime
9
+ from streamlit.runtime.scriptrunner import get_script_run_ctx
10
+ from streamlit.web.server.websocket_headers import _get_websocket_headers
11
+ from llama_index import SimpleDirectoryReader
12
+ from llama_index import Prompt
13
+ from llama_index.chat_engine import CondenseQuestionChatEngine;
14
+ from llama_index.response_synthesizers import get_response_synthesizer
15
+ from llama_index import ServiceContext, SimpleDirectoryReader
16
+ from llama_index.node_parser import SimpleNodeParser
17
+ from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
18
+ from llama_index.constants import DEFAULT_CHUNK_OVERLAP
19
+ from llama_index.response_synthesizers import get_response_synthesizer
20
+ from llama_index.callbacks import CallbackManager
21
+ from llama_index.llms import OpenAI
22
+ from log import logger
23
+
24
+ # 接続元制御
25
+ ALLOW_IP_ADDRESS = os.environ["ALLOW_IP_ADDRESS"]
26
+
27
+ # Azure AD app registration details
28
+ CLIENT_ID = os.environ["CLIENT_ID"]
29
+ CLIENT_SECRET = os.environ["CLIENT_SECRET"]
30
+ TENANT_ID = os.environ["TENANT_ID"]
31
+
32
+ # Azure API
33
+ REDIRECT_URI = os.environ["REDIRECT_URI"]
34
+ AUTHORITY = f"https://login.microsoftonline.com/{TENANT_ID}"
35
+ SCOPES = ["openid", "profile", "User.Read"]
36
+
37
+ # 接続元IP取得
38
def get_remote_ip():
    """Return ``(direct_peer_ip, x_forwarded_for_or_None)`` for this session.

    The direct peer IP comes from the Streamlit websocket session; the
    X-Forwarded-For header (set by a reverse proxy, if any) is returned
    separately so the caller can decide which to trust.
    """
    ctx = get_script_run_ctx()
    session_info = runtime.get_instance().get_client(ctx.session_id)
    headers = _get_websocket_headers()
    # Bug fix: _get_websocket_headers() may return None (no websocket
    # context); guard before calling .get().
    forwarded_for = headers.get("X-Forwarded-For") if headers else None
    return session_info.request.remote_ip, forwarded_for
43
+
44
+ # 接続元IP許可判定
45
def is_allow_ip_address():
    """Return True when the connecting client may use the app.

    Allowed: localhost, private-range addresses, and addresses listed in the
    ALLOW_IP_ADDRESS environment variable (comma-separated).
    """
    remote_ip, x_forwarded_for = get_remote_ip()
    logger.info("remote_ip:" + remote_ip)
    if x_forwarded_for is not None:
        # Behind a reverse proxy: the first X-Forwarded-For entry is the
        # original client.
        remote_ip = x_forwarded_for.split(",")[0].strip()
    # localhost (IPv6 loopback as reported by Streamlit)
    if remote_ip == "::1":
        return True

    # Bug fix: ip_address() handles both IPv4 and IPv6 (IPv4Address raised on
    # IPv6 peers); an unparsable address is rejected instead of crashing.
    try:
        ipaddr = ipaddress.ip_address(remote_ip)
    except ValueError:
        logger.error("invalid remote address:" + remote_ip)
        return False
    logger.info("ipaddr:" + str(ipaddr))
    if ipaddr.is_private or ipaddr.is_loopback:
        return True

    # Allow-list check. Bug fix: the env var is a string, so the old
    # `remote_ip in ALLOW_IP_ADDRESS` was a substring match ("1.2.3.4"
    # matched "11.2.3.45"); compare against the parsed entries instead.
    allowed = {entry.strip() for entry in ALLOW_IP_ADDRESS.split(",")}
    return remote_ip in allowed
62
+
63
+ #ログインの確認
64
def check_login():
    """Halt the page unless the client IP is allowed and a login token exists."""
    if not is_allow_ip_address():
        st.title("HTTP 403 Forbidden")
        st.stop()
    token = st.session_state.get("login_token")
    if not token:
        st.warning("**ログインしてください**")
        st.stop()
71
+
72
+
73
+ INDEX_NAME = os.environ["INDEX_NAME"]
74
+ PKL_NAME = os.environ["PKL_NAME"]
75
+ # デバッグ用
76
+ llm = OpenAI(model='gpt-3.5-turbo', temperature=0.8, max_tokens=256)
77
+ text_splitter = TokenTextSplitter(separator="。", chunk_size=1500
78
+ , chunk_overlap=DEFAULT_CHUNK_OVERLAP
79
+ , tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode)
80
+ node_parser = SimpleNodeParser(text_splitter=text_splitter)
81
+ custom_prompt = Prompt("""\
82
+ 以下はこれまでの会話履歴と、ドキュメントを検索して回答する必要がある、ユーザーからの会話文です。
83
+ 会話と新しい会話文に基づいて、検索クエリを作成します。
84
+ 挨拶された場合、挨拶を返してください。
85
+ 答えを知らない場合は、「わかりません」と回答してください。
86
+ 全ての回答は日本語で行ってください。
87
+ 会話履歴:
88
+ {chat_history}
89
+ 新しい会話文:
90
+ {question}
91
+ Search query:
92
+ """)
93
+
94
+ chat_history = []
95
+
96
def setChatEngine():
    """Rebuild the query engine and condense-question chat engine from
    st.session_state.index, storing both back into session state."""
    cb_manager = CallbackManager([st.session_state.llama_debug_handler])
    ctx = ServiceContext.from_defaults(
        llm=llm,
        node_parser=node_parser,
        callback_manager=cb_manager,
    )
    synthesizer = get_response_synthesizer(response_mode='refine')
    st.session_state.query_engine = st.session_state.index.as_query_engine(
        response_synthesizer=synthesizer,
        service_context=ctx,
    )
    st.session_state.chat_engine = CondenseQuestionChatEngine.from_defaults(
        query_engine=st.session_state.query_engine,
        condense_question_prompt=custom_prompt,
        chat_history=chat_history,
        verbose=True,
    )
110
+
111
+ # chat mode reacの記述
112
+ # from langchain.prompts.chat import (
113
+ # ChatPromptTemplate,
114
+ # HumanMessagePromptTemplate,
115
+ # SystemMessagePromptTemplate,
116
+ # )
117
+ # from llama_index.prompts.prompts import RefinePrompt, QuestionAnswerPrompt
118
+ # from llama_index.prompts import Prompt
119
+ # chat_text_qa_msgs = [
120
+ # SystemMessagePromptTemplate.from_template(
121
+ # "文脈が役に立たない場合でも、必ず日本語で質問に答えてください。"
122
+ # ),
123
+ # HumanMessagePromptTemplate.from_template(
124
+ # "以下に、コンテキスト情報を提供します。 \n"
125
+ # "---------------------\n"
126
+ # "{context_str}"
127
+ # "\n---------------------\n"
128
+ # "回答には以下を含めてください。\n"
129
+ # "・最初に問い合わせへのお礼してください\n"
130
+ # "・回答には出典のドキュメント名を含めるようにしてください。\n"
131
+ # "・質問内容を要約してください\n"
132
+ # "・最後に不明な点がないか確認してください \n"
133
+ # "この情報を踏まえて、次の質問に回答してください: {query_str}\n"
134
+ # "答えを知らない場合は、「わからない」と回答してください。また、必ず日本語で回答してください。"
135
+ # ),
136
+ # ]
137
+ # REFINE_PROMPT = ("元の質問は次のとおりです: {query_str} \n"
138
+ # "既存の回答を提供しました: {existing_answer} \n"
139
+ # "既存の答えを洗練する機会があります \n"
140
+ # "(必要な場合のみ)以下にコンテキストを追加します。 \n"
141
+ # "------------\n"
142
+ # "{context_msg}\n"
143
+ # "------------\n"
144
+ # "新しいコンテキストを考慮して、元の答えをより良く洗練して質問に答えてください。\n"
145
+ # "回答には出典のドキュメント名を含めるようにしてください。\n"
146
+ # "コンテキストが役に立たない場合は、元の回答と同じものを返します。"
147
+ # "どのような場合でも、返答は日本語で行います。")
148
+ # refine_prompt = RefinePrompt(REFINE_PROMPT)
149
+
150
+ # def setChatEngine():
151
+ # callback_manager = CallbackManager([st.session_state.llama_debug_handler])
152
+ # service_context = ServiceContext.from_defaults(node_parser=node_parser,callback_manager=callback_manager)
153
+ # response_synthesizer = get_response_synthesizer(response_mode='refine')
154
+ # st.session_state.chat_engine = st.session_state.index.as_chat_engine(
155
+ # response_synthesizer=response_synthesizer,
156
+ # service_context=service_context,
157
+ # chat_mode="condense_question",
158
+ # text_qa_template= Prompt.from_langchain_prompt(ChatPromptTemplate.from_messages(chat_text_qa_msgs)),
159
+ # refine_template=refine_prompt,
160
+ # verbose=True
161
+ # )
log.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import logging
2
+
3
+ logging.basicConfig(level=logging.DEBUG)
4
+ logger = logging.getLogger("__name__")
5
+
pages/Chatbot.py CHANGED
@@ -1,39 +1,20 @@
1
 
2
  import streamlit as st
3
- import logging
4
  import common
5
- from llama_index import Prompt
6
-
7
- index_name = "./data/storage"
8
- pkl_name = "./data/stored_documents.pkl"
9
-
10
- custom_prompt = Prompt("""\
11
- 以下はこれまでの会話履歴と、ドキュメントを検索して回答する必要がある、ユーザーからの会話文です。
12
- 会話と新しい会話文に基づいて、検索クエリを作成します。回答は日本語で行います。
13
- 新しい会話文が挨拶の場合、挨拶を返してください。
14
- 新しい会話文が質問の場合、検索した結果の回答を返してください。
15
- 答えがわからない場合は正直にわからないと回答してください。
16
- 会話履歴:
17
- {chat_history}
18
- 新しい会話文:
19
- {question}
20
- Search query:
21
- """)
22
-
23
- chat_history = []
24
-
25
- logging.basicConfig(level=logging.INFO)
26
- logger = logging.getLogger("__name__")
27
- logger.debug("調査用ログ")
28
 
 
 
 
29
  common.check_login()
30
 
31
  st.title("💬 Chatbot")
32
  if st.button("リセット",use_container_width=True):
33
  st.session_state.chat_engine.reset()
34
  st.session_state.messages = [{"role": "assistant", "content": "お困りごとはございますか?"}]
35
- st.experimental_rerun()
36
  logger.info("reset")
 
 
37
 
38
  if "messages" not in st.session_state:
39
  st.session_state["messages"] = [{"role": "assistant", "content": "お困りごとはございますか?"}]
@@ -45,6 +26,13 @@ if prompt := st.chat_input():
45
  st.session_state.messages.append({"role": "user", "content": prompt})
46
  st.chat_message("user").write(prompt)
47
  response = st.session_state.chat_engine.chat(prompt)
48
- msg = str(response)
 
 
 
 
 
 
 
49
  st.session_state.messages.append({"role": "assistant", "content": msg})
50
  st.chat_message("assistant").write(msg)
 
1
 
2
  import streamlit as st
 
3
  import common
4
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ INDEX_NAME = os.environ["INDEX_NAME"]
7
+ PKL_NAME = os.environ["PKL_NAME"]
8
+ from log import logger
9
  common.check_login()
10
 
11
  st.title("💬 Chatbot")
12
  if st.button("リセット",use_container_width=True):
13
  st.session_state.chat_engine.reset()
14
  st.session_state.messages = [{"role": "assistant", "content": "お困りごとはございますか?"}]
 
15
  logger.info("reset")
16
+ st.experimental_rerun()
17
+
18
 
19
  if "messages" not in st.session_state:
20
  st.session_state["messages"] = [{"role": "assistant", "content": "お困りごとはございますか?"}]
 
26
  st.session_state.messages.append({"role": "user", "content": prompt})
27
  st.chat_message("user").write(prompt)
28
  response = st.session_state.chat_engine.chat(prompt)
29
+ fname = " ※参照:"
30
+ for node in response.source_nodes:
31
+ logger.info(node)
32
+ if node.node.metadata is not None:
33
+ if "filename" in node.node.metadata:
34
+ fname = fname + " "+str(node.node.metadata["filename"])
35
+ msg = str(response) + str(fname)
36
+ logger.info(msg)
37
  st.session_state.messages.append({"role": "assistant", "content": msg})
38
  st.chat_message("assistant").write(msg)
pages/ChatbotWebRead.py CHANGED
@@ -1,19 +1,21 @@
1
 
2
  import streamlit as st
3
  import faiss
4
- import logging
5
-
6
- from llama_index.callbacks import CallbackManager, LlamaDebugHandler
7
- from llama_index import Prompt, ServiceContext
8
- from llama_index.chat_engine import CondenseQuestionChatEngine;
9
  from llama_index.node_parser import SimpleNodeParser
10
  from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
11
  from llama_index.constants import DEFAULT_CHUNK_OVERLAP
12
  from llama_index.response_synthesizers import get_response_synthesizer
13
- from llama_index import ListIndex, SimpleWebPageReader
14
 
 
 
15
  import tiktoken
16
  import common
 
17
 
18
  custom_prompt = Prompt("""\
19
  以下はこれまでの会話履歴と、ドキュメントを検索して回答する必要がある、ユーザーからの会話文です。
@@ -27,13 +29,9 @@ custom_prompt = Prompt("""\
27
  {question}
28
  Search query:
29
  """)
30
-
31
  chat_history = []
32
 
33
- logging.basicConfig(level=logging.INFO)
34
- logger = logging.getLogger("__name__")
35
- logger.debug("調査用ログ")
36
-
37
 
38
  common.check_login()
39
 
@@ -45,27 +43,29 @@ URLtext = st.text_input(
45
  )
46
 
47
  if st.button("URL reading",use_container_width=True):
48
- text_splitter = TokenTextSplitter(separator="。", chunk_size=1500
49
  , chunk_overlap=DEFAULT_CHUNK_OVERLAP
50
  , tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode)
51
  node_parser = SimpleNodeParser(text_splitter=text_splitter)
52
  d = 1536
53
  k=2
54
  faiss_index = faiss.IndexFlatL2(d)
55
- # デバッグ用
56
- llama_debug_handler = LlamaDebugHandler()
57
- callback_manager = CallbackManager([llama_debug_handler])
58
  service_context = ServiceContext.from_defaults(node_parser=node_parser,callback_manager=callback_manager)
59
 
60
  webDocuments = SimpleWebPageReader(html_to_text=True).load_data(
61
  [URLtext]
62
  )
63
  logger.info(webDocuments)
64
- webIndex = ListIndex.from_documents(webDocuments,service_context=service_context)
65
- response_synthesizer = get_response_synthesizer(response_mode='compact')
66
- webQuery_engine = webIndex.as_query_engine(response_synthesizer=response_synthesizer,service_context=service_context)
 
 
 
67
  st.session_state.web_chat_engine = CondenseQuestionChatEngine.from_defaults(
68
- query_engine=webQuery_engine,
69
  condense_question_prompt=custom_prompt,
70
  chat_history=chat_history,
71
  verbose=True
@@ -87,8 +87,7 @@ if prompt := st.chat_input(disabled = not URLtext):
87
  st.session_state.webmessages.append({"role": "user", "content": prompt})
88
  st.chat_message("user").write(prompt)
89
  response = st.session_state.web_chat_engine.chat(prompt)
 
90
  msg = str(response)
91
  st.session_state.webmessages.append({"role": "assistant", "content": msg})
92
  st.chat_message("assistant").write(msg)
93
-
94
-
 
1
 
2
  import streamlit as st
3
  import faiss
4
+ import langchain
5
+ from llama_index.callbacks import CallbackManager
6
+ from llama_index import ServiceContext,VectorStoreIndex
7
+ from llama_index.chat_engine import CondenseQuestionChatEngine
 
8
  from llama_index.node_parser import SimpleNodeParser
9
  from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
10
  from llama_index.constants import DEFAULT_CHUNK_OVERLAP
11
  from llama_index.response_synthesizers import get_response_synthesizer
12
+ from llama_index import SimpleWebPageReader
13
 
14
+ # from llama_index.prompts import Prompt
15
+ from llama_index import Prompt
16
  import tiktoken
17
  import common
18
+ langchain.verbose = True
19
 
20
  custom_prompt = Prompt("""\
21
  以下はこれまでの会話履歴と、ドキュメントを検索して回答する必要がある、ユーザーからの会話文です。
 
29
  {question}
30
  Search query:
31
  """)
 
32
  chat_history = []
33
 
34
+ from log import logger
 
 
 
35
 
36
  common.check_login()
37
 
 
43
  )
44
 
45
  if st.button("URL reading",use_container_width=True):
46
+ text_splitter = TokenTextSplitter( chunk_size=1500
47
  , chunk_overlap=DEFAULT_CHUNK_OVERLAP
48
  , tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode)
49
  node_parser = SimpleNodeParser(text_splitter=text_splitter)
50
  d = 1536
51
  k=2
52
  faiss_index = faiss.IndexFlatL2(d)
53
+
54
+ callback_manager = CallbackManager([st.session_state.llama_debug_handler])
 
55
  service_context = ServiceContext.from_defaults(node_parser=node_parser,callback_manager=callback_manager)
56
 
57
  webDocuments = SimpleWebPageReader(html_to_text=True).load_data(
58
  [URLtext]
59
  )
60
  logger.info(webDocuments)
61
+ webIndex = VectorStoreIndex.from_documents(webDocuments,service_context=service_context)
62
+ response_synthesizer = get_response_synthesizer(response_mode='refine')
63
+ st.session_state.webQuery_engine = webIndex.as_query_engine(
64
+ response_synthesizer=response_synthesizer,
65
+ service_context=service_context,
66
+ )
67
  st.session_state.web_chat_engine = CondenseQuestionChatEngine.from_defaults(
68
+ query_engine=st.session_state.webQuery_engine,
69
  condense_question_prompt=custom_prompt,
70
  chat_history=chat_history,
71
  verbose=True
 
87
  st.session_state.webmessages.append({"role": "user", "content": prompt})
88
  st.chat_message("user").write(prompt)
89
  response = st.session_state.web_chat_engine.chat(prompt)
90
+ logger.debug(st.session_state.llama_debug_handler.get_llm_inputs_outputs())
91
  msg = str(response)
92
  st.session_state.webmessages.append({"role": "assistant", "content": msg})
93
  st.chat_message("assistant").write(msg)
 
 
pages/ImportAllFile.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import common
3
+ import os
4
+ import pickle
5
+ from llama_hub.file.cjk_pdf.base import CJKPDFReader
6
+ from llama_hub.file.pptx.base import PptxReader
7
+ from llama_hub.file.pandas_excel.base import PandasExcelReader
8
+ from llama_hub.file.docx.base import DocxReader
9
+ from llama_index import Document, SimpleDirectoryReader
10
+ from pathlib import Path
11
+ from log import logger
12
+ INDEX_NAME = os.environ["INDEX_NAME"]
13
+ PKL_NAME = os.environ["PKL_NAME"]
14
+
15
+ common.check_login()
16
+
17
+ if "file_uploader_key" not in st.session_state:
18
+ st.session_state["file_uploader_key"] = 0
19
+
20
+ st.title("📝 ImportAllFile")
21
+
22
uploaded_file = st.file_uploader(
    "Upload an article",
    type=("txt", "md", "pdf", "xlsx", "docx", "pptx"),
    key=st.session_state["file_uploader_key"],
)
if st.button("import", use_container_width=True):
    # Bug fix: the uploader may be empty when the button is clicked; the old
    # code dereferenced uploaded_file.name before `filepath` was bound, so the
    # except-branch cleanup then raised NameError.
    if uploaded_file is None:
        st.warning("ファイルを選択してください")
    else:
        filepath = None
        try:
            # Persist the upload to ./documents so the file-based readers can open it.
            filepath = os.path.join('documents', os.path.basename(uploaded_file.name))
            with open(filepath, 'wb') as f:
                f.write(uploaded_file.getvalue())

            _, extension = os.path.splitext(filepath)
            logger.info(filepath)
            if extension == ".txt" or extension == ".md":
                document = SimpleDirectoryReader(
                    input_files=[filepath], filename_as_id=True
                ).load_data()[0]
            else:
                if extension == ".pdf":
                    loader = CJKPDFReader()
                elif extension == ".pptx":
                    loader = PptxReader()
                elif extension == ".xlsx":
                    loader = PandasExcelReader(pandas_config={"header": 0})
                elif extension == ".docx":
                    loader = DocxReader()
                else:
                    # Bug fix: previously fell through with loader=None and
                    # crashed on loader.load_data; raise so the except branch
                    # logs and removes the temp file.
                    raise ValueError("Can`t read file:" + uploaded_file.name)
                document = loader.load_data(file=Path(filepath))[0]
            # Record the source filename so answers can cite it.
            document.metadata = {'filename': os.path.basename(uploaded_file.name)}
            st.session_state.stored_docs.append(uploaded_file.name)
            logger.info(st.session_state.stored_docs)
            st.session_state.index.insert(document=document)
            st.session_state.index.storage_context.persist(persist_dir=INDEX_NAME)
            os.remove(filepath)
            common.setChatEngine()
            with open(PKL_NAME, "wb") as f:
                pickle.dump(st.session_state.stored_docs, f)
            # Bump the uploader key so the widget resets on rerun.
            st.session_state["file_uploader_key"] += 1
            st.experimental_rerun()
        except Exception as e:
            # Clean up the temp file on any failure.
            logger.error(e)
            if filepath is not None and os.path.exists(filepath):
                os.remove(filepath)
71
+
72
st.subheader("Import File List")
# Show every document name imported so far (populated by initialize_index
# and the import handler above).
if "stored_docs" in st.session_state:
    logger.info(st.session_state.stored_docs)
    for docname in st.session_state.stored_docs:
        st.write(docname)