naotakigawa committed on
Commit
02e27e1
·
1 Parent(s): 67691d2

Upload 6 files

Browse files
Files changed (5) hide show
  1. app.py +64 -22
  2. common.py +26 -58
  3. pages/Chatbot.py +12 -5
  4. pages/ImportAllFile.py +70 -0
  5. requirements.txt +20 -20
app.py CHANGED
@@ -3,11 +3,12 @@ import os
3
  import pickle
4
  import faiss
5
  import common
6
-
7
  from multiprocessing import Lock
8
  from multiprocessing.managers import BaseManager
 
9
  from llama_index.callbacks import CallbackManager, LlamaDebugHandler
10
- from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, StorageContext, load_index_from_storage
11
  from llama_index.node_parser import SimpleNodeParser
12
  from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
13
  from llama_index.constants import DEFAULT_CHUNK_OVERLAP
@@ -16,6 +17,11 @@ from llama_index.graph_stores import SimpleGraphStore
16
  from llama_index.storage.docstore import SimpleDocumentStore
17
  from llama_index.storage.index_store import SimpleIndexStore
18
  from msal_streamlit_authentication import msal_authentication
 
 
 
 
 
19
  import tiktoken
20
  from llama_index.callbacks import CallbackManager, LlamaDebugHandler
21
  from dotenv import load_dotenv
@@ -35,14 +41,15 @@ AUTHORITY = f"https://login.microsoftonline.com/{TENANT_ID}"
35
  REDIRECT_URI = os.environ["REDIRECT_URI"]
36
  SCOPES = ["openid", "profile", "User.Read"]
37
 
38
- index_name = os.environ["INDEX_NAME"]
39
- pkl_name = os.environ["PKL_NAME"]
40
  st.session_state.llama_debug_handler = LlamaDebugHandler()
41
  from log import logger
42
 
43
  def initialize_index():
44
  logger.info("initialize_index start")
45
- text_splitter = TokenTextSplitter(chunk_size=1500
 
46
  , chunk_overlap=DEFAULT_CHUNK_OVERLAP
47
  , tokenizer=tiktoken.encoding_for_model("gpt-4").encode)
48
  node_parser = SimpleNodeParser(text_splitter=text_splitter)
@@ -51,34 +58,57 @@ def initialize_index():
51
  faiss_index = faiss.IndexFlatL2(d)
52
  # デバッグ用
53
  callback_manager = CallbackManager([st.session_state.llama_debug_handler])
54
- service_context = ServiceContext.from_defaults(node_parser=node_parser,callback_manager=callback_manager)
55
  lock = Lock()
56
  with lock:
57
- if os.path.exists(index_name):
58
  logger.info("start import index")
59
  storage_context = StorageContext.from_defaults(
60
- docstore=SimpleDocumentStore.from_persist_dir(persist_dir=index_name),
61
- graph_store=SimpleGraphStore.from_persist_dir(persist_dir=index_name),
62
- vector_store=FaissVectorStore.from_persist_dir(persist_dir=index_name),
63
- index_store=SimpleIndexStore.from_persist_dir(persist_dir=index_name),
64
  )
65
  st.session_state.index = load_index_from_storage(storage_context=storage_context,service_context=service_context)
 
 
66
  common.setChatEngine()
67
  else:
68
  logger.info("start create index")
69
- documents = SimpleDirectoryReader("./documents").load_data()
 
70
  vector_store = FaissVectorStore(faiss_index=faiss_index)
71
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
72
- st.session_state.index = VectorStoreIndex.from_documents(documents, storage_context=storage_context,service_context=service_context)
73
- st.session_state.index.storage_context.persist(persist_dir=index_name)
74
- common.setChatEngine()
75
- if os.path.exists(pkl_name):
76
- logger.info(pkl_name)
77
- with open(pkl_name, "rb") as f:
78
- st.session_state.stored_docs = pickle.load(f)
79
- else:
80
  st.session_state.stored_docs=list()
81
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  def logout():
84
  st.session_state["login_token"] = None
@@ -110,4 +140,16 @@ st.session_state["login_token"] = msal_authentication(
110
  if st.session_state.login_token:
111
  initialize_index()
112
  st.write("ようこそ", st.session_state.login_token["account"]["name"])
113
- st.write("サイドメニューからファイルインポート又はChatbotへの質問を開始してください。")
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import pickle
4
  import faiss
5
  import common
6
+ import glob
7
  from multiprocessing import Lock
8
  from multiprocessing.managers import BaseManager
9
+ from pathlib import Path
10
  from llama_index.callbacks import CallbackManager, LlamaDebugHandler
11
+ from llama_index import Document,VectorStoreIndex, SimpleDirectoryReader, ServiceContext, StorageContext, load_index_from_storage
12
  from llama_index.node_parser import SimpleNodeParser
13
  from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
14
  from llama_index.constants import DEFAULT_CHUNK_OVERLAP
 
17
  from llama_index.storage.docstore import SimpleDocumentStore
18
  from llama_index.storage.index_store import SimpleIndexStore
19
  from msal_streamlit_authentication import msal_authentication
20
+ from llama_hub.file.cjk_pdf.base import CJKPDFReader
21
+ from llama_hub.file.pptx.base import PptxReader
22
+ from llama_hub.file.pandas_excel.base import PandasExcelReader
23
+ from llama_hub.file.docx.base import DocxReader
24
+ from llama_index.llms import OpenAI
25
  import tiktoken
26
  from llama_index.callbacks import CallbackManager, LlamaDebugHandler
27
  from dotenv import load_dotenv
 
41
  REDIRECT_URI = os.environ["REDIRECT_URI"]
42
  SCOPES = ["openid", "profile", "User.Read"]
43
 
44
+ INDEX_NAME = os.environ["INDEX_NAME"]
45
+ PKL_NAME = os.environ["PKL_NAME"]
46
  st.session_state.llama_debug_handler = LlamaDebugHandler()
47
  from log import logger
48
 
49
  def initialize_index():
50
  logger.info("initialize_index start")
51
+ llm = OpenAI(model='gpt-4', temperature=0.8, max_tokens=256)
52
+ text_splitter = TokenTextSplitter(separator="。",chunk_size=1500
53
  , chunk_overlap=DEFAULT_CHUNK_OVERLAP
54
  , tokenizer=tiktoken.encoding_for_model("gpt-4").encode)
55
  node_parser = SimpleNodeParser(text_splitter=text_splitter)
 
58
  faiss_index = faiss.IndexFlatL2(d)
59
  # デバッグ用
60
  callback_manager = CallbackManager([st.session_state.llama_debug_handler])
61
+ service_context = ServiceContext.from_defaults(llm=llm,node_parser=node_parser,callback_manager=callback_manager)
62
  lock = Lock()
63
  with lock:
64
+ if os.path.exists(INDEX_NAME):
65
  logger.info("start import index")
66
  storage_context = StorageContext.from_defaults(
67
+ docstore=SimpleDocumentStore.from_persist_dir(persist_dir=INDEX_NAME),
68
+ graph_store=SimpleGraphStore.from_persist_dir(persist_dir=INDEX_NAME),
69
+ vector_store=FaissVectorStore.from_persist_dir(persist_dir=INDEX_NAME),
70
+ index_store=SimpleIndexStore.from_persist_dir(persist_dir=INDEX_NAME),
71
  )
72
  st.session_state.index = load_index_from_storage(storage_context=storage_context,service_context=service_context)
73
+ with open(PKL_NAME, "rb") as f:
74
+ st.session_state.stored_docs = pickle.load(f)
75
  common.setChatEngine()
76
  else:
77
  logger.info("start create index")
78
+ documents = list()
79
+ files = glob.glob("./documents/*")
80
  vector_store = FaissVectorStore(faiss_index=faiss_index)
81
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
 
 
 
 
 
 
 
 
82
  st.session_state.stored_docs=list()
83
+ for file in files:
84
+ loader=None
85
+ noextpath,extension = os.path.splitext(file)
86
+ logger.info(file)
87
+ document = Document()
88
+ if extension == ".txt" or ".md":
89
+ document = SimpleDirectoryReader(input_files=[file], filename_as_id=True).load_data()[0]
90
+ else:
91
+ if extension == ".pdf":
92
+ loader = CJKPDFReader()
93
+ elif extension == ".pptx":
94
+ loader = PptxReader()
95
+ elif extension == ".xlsx":
96
+ loader = PandasExcelReader(pandas_config={"header": 0})
97
+ elif extension == ".docx":
98
+ loader = DocxReader()
99
+ else:
100
+ logger.error("Can`t read file:" + file)
101
+ continue
102
+ document = loader.load_data(file=Path(file))[0]
103
+ document.metadata={'filename': os.path.basename(file)}
104
+ documents.append(document)
105
+ st.session_state.stored_docs.append(os.path.basename(file))
106
+ st.session_state.index = VectorStoreIndex.from_documents( documents=documents,storage_context=storage_context,service_context=service_context)
107
+ st.session_state.index.storage_context.persist(persist_dir=INDEX_NAME)
108
+ with open(PKL_NAME, "wb") as f:
109
+ print("pickle")
110
+ pickle.dump(st.session_state.stored_docs, f)
111
+ common.setChatEngine()
112
 
113
  def logout():
114
  st.session_state["login_token"] = None
 
140
  if st.session_state.login_token:
141
  initialize_index()
142
  st.write("ようこそ", st.session_state.login_token["account"]["name"])
143
+ st.write("サイドメニューからファイルインポート又はChatbotへの質問を開始してください。")
144
+ st.markdown("""
145
+ ## 使い方
146
+ - **Chatbot**
147
+ 初期からインポートされているファイルとImportXXFileでインポートしたファイルの内容に関する質問に対して、GenerativeAIが回答します。
148
+
149
+ - **ChatbotWebRead**
150
+ 入力したURLのサイトの情報に関して、GenerativeAIが回答します。
151
+ ImportXXFileの内容は登録されていません。
152
+
153
+ - **ImportAllFile**
154
+ テキストファイル,mdファイル,Excel,PDF,PowerPoint,Wordをインポートできます。
155
+ """)
common.py CHANGED
@@ -18,6 +18,7 @@ from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
18
  from llama_index.constants import DEFAULT_CHUNK_OVERLAP
19
  from llama_index.response_synthesizers import get_response_synthesizer
20
  from llama_index.callbacks import CallbackManager
 
21
  from log import logger
22
 
23
  # 接続元制御
@@ -69,12 +70,13 @@ def check_login():
69
  st.stop()
70
 
71
 
72
- index_name = os.environ["INDEX_NAME"]
73
- pkl_name = os.environ["PKL_NAME"]
74
  # デバッグ用
75
- text_splitter = TokenTextSplitter( chunk_size=1500
 
76
  , chunk_overlap=DEFAULT_CHUNK_OVERLAP
77
- , tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode)
78
  node_parser = SimpleNodeParser(text_splitter=text_splitter)
79
  custom_prompt = Prompt("""\
80
  以下はこれまでの会話履歴と、ドキュメントを検索して回答する必要がある、ユーザーからの会話文です。
@@ -91,59 +93,10 @@ custom_prompt = Prompt("""\
91
  """)
92
 
93
  chat_history = []
94
- def fileImportChatEngine(uploaded_file):
95
- filepath = None
96
- try:
97
- filepath = os.path.join('documents', os.path.basename( uploaded_file.name))
98
- logger.info(filepath)
99
- with open(filepath, 'wb') as f:
100
- f.write(uploaded_file.getvalue())
101
- f.close()
102
- document = SimpleDirectoryReader(input_files=[filepath]).load_data()[0]
103
- st.session_state.stored_docs.append(uploaded_file.name)
104
- logger.info(st.session_state.stored_docs)
105
- st.session_state.index.insert(document=document)
106
- st.session_state.index.storage_context.persist(persist_dir=index_name)
107
- setChatEngine()
108
- with open(pkl_name, "wb") as f:
109
- print("pickle")
110
- pickle.dump(st.session_state.stored_docs, f)
111
- st.session_state["file_uploader_key"] += 1
112
- st.experimental_rerun()
113
- except Exception as e:
114
- # cleanup temp file
115
- logger.error(e)
116
- if filepath is not None and os.path.exists(filepath):
117
- os.remove(filepath)
118
-
119
- def fileImportChatEngineCustomloader(uploaded_file,loader):
120
- filepath = None
121
- try:
122
- filepath = os.path.join('documents', os.path.basename( uploaded_file.name))
123
- logger.info(filepath)
124
- with open(filepath, 'wb') as f:
125
- f.write(uploaded_file.getvalue())
126
- f.close()
127
- document = loader.load_data(file=Path(filepath))[0]
128
- st.session_state.stored_docs.append(uploaded_file.name)
129
- logger.info(st.session_state.stored_docs)
130
- st.session_state.index.insert(document=document)
131
- st.session_state.index.storage_context.persist(persist_dir=index_name)
132
- setChatEngine()
133
- with open(pkl_name, "wb") as f:
134
- print("pickle")
135
- pickle.dump(st.session_state.stored_docs, f)
136
- st.session_state["file_uploader_key"] += 1
137
- st.experimental_rerun()
138
- except Exception as e:
139
- # cleanup temp file
140
- logger.error(e)
141
- if filepath is not None and os.path.exists(filepath):
142
- os.remove(filepath)
143
 
144
  def setChatEngine():
145
  callback_manager = CallbackManager([st.session_state.llama_debug_handler])
146
- service_context = ServiceContext.from_defaults(node_parser=node_parser,callback_manager=callback_manager)
147
  response_synthesizer = get_response_synthesizer(response_mode='refine')
148
  st.session_state.query_engine = st.session_state.index.as_query_engine(
149
  response_synthesizer=response_synthesizer,
@@ -162,10 +115,11 @@ def setChatEngine():
162
  # HumanMessagePromptTemplate,
163
  # SystemMessagePromptTemplate,
164
  # )
 
165
  # from llama_index.prompts import Prompt
166
  # chat_text_qa_msgs = [
167
  # SystemMessagePromptTemplate.from_template(
168
- # "文脈が役に立たない場合でも、必ず質問に答えてください。"
169
  # ),
170
  # HumanMessagePromptTemplate.from_template(
171
  # "以下に、コンテキスト情報を提供します。 \n"
@@ -174,13 +128,26 @@ def setChatEngine():
174
  # "\n---------------------\n"
175
  # "回答には以下を含めてください。\n"
176
  # "・最初に問い合わせへのお礼してください\n"
177
- # "・自己紹介してください\n"
178
  # "・質問内容を要約してください\n"
179
  # "・最後に不明な点がないか確認してください \n"
180
  # "この情報を踏まえて、次の質問に回答して��ださい: {query_str}\n"
181
- # "答えを知らない場合は、「わからない」と回答してください。また、日本語で回答してください。"
182
  # ),
183
  # ]
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  # def setChatEngine():
185
  # callback_manager = CallbackManager([st.session_state.llama_debug_handler])
186
  # service_context = ServiceContext.from_defaults(node_parser=node_parser,callback_manager=callback_manager)
@@ -188,7 +155,8 @@ def setChatEngine():
188
  # st.session_state.chat_engine = st.session_state.index.as_chat_engine(
189
  # response_synthesizer=response_synthesizer,
190
  # service_context=service_context,
191
- # chat_mode="react",
192
  # text_qa_template= Prompt.from_langchain_prompt(ChatPromptTemplate.from_messages(chat_text_qa_msgs)),
 
193
  # verbose=True
194
  # )
 
18
  from llama_index.constants import DEFAULT_CHUNK_OVERLAP
19
  from llama_index.response_synthesizers import get_response_synthesizer
20
  from llama_index.callbacks import CallbackManager
21
+ from llama_index.llms import OpenAI
22
  from log import logger
23
 
24
  # 接続元制御
 
70
  st.stop()
71
 
72
 
73
+ INDEX_NAME = os.environ["INDEX_NAME"]
74
+ PKL_NAME = os.environ["PKL_NAME"]
75
  # デバッグ用
76
+ llm = OpenAI(model='gpt-4', temperature=0.8, max_tokens=256)
77
+ text_splitter = TokenTextSplitter(separator="。", chunk_size=1500
78
  , chunk_overlap=DEFAULT_CHUNK_OVERLAP
79
+ , tokenizer=tiktoken.encoding_for_model("gpt-4").encode)
80
  node_parser = SimpleNodeParser(text_splitter=text_splitter)
81
  custom_prompt = Prompt("""\
82
  以下はこれまでの会話履歴と、ドキュメントを検索して回答する必要がある、ユーザーからの会話文です。
 
93
  """)
94
 
95
  chat_history = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  def setChatEngine():
98
  callback_manager = CallbackManager([st.session_state.llama_debug_handler])
99
+ service_context = ServiceContext.from_defaults(llm=llm,node_parser=node_parser,callback_manager=callback_manager)
100
  response_synthesizer = get_response_synthesizer(response_mode='refine')
101
  st.session_state.query_engine = st.session_state.index.as_query_engine(
102
  response_synthesizer=response_synthesizer,
 
115
  # HumanMessagePromptTemplate,
116
  # SystemMessagePromptTemplate,
117
  # )
118
+ # from llama_index.prompts.prompts import RefinePrompt, QuestionAnswerPrompt
119
  # from llama_index.prompts import Prompt
120
  # chat_text_qa_msgs = [
121
  # SystemMessagePromptTemplate.from_template(
122
+ # "文脈が役に立たない場合でも、必ず日本語で質問に答えてください。"
123
  # ),
124
  # HumanMessagePromptTemplate.from_template(
125
  # "以下に、コンテキスト情報を提供します。 \n"
 
128
  # "\n---------------------\n"
129
  # "回答には以下を含めてください。\n"
130
  # "・最初に問い合わせへのお礼してください\n"
131
+ # "・回答には出典のドキュメント名を含めるようにしてください。\n"
132
  # "・質問内容を要約してください\n"
133
  # "・最後に不明な点がないか確認してください \n"
134
  # "この情報を踏まえて、次の質問に回答して��ださい: {query_str}\n"
135
+ # "答えを知らない場合は、「わからない」と回答してください。また、必ず日本語で回答してください。"
136
  # ),
137
  # ]
138
+ # REFINE_PROMPT = ("元の質問は次のとおりです: {query_str} \n"
139
+ # "既存の回答を提供しました: {existing_answer} \n"
140
+ # "既存の答えを洗練する機会があります \n"
141
+ # "(必要な場合のみ)以下にコンテキストを追加します。 \n"
142
+ # "------------\n"
143
+ # "{context_msg}\n"
144
+ # "------------\n"
145
+ # "新しいコンテキストを考慮して、元の答えをより良く洗練して質問に答えてください。\n"
146
+ # "回答には出典のドキュメント名を含めるようにしてください。\n"
147
+ # "コンテキストが役に立たない場合は、元の回答と同じものを返します。"
148
+ # "どのような場合でも、返答は日本語で行います。")
149
+ # refine_prompt = RefinePrompt(REFINE_PROMPT)
150
+
151
  # def setChatEngine():
152
  # callback_manager = CallbackManager([st.session_state.llama_debug_handler])
153
  # service_context = ServiceContext.from_defaults(node_parser=node_parser,callback_manager=callback_manager)
 
155
  # st.session_state.chat_engine = st.session_state.index.as_chat_engine(
156
  # response_synthesizer=response_synthesizer,
157
  # service_context=service_context,
158
+ # chat_mode="condense_question",
159
  # text_qa_template= Prompt.from_langchain_prompt(ChatPromptTemplate.from_messages(chat_text_qa_msgs)),
160
+ # refine_template=refine_prompt,
161
  # verbose=True
162
  # )
pages/Chatbot.py CHANGED
@@ -3,8 +3,8 @@ import streamlit as st
3
  import common
4
  import os
5
 
6
- index_name = os.environ["INDEX_NAME"]
7
- pkl_name = os.environ["PKL_NAME"]
8
  from log import logger
9
  common.check_login()
10
 
@@ -12,8 +12,9 @@ st.title("💬 Chatbot")
12
  if st.button("リセット",use_container_width=True):
13
  st.session_state.chat_engine.reset()
14
  st.session_state.messages = [{"role": "assistant", "content": "お困りごとはございますか?"}]
15
- st.experimental_rerun()
16
  logger.info("reset")
 
 
17
 
18
  if "messages" not in st.session_state:
19
  st.session_state["messages"] = [{"role": "assistant", "content": "お困りごとはございますか?"}]
@@ -25,7 +26,13 @@ if prompt := st.chat_input():
25
  st.session_state.messages.append({"role": "user", "content": prompt})
26
  st.chat_message("user").write(prompt)
27
  response = st.session_state.chat_engine.chat(prompt)
28
- # logger.info(st.session_state.llama_debug_handler.get_llm_inputs_outputs()[-1][-1])
29
- msg = str(response)
 
 
 
 
 
 
30
  st.session_state.messages.append({"role": "assistant", "content": msg})
31
  st.chat_message("assistant").write(msg)
 
3
  import common
4
  import os
5
 
6
+ INDEX_NAME = os.environ["INDEX_NAME"]
7
+ PKL_NAME = os.environ["PKL_NAME"]
8
  from log import logger
9
  common.check_login()
10
 
 
12
  if st.button("リセット",use_container_width=True):
13
  st.session_state.chat_engine.reset()
14
  st.session_state.messages = [{"role": "assistant", "content": "お困りごとはございますか?"}]
 
15
  logger.info("reset")
16
+ st.experimental_rerun()
17
+
18
 
19
  if "messages" not in st.session_state:
20
  st.session_state["messages"] = [{"role": "assistant", "content": "お困りごとはございますか?"}]
 
26
  st.session_state.messages.append({"role": "user", "content": prompt})
27
  st.chat_message("user").write(prompt)
28
  response = st.session_state.chat_engine.chat(prompt)
29
+ fname = " ※参照:"
30
+ for node in response.source_nodes:
31
+ logger.info(node)
32
+ if node.node.metadata is not None:
33
+ if "filename" in node.node.metadata:
34
+ fname = fname + " "+str(node.node.metadata["filename"])
35
+ msg = str(response) + str(fname)
36
+ logger.info(msg)
37
  st.session_state.messages.append({"role": "assistant", "content": msg})
38
  st.chat_message("assistant").write(msg)
pages/ImportAllFile.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import common
3
+ import os
4
+ import pickle
5
+ from llama_hub.file.cjk_pdf.base import CJKPDFReader
6
+ from llama_hub.file.pptx.base import PptxReader
7
+ from llama_hub.file.pandas_excel.base import PandasExcelReader
8
+ from llama_hub.file.docx.base import DocxReader
9
+ from llama_index import Document, SimpleDirectoryReader
10
+ from pathlib import Path
11
+ from log import logger
12
+ INDEX_NAME = os.environ["INDEX_NAME"]
13
+ PKL_NAME = os.environ["PKL_NAME"]
14
+
15
+ common.check_login()
16
+
17
+ if "file_uploader_key" not in st.session_state:
18
+ st.session_state["file_uploader_key"] = 0
19
+
20
+ st.title("📝 ImportAllFile")
21
+
22
+ uploaded_file = st.file_uploader("Upload an article", type=("txt", "md", "pdf", "xlsx", "docx", "pptx"),key=st.session_state["file_uploader_key"])
23
+ if st.button("import",use_container_width=True):
24
+ filepath = os.path.join('documents', os.path.basename( uploaded_file.name))
25
+ try:
26
+ with open(filepath, 'wb') as f:
27
+ f.write(uploaded_file.getvalue())
28
+ f.close()
29
+
30
+ loader=None
31
+ noextpath,extension = os.path.splitext(filepath)
32
+ logger.info(filepath)
33
+ document = Document()
34
+ if extension == ".txt" or ".md":
35
+ document = SimpleDirectoryReader(input_files=[filepath], filename_as_id=True).load_data()[0]
36
+ else:
37
+ if extension == ".pdf":
38
+ loader = CJKPDFReader()
39
+ elif extension == ".pptx":
40
+ loader = PptxReader()
41
+ elif extension == ".xlsx":
42
+ loader = PandasExcelReader(pandas_config={"header": 0})
43
+ elif extension == ".docx":
44
+ loader = DocxReader()
45
+ else:
46
+ logger.error("Can`t read file:" + uploaded_file.name)
47
+ document = loader.load_data(file=Path(filepath))[0]
48
+ document.metadata={'filename': os.path.basename(uploaded_file.name)}
49
+ st.session_state.stored_docs.append(uploaded_file.name)
50
+ logger.info(st.session_state.stored_docs)
51
+ st.session_state.index.insert(document=document)
52
+ st.session_state.index.storage_context.persist(persist_dir=INDEX_NAME)
53
+ os.remove(filepath)
54
+ common.setChatEngine()
55
+ with open(PKL_NAME, "wb") as f:
56
+ print("pickle")
57
+ pickle.dump(st.session_state.stored_docs, f)
58
+ st.session_state["file_uploader_key"] += 1
59
+ st.experimental_rerun()
60
+ except Exception as e:
61
+ # cleanup temp file
62
+ logger.error(e)
63
+ if filepath is not None and os.path.exists(filepath):
64
+ os.remove(filepath)
65
+
66
+ st.subheader("Import File List")
67
+ if "stored_docs" in st.session_state:
68
+ logger.info(st.session_state.stored_docs)
69
+ for docname in st.session_state.stored_docs:
70
+ st.write(docname)
requirements.txt CHANGED
@@ -1,23 +1,23 @@
1
- streamlit>=1.24.0
2
- langchain>=0.0.217
3
- openai
4
- duckduckgo-search
5
- anthropic
6
- nltk
7
  llama-index==0.8.4
8
  pypdf==3.9.0
9
  faiss-cpu==1.7.4
10
- html2text
11
- streamlit-authenticator
12
- extra_streamlit_components
13
- requests_oauthlib
14
- python-dotenv
15
- torch
16
- transformers
17
- python-pptx
18
- Pillow
19
- openpyxl
20
- llama_hub
21
- msal-streamlit-authentication
22
- pdfminer.six
23
- docx2txt
 
1
+ streamlit==1.25.0
2
+ langchain==0.0.266
3
+ openai==0.27.9
4
+ duckduckgo-search==3.8.5
5
+ anthropic==0.3.10
6
+ nltk==3.8.1
7
  llama-index==0.8.4
8
  pypdf==3.9.0
9
  faiss-cpu==1.7.4
10
+ html2text==2020.1.16
11
+ streamlit-authenticator==0.2.2
12
+ extra_streamlit_components==0.1.56
13
+ requests_oauthlib==1.3.1
14
+ python-dotenv==1.0.0
15
+ torch==2.0.1
16
+ transformers==4.32.0
17
+ python-pptx==0.6.21
18
+ Pillow==9.5.0
19
+ openpyxl==3.1.2
20
+ llama_hub==0.0.25
21
+ msal-streamlit-authentication==1.0.9
22
+ pdfminer.six==20221105
23
+ docx2txt==0.8