# qatool/pages/ImportAllFile.py
# Uploaded by naotakigawa ("Upload 6 files", commit 8b16906).
# File-viewer residue: raw / history / blame, 3.01 kB.
import streamlit as st
import common
import os
import pickle
from llama_hub.file.cjk_pdf.base import CJKPDFReader
from llama_hub.file.pptx.base import PptxReader
from llama_hub.file.pandas_excel.base import PandasExcelReader
from llama_hub.file.docx.base import DocxReader
from llama_index import Document, SimpleDirectoryReader
from pathlib import Path
from log import logger
# Storage locations are read from the environment; a missing variable
# raises KeyError as soon as the page is loaded.
INDEX_NAME = os.environ["INDEX_NAME"]
PKL_NAME = os.environ["PKL_NAME"]

# Project-local login gate (behaviour defined in `common`, not visible here).
common.check_login()

# Counter used as the uploader widget key. The import handler below
# increments it after a successful import; presumably the changed key makes
# Streamlit render a fresh, empty uploader on the next run.
if "file_uploader_key" not in st.session_state:
    st.session_state["file_uploader_key"] = 0

st.title("📝 ImportAllFile")
uploaded_file = st.file_uploader(
    label="Upload an article",
    type=("txt", "md", "pdf", "xlsx", "docx", "pptx"),
    key=st.session_state["file_uploader_key"],
)
def _load_document(filepath, upload_name):
    """Parse the temporary file at *filepath* into a single document.

    Args:
        filepath: Path of the temporary copy of the upload on disk.
        upload_name: Name of the file as reported by the uploader; used for
            the ``filename`` metadata entry and in error messages.

    Returns:
        The first document produced by the reader chosen for the extension.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    # Compare case-insensitively so that e.g. "REPORT.PDF" is routed to the
    # PDF reader instead of falling through to the unsupported branch.
    extension = os.path.splitext(filepath)[1].lower()

    if extension in (".txt", ".md"):
        logger.info("extension")
        document = SimpleDirectoryReader(
            input_files=[filepath], filename_as_id=True
        ).load_data()[0]
    else:
        logger.info("else")
        if extension == ".pdf":
            logger.info("CJKPDFReader")
            loader = CJKPDFReader()
        elif extension == ".pptx":
            logger.info("PptxReader")
            loader = PptxReader()
        elif extension == ".xlsx":
            logger.info("PandasExcelReader")
            loader = PandasExcelReader(pandas_config={"header": 0})
        elif extension == ".docx":
            logger.info("DocxReader")
            loader = DocxReader()
        else:
            # Previously this only logged and then crashed on
            # `None.load_data`; fail with an explicit, descriptive error.
            logger.error("Can`t read file:" + upload_name)
            raise ValueError("Can`t read file:" + upload_name)
        document = loader.load_data(file=Path(filepath))[0]

    # NOTE(review): the original indentation was lost, so it is unclear
    # whether this assignment applied to every document type or only to the
    # loader branch; it is applied to all types here -- confirm.
    document.metadata = {'filename': os.path.basename(upload_name)}
    return document


if st.button("import", use_container_width=True):
    if uploaded_file is None:
        # Clicking the button before choosing a file used to raise
        # AttributeError on `uploaded_file.name`.
        st.warning("Please select a file to import.")
    else:
        upload_name = uploaded_file.name
        # The readers work on file paths, so the upload is spooled to a
        # temporary file under ./documents and removed again afterwards.
        os.makedirs('documents', exist_ok=True)
        filepath = os.path.join('documents', os.path.basename(upload_name))
        try:
            with open(filepath, 'wb') as f:
                f.write(uploaded_file.getvalue())
            logger.info(filepath)

            document = _load_document(filepath, upload_name)

            st.session_state.index.insert(document=document)
            st.session_state.index.storage_context.persist(persist_dir=INDEX_NAME)

            # Record the name only after the index has been updated and
            # persisted, so a failed import leaves no stale list entry.
            st.session_state.stored_docs.append(upload_name)
            logger.info(st.session_state.stored_docs)

            common.setChatEngine()
            with open(PKL_NAME, "wb") as f:
                logger.info("pickle")
                pickle.dump(st.session_state.stored_docs, f)

            # A new key makes the uploader come back empty on the next run.
            st.session_state["file_uploader_key"] += 1
        except Exception as e:
            logger.error(e)
            # Surface the failure instead of silently ignoring the click.
            st.error("Failed to import file: " + upload_name)
        else:
            # Requested outside the try body: rerun interrupts the script
            # (presumably by raising), which the broad handler above must
            # never intercept. The `finally` clause still runs first.
            st.experimental_rerun()
        finally:
            # Clean up the temporary file on success and on failure alike.
            if os.path.exists(filepath):
                os.remove(filepath)
# List the names of the documents imported so far, one entry per line.
st.subheader("Import File List")
has_imports = "stored_docs" in st.session_state
if has_imports:
    imported_names = st.session_state.stored_docs
    logger.info(imported_names)
    for imported_name in imported_names:
        st.write(imported_name)