Spaces:

Daoneeee
/

task_Chatbot

Sleeping

App Files Files Community

Daoneeee committed on Nov 21, 2023

Commit

887b79e

1 Parent(s): 080c05e

Update app.py

Browse files

Files changed (4) hide show

app.py +153 -0
facebook_chat.json +217 -0
htmlTemplates.py +44 -0
requirements.txt +14 -0

app.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import streamlit as st
+from dotenv import load_dotenv
+from PyPDF2 import PdfReader
+from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
+from langchain.vectorstores import FAISS, Chroma
+from langchain.embeddings import HuggingFaceEmbeddings  # General embeddings from HuggingFace models.
+from langchain.chat_models import ChatOpenAI
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+from htmlTemplates import css, bot_template, user_template
+from langchain.llms import HuggingFaceHub, LlamaCpp, CTransformers  # For loading transformer models.
+from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVLoader
+import tempfile # 임시 파일을 생성하기 위한 라이브러리입니다.
+import os
+# PDF 문서로부터 텍스트를 추출하는 함수입니다.
+def get_pdf_text(pdf_docs):
+    temp_dir = tempfile.TemporaryDirectory() # 임시 디렉토리를 생성합니다.
+    temp_filepath = os.path.join(temp_dir.name, pdf_docs.name) # 임시 파일 경로를 생성합니다.
+    with open(temp_filepath, "wb") as f:  # 임시 파일을 바이너리 쓰기 모드로 엽니다.
+        f.write(pdf_docs.getvalue()) # PDF 문서의 내용을 임시 파일에 씁니다.
+    pdf_loader = PyPDFLoader(temp_filepath) # PyPDFLoader를 사용해 PDF를 로드합니다.
+    pdf_doc = pdf_loader.load() # 텍스트를 추출합니다.
+    return pdf_doc # 추출한 텍스트를 반환합니다.
+# 과제
+# 아래 텍스트 추출 함수를 작성
+def get_text_file(docs):
+    pass
+def get_csv_file(docs):
+    pass
+def get_json_file(docs):
+    pass
+# 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
+def get_text_chunks(documents):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000, # 청크의 크기를 지정합니다.
+        chunk_overlap=200, # 청크 사이의 중복을 지정합니다.
+        length_function=len # 텍스트의 길이를 측정하는 함수를 지정합니다.
+    )
+    documents = text_splitter.split_documents(documents) # 문서들을 청크로 나눕니다
+    return documents # 나눈 청크를 반환합니다.
+# 텍스트 청크들로부터 벡터 스토어를 생성하는 함수입니다.
+def get_vectorstore(text_chunks):
+    # OpenAI 임베딩 모델을 로드합니다. (Embedding models - Ada v2)
+    embeddings = OpenAIEmbeddings()
+    vectorstore = FAISS.from_documents(text_chunks, embeddings) # FAISS 벡터 스토어를 생성합니다.
+    return vectorstore # 생성된 벡터 스토어를 반환합니다.
+def get_conversation_chain(vectorstore):
+    gpt_model_name = 'gpt-3.5-turbo'
+    llm = ChatOpenAI(model_name = gpt_model_name) #gpt-3.5 모델 로드
+    # 대화 기록을 저장하기 위한 메모리를 생성합니다.
+    memory = ConversationBufferMemory(
+        memory_key='chat_history', return_messages=True)
+    # 대화 검색 체인을 생성합니다.
+    conversation_chain = ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        retriever=vectorstore.as_retriever(),
+        memory=memory
+    )
+    return conversation_chain
+# 사용자 입력을 처리하는 함수입니다.
+def handle_userinput(user_question):
+    # 대화 체인을 사용하여 사용자 질문에 대한 응답을 생성합니다.
+    response = st.session_state.conversation({'question': user_question})
+    # 대화 기록을 저장합니다.
+    st.session_state.chat_history = response['chat_history']
+    for i, message in enumerate(st.session_state.chat_history):
+        if i % 2 == 0:
+            st.write(user_template.replace(
+                "{{MSG}}", message.content), unsafe_allow_html=True)
+        else:
+            st.write(bot_template.replace(
+                "{{MSG}}", message.content), unsafe_allow_html=True)
+def main():
+    load_dotenv()
+    st.set_page_config(page_title="Chat with multiple Files",
+                       page_icon=":books:")
+    st.write(css, unsafe_allow_html=True)
+    if "conversation" not in st.session_state:
+        st.session_state.conversation = None
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = None
+    st.header("Chat with multiple Files :")
+    user_question = st.text_input("Ask a question about your documents:")
+    if user_question:
+        handle_userinput(user_question)
+    with st.sidebar:
+        openai_key = st.text_input("Paste your OpenAI API key (sk-...)")
+        if openai_key:
+            os.environ["OPENAI_API_KEY"] = openai_key
+        st.subheader("Your documents")
+        docs = st.file_uploader(
+            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+        if st.button("Process"):
+            with st.spinner("Processing"):
+                # get pdf text
+                doc_list = []
+                for file in docs:
+                    print('file - type : ', file.type)
+                    if file.type == 'text/plain':
+                        # file is .txt
+                        doc_list.extend(get_text_file(file))
+                    elif file.type in ['application/octet-stream', 'application/pdf']:
+                        # file is .pdf
+                        doc_list.extend(get_pdf_text(file))
+                    elif file.type == 'text/csv':
+                        # file is .csv
+                        doc_list.extend(get_csv_file(file))
+                    elif file.type == 'application/json':
+                        # file is .json
+                        doc_list.extend(get_json_file(file))
+                # get the text chunks
+                text_chunks = get_text_chunks(doc_list)
+                # create vector store
+                vectorstore = get_vectorstore(text_chunks)
+                # create conversation chain
+                st.session_state.conversation = get_conversation_chain(
+                    vectorstore)
+# Call main() only when this file is executed as a script, not when imported.
+if __name__ == '__main__':
+    main()

facebook_chat.json ADDED Viewed

	@@ -0,0 +1,217 @@

+#!pip install jq
+from langchain.document_loaders import JSONLoader
+import json
+from pathlib import Path
+from pprint import pprint
+file_path='./example_data/facebook_chat.json'
+data = json.loads(Path(file_path).read_text())
+pprint(data)
+    {'image': {'creation_timestamp': 1675549016, 'uri': 'image_of_the_chat.jpg'},
+     'is_still_participant': True,
+     'joinable_mode': {'link': '', 'mode': 1},
+     'magic_words': [],
+     'messages': [{'content': 'Bye!',
+                   'sender_name': 'User 2',
+                   'timestamp_ms': 1675597571851},
+                  {'content': 'Oh no worries! Bye',
+                   'sender_name': 'User 1',
+                   'timestamp_ms': 1675597435669},
+                  {'content': 'No Im sorry it was my mistake, the blue one is not '
+                              'for sale',
+                   'sender_name': 'User 2',
+                   'timestamp_ms': 1675596277579},
+                  {'content': 'I thought you were selling the blue one!',
+                   'sender_name': 'User 1',
+                   'timestamp_ms': 1675595140251},
+                  {'content': 'Im not interested in this bag. Im interested in the '
+                              'blue one!',
+                   'sender_name': 'User 1',
+                   'timestamp_ms': 1675595109305},
+                  {'content': 'Here is $129',
+                   'sender_name': 'User 2',
+                   'timestamp_ms': 1675595068468},
+                  {'photos': [{'creation_timestamp': 1675595059,
+                               'uri': 'url_of_some_picture.jpg'}],
+                   'sender_name': 'User 2',
+                   'timestamp_ms': 1675595060730},
+                  {'content': 'Online is at least $100',
+                   'sender_name': 'User 2',
+                   'timestamp_ms': 1675595045152},
+                  {'content': 'How much do you want?',
+                   'sender_name': 'User 1',
+                   'timestamp_ms': 1675594799696},
+                  {'content': 'Goodmorning! $50 is too low.',
+                   'sender_name': 'User 2',
+                   'timestamp_ms': 1675577876645},
+                  {'content': 'Hi! Im interested in your bag. Im offering $50. Let '
+                              'me know if you are interested. Thanks!',
+                   'sender_name': 'User 1',
+                   'timestamp_ms': 1675549022673}],
+     'participants': [{'name': 'User 1'}, {'name': 'User 2'}],
+     'thread_path': 'inbox/User 1 and User 2 chat',
+     'title': 'User 1 and User 2 chat'}
+loader = JSONLoader(
+    file_path='./example_data/facebook_chat.json',
+    jq_schema='.messages[].content',
+    text_content=False)
+data = loader.load()
+pprint(data)
+    [Document(page_content='Bye!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 1}),
+     Document(page_content='Oh no worries! Bye', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 2}),
+     Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 3}),
+     Document(page_content='I thought you were selling the blue one!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 4}),
+     Document(page_content='Im not interested in this bag. Im interested in the blue one!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 5}),
+     Document(page_content='Here is $129', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 6}),
+     Document(page_content='', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 7}),
+     Document(page_content='Online is at least $100', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 8}),
+     Document(page_content='How much do you want?', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 9}),
+     Document(page_content='Goodmorning! $50 is too low.', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 10}),
+     Document(page_content='Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 11})]
+file_path = './example_data/facebook_chat_messages.jsonl'
+pprint(Path(file_path).read_text())
+    ('{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}\n'
+     '{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no '
+     'worries! Bye"}\n'
+     '{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im '
+     'sorry it was my mistake, the blue one is not for sale"}\n')
+loader = JSONLoader(
+    file_path='./example_data/facebook_chat_messages.jsonl',
+    jq_schema='.content',
+    text_content=False,
+    json_lines=True)
+data = loader.load()
+pprint(data)
+    [Document(page_content='Bye!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
+     Document(page_content='Oh no worries! Bye', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
+     Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
+loader = JSONLoader(
+    file_path='./example_data/facebook_chat_messages.jsonl',
+    jq_schema='.',
+    content_key='sender_name',
+    json_lines=True)
+data = loader.load()
+pprint(data)
+    [Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
+     Document(page_content='User 1', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
+     Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
+.messages[].content
+.messages[]
+# Define the metadata extraction function.
+def metadata_func(record: dict, metadata: dict) -> dict:
+    metadata["sender_name"] = record.get("sender_name")
+    metadata["timestamp_ms"] = record.get("timestamp_ms")
+    return metadata
+loader = JSONLoader(
+    file_path='./example_data/facebook_chat.json',
+    jq_schema='.messages[]',
+    content_key="content",
+    metadata_func=metadata_func
+)
+data = loader.load()
+pprint(data)
+    [Document(page_content='Bye!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 1, 'sender_name': 'User 2', 'timestamp_ms': 1675597571851}),
+     Document(page_content='Oh no worries! Bye', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 2, 'sender_name': 'User 1', 'timestamp_ms': 1675597435669}),
+     Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 3, 'sender_name': 'User 2', 'timestamp_ms': 1675596277579}),
+     Document(page_content='I thought you were selling the blue one!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 4, 'sender_name': 'User 1', 'timestamp_ms': 1675595140251}),
+     Document(page_content='Im not interested in this bag. Im interested in the blue one!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 5, 'sender_name': 'User 1', 'timestamp_ms': 1675595109305}),
+     Document(page_content='Here is $129', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 6, 'sender_name': 'User 2', 'timestamp_ms': 1675595068468}),
+     Document(page_content='', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 7, 'sender_name': 'User 2', 'timestamp_ms': 1675595060730}),
+     Document(page_content='Online is at least $100', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 8, 'sender_name': 'User 2', 'timestamp_ms': 1675595045152}),
+     Document(page_content='How much do you want?', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 9, 'sender_name': 'User 1', 'timestamp_ms': 1675594799696}),
+     Document(page_content='Goodmorning! $50 is too low.', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 10, 'sender_name': 'User 2', 'timestamp_ms': 1675577876645}),
+     Document(page_content='Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 11, 'sender_name': 'User 1', 'timestamp_ms': 1675549022673})]
+# Define the metadata extraction function.
+def metadata_func(record: dict, metadata: dict) -> dict:
+    metadata["sender_name"] = record.get("sender_name")
+    metadata["timestamp_ms"] = record.get("timestamp_ms")
+    if "source" in metadata:
+        source = metadata["source"].split("/")
+        source = source[source.index("langchain"):]
+        metadata["source"] = "/".join(source)
+    return metadata
+loader = JSONLoader(
+    file_path='./example_data/facebook_chat.json',
+    jq_schema='.messages[]',
+    content_key="content",
+    metadata_func=metadata_func
+)
+data = loader.load()
+pprint(data)
+    [Document(page_content='Bye!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 1, 'sender_name': 'User 2', 'timestamp_ms': 1675597571851}),
+     Document(page_content='Oh no worries! Bye', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 2, 'sender_name': 'User 1', 'timestamp_ms': 1675597435669}),
+     Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 3, 'sender_name': 'User 2', 'timestamp_ms': 1675596277579}),
+     Document(page_content='I thought you were selling the blue one!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 4, 'sender_name': 'User 1', 'timestamp_ms': 1675595140251}),
+     Document(page_content='Im not interested in this bag. Im interested in the blue one!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 5, 'sender_name': 'User 1', 'timestamp_ms': 1675595109305}),
+     Document(page_content='Here is $129', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 6, 'sender_name': 'User 2', 'timestamp_ms': 1675595068468}),
+     Document(page_content='', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 7, 'sender_name': 'User 2', 'timestamp_ms': 1675595060730}),
+     Document(page_content='Online is at least $100', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 8, 'sender_name': 'User 2', 'timestamp_ms': 1675595045152}),
+     Document(page_content='How much do you want?', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 9, 'sender_name': 'User 1', 'timestamp_ms': 1675594799696}),
+     Document(page_content='Goodmorning! $50 is too low.', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 10, 'sender_name': 'User 2', 'timestamp_ms': 1675577876645}),
+     Document(page_content='Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 11, 'sender_name': 'User 1', 'timestamp_ms': 1675549022673})]
+JSON        -> [{"text": ...}, {"text": ...}, {"text": ...}]
+jq_schema   -> ".[].text"
+JSON        -> {"key": [{"text": ...}, {"text": ...}, {"text": ...}]}
+jq_schema   -> ".key[].text"
+JSON        -> ["...", "...", "..."]
+jq_schema   -> ".[]"

htmlTemplates.py ADDED Viewed

	@@ -0,0 +1,44 @@

+css = '''
+<style>
+.chat-message {
+    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
+}
+.chat-message.user {
+    background-color: #2b313e
+}
+.chat-message.bot {
+    background-color: #475063
+}
+.chat-message .avatar {
+  width: 20%;
+}
+.chat-message .avatar img {
+  max-width: 78px;
+  max-height: 78px;
+  border-radius: 50%;
+  object-fit: cover;
+}
+.chat-message .message {
+  width: 80%;
+  padding: 0 1.5rem;
+  color: #fff;
+}
+'''
+# HTML snippet for a bot message; "{{MSG}}" marks where the message text goes.
+bot_template = '''
+<div class="chat-message bot">
+    <div class="avatar">
+        <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
+    </div>
+    <div class="message">{{MSG}}</div>
+</div>
+'''
+# HTML snippet for a user message; "{{MSG}}" marks where the message text goes.
+user_template = '''
+<div class="chat-message user">
+    <div class="avatar">
+        <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
+    </div>
+    <div class="message">{{MSG}}</div>
+</div>
+'''

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+langchain
+llama-cpp-python
+PyPDF2==3.0.1
+faiss-cpu==1.7.4
+ctransformers
+pypdf
+chromadb
+tiktoken
+pysqlite3-binary
+streamlit-extras
+InstructorEmbedding
+sentence-transformers
+jq
+openai