araeyn committed on
Commit
5de72d6
·
1 Parent(s): 4109df4
Files changed (6) hide show
  1. app.py +131 -0
  2. chs.json +0 -0
  3. crawler.py +0 -0
  4. database.zip +3 -0
  5. parse.py +67 -0
  6. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ from websockets.server import serve
4
+
5
+ from langchain.vectorstores import Chroma
6
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain_huggingface.llms import HuggingFaceEndpoint
9
+ from langchain.document_loaders import TextLoader
10
+ from langchain.document_loaders import DirectoryLoader
11
+ from langchain import hub
12
+ from langchain_core.runnables import RunnablePassthrough
13
+ from langchain_core.output_parsers import StrOutputParser
14
+ from langchain.chains import create_history_aware_retriever
15
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
16
+ from langchain.chains import create_retrieval_chain
17
+ from langchain.chains.combine_documents import create_stuff_documents_chain
18
+ from langchain_core.runnables.history import RunnableWithMessageHistory
19
+ from langchain_core.chat_history import BaseChatMessageHistory
20
+ from langchain_community.chat_message_histories import ChatMessageHistory
21
+
22
# Load the files matched by the "./*.txt" glob under ./database as text documents.
loader = DirectoryLoader(
    './database',
    glob="./*.txt",
    loader_cls=TextLoader,
)
documents = loader.load()

# Split the loaded documents into chunks of size 1000 with an overlap of 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

persist_directory = 'db'
embedding = HuggingFaceEmbeddings()

# Build the Chroma store from the chunks and persist it under `persist_directory`.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)
vectordb.persist()

# Release the freshly built handle, then reopen the store from the persisted
# directory with the same embedding function.
vectordb = None
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)
42
+
43
def format_docs(docs):
    """Return the ``page_content`` of every document, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents joined by ``"\\n\\n"``; empty when
        ``docs`` is empty.
    """
    separator = "\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)
45
+
46
# Retriever view over the Chroma store, the prompt pulled from the LangChain
# hub, and the hosted Mixtral endpoint used as the LLM.
retriever = vectordb.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
llm = HuggingFaceEndpoint(repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1")

# Single-turn RAG pipeline: the retrieved documents are flattened by
# `format_docs` into "context", the raw input is passed through as "question".
# NOTE(review): `rag_chain` is rebound further down in this file, so this
# first pipeline looks unused afterwards; confirm before removing it.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)
55
+
56
# System instruction for rewriting the latest user question into a standalone
# question, using the chat history only as context.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Retriever that first reformulates the question with the LLM, then queries
# the underlying retriever.
history_aware_retriever = create_history_aware_retriever(
    llm,
    retriever,
    contextualize_q_prompt,
)

# System instruction for the answering step; "{context}" is the placeholder
# for the retrieved documents.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])
84
+
85
# In-memory chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Args:
        session_id: Key identifying the conversation in ``store``.

    Returns:
        The ``ChatMessageHistory`` kept in ``store`` for that key.
    """
    try:
        return store[session_id]
    except KeyError:
        history = ChatMessageHistory()
        store[session_id] = history
        return history
91
+
92
# Answering step: stuffs the retrieved documents into `qa_prompt` for the LLM.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# Full retrieval chain: history-aware retrieval followed by the answering step.
rag_chain = create_retrieval_chain(
    history_aware_retriever,
    question_answer_chain,
)

# Wrap the chain so each session id reads and writes its own message history.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

# Startup banner, printed once the chains have been built.
print("-------", "started", "-------", sep="\n")
106
+
107
def _load_user_data(path="userData.json"):
    """Return the per-token records stored at ``path`` (``{}`` if unusable)."""
    try:
        with open(path, "r", encoding="utf-8") as handle:
            loaded = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
    return loaded if isinstance(loaded, dict) else {}


def _save_user_data(user_data, path="userData.json"):
    """Write the per-token records back to ``path`` as JSON."""
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(user_data, handle)


async def echo(websocket):
    """Answer chat requests arriving on ``websocket``.

    Each incoming frame must be a JSON object with a ``"message"`` (the user
    question) and a ``"token"`` (used as the chat session id). For every valid
    request the retrieved documents are recorded under the token in
    ``userData.json`` and the chain's answer is sent back as
    ``{"response": ...}``.

    The handler returns, ending the connection handler, on the first frame
    that is not valid JSON, is not a JSON object, or lacks one of the two
    keys.

    Args:
        websocket: The connection; iterated for incoming messages and used to
            send the replies.
    """
    async for message in websocket:
        try:
            data = json.loads(message)
        except (json.JSONDecodeError, UnicodeDecodeError):
            return
        # Validate the parsed payload. Checking the raw text instead would
        # accept any frame that merely mentions the words "message"/"token".
        if not isinstance(data, dict):
            return
        if "message" not in data or "token" not in data:
            return
        m = data["message"]
        token = data["token"]

        docs = retriever.get_relevant_documents(m)

        # Record the retrieved documents for this token. The file is read in
        # "r" mode (opening it with "w" would truncate it before reading) and
        # then written back. JSON object keys are strings, hence str(token).
        userData = _load_user_data()
        record = userData.get(str(token))
        if not isinstance(record, dict):
            record = {}
        record["docs"] = str(docs)
        userData[str(token)] = record
        _save_user_data(userData)

        # NOTE(review): this is a synchronous call inside the async handler,
        # so other connections wait while the chain runs.
        response = conversational_rag_chain.invoke(
            {"input": m},
            config={
                "configurable": {"session_id": token}
            },
        )["answer"]
        await websocket.send(json.dumps({"response": response}))
126
+
127
async def main():
    """Serve ``echo`` on 0.0.0.0:7860 and keep the server open."""
    async with serve(echo, "0.0.0.0", 7860):
        # Nothing in this function resolves the future, so awaiting it keeps
        # the server context open.
        keep_running = asyncio.Future()
        await keep_running


asyncio.run(main())
chs.json ADDED
The diff for this file is too large to render. See raw diff
 
crawler.py ADDED
File without changes
database.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93f27b61a3f0f03c0bdca772695ca92d99a4e037d0a7b2d08b71b0eb09cc33c9
3
+ size 253849
parse.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
# Configuration
name = "chs.json"
outputFolder = "database"
deleteKeys = [
    "images",
    "tags",
    "html",
]
typeScrape = {
    "article": "text",
    "event": "description",
    "list": "items",
}

# Optional fields of one "list" entry, in the order they are appended after
# the entry's title and summary.
LIST_ENTRY_FIELDS = (
    "fsElementContent",
    "fsElementFooterContent",
    "fsElementHeaderContent",
)


def list_entry_text(pair):
    """Return the text block for one entry of a "list" item.

    The block is the entry title followed by its summary (with the title
    removed from it) and the optional ``fsElement*`` fields, each on its own
    line. A field is left out when it equals the title once spaces are
    ignored. Entries without a ``"title"`` are handled as having an empty
    title instead of raising ``KeyError``.

    Args:
        pair: Mapping describing one list entry.

    Returns:
        The text to append to the output file; ``""`` when nothing applies.
    """
    title = pair.get("title", "")
    squashed_title = title.replace(" ", "")
    text = "\n" + title if "title" in pair else ""
    if "summary" in pair:
        summary = pair["summary"]
        if summary.replace(" ", "") != squashed_title:
            text += "\n" + summary.replace(title, "")
    for field in LIST_ENTRY_FIELDS:
        if field in pair:
            value = pair[field]
            if value.replace(" ", "") != squashed_title:
                text += "\n" + value
    return text


def item_chunks(item, field):
    """Yield, in order, the strings to write for ``item``.

    Args:
        item: One record; ``item["type"]`` selects the layout.
        field: Key of ``item`` holding the content (a value of ``typeScrape``).

    Yields:
        For "list" items the item title (``""`` if absent) followed by every
        non-empty entry block; for other types the non-empty content of
        ``item[field]``.
    """
    if item["type"] == "list":
        yield item.get("title", "")
        for pair in item[field]:
            text = list_entry_text(pair)
            if text != "":
                yield text
    else:
        text = item[field]
        if text != "":
            yield text


def export_items(items, output_folder=None):
    """Strip ``deleteKeys`` from every item and write one text file per item.

    Only items whose ``"type"`` is a key of ``typeScrape`` are exported, to
    ``<output_folder>/chs-<type>-<n>.txt`` where ``n`` counts exported items
    from 1. Files are opened in append mode, so re-running the export against
    an existing folder adds to the files already there.

    Args:
        items: List of record dicts; modified in place.
        output_folder: Destination directory, created if missing. Defaults to
            ``outputFolder``.

    Returns:
        The number of items for which a file was opened.
    """
    if output_folder is None:
        output_folder = outputFolder
    os.makedirs(output_folder, exist_ok=True)

    written = 0
    for index, item in enumerate(items):
        for key in deleteKeys:
            item.pop(key, None)

        item_type = item.get("type")
        field = typeScrape.get(item_type) if isinstance(item_type, str) else None
        if field is None:
            continue

        written += 1
        path = f"{output_folder}/chs-{item_type}-{written}.txt"
        with open(path, "a") as out:
            try:
                for chunk in item_chunks(item, field):
                    out.write(chunk)
            except (KeyError, TypeError, AttributeError) as exc:
                # Best effort: keep what was written so far and report the
                # malformed record instead of hiding it.
                print(f"warning: item {index} ({item_type}) only partially exported: {exc!r}")
    return written


def main():
    """Export the records in ``name`` and rewrite that file without ``deleteKeys``."""
    with open(name, "r") as source:
        data = json.load(source)
    export_items(data)
    with open(name, "w") as target:
        json.dump(data, target, indent=6)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ websockets
2
+ langchain
3
+ langchain-community
4
+ huggingface_hub
5
+ tiktoken
6
+ chromadb
7
+ langchain-huggingface
8
+ accelerate