Commit · 9643fb1
1 parent: 95656f5
Initial

Files changed:
- .gitignore +1 -0
- app.py +149 -0
- chain.py +140 -0
- requirements.txt +81 -0
- store/tok_doc_idx.json +0 -0
- store/tok_docs.json +0 -0
.gitignore
ADDED
@@ -0,0 +1 @@
env
app.py
ADDED
@@ -0,0 +1,149 @@
import json
import os
from pathlib import Path

from langchain.docstore.document import Document
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

import gradio as gr

from chain import get_chain

STORE_DIR = "store"
YOUTUBE_EMBED_TEMPLATE = """
<iframe width="354" height="200" src="{source}" title="YouTube video player" frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen>
</iframe>"""


def load_store():
    """Rebuild the FAISS vector store from the prebuilt artifacts in STORE_DIR."""

    def keys_to_int(x):
        # JSON object keys come back as strings; the index mapping needs ints.
        return {int(k): v for k, v in x.items()}

    def _read_index(path):
        import faiss

        return faiss.read_index(str(path))

    index_path = list(Path(STORE_DIR).glob("*.faiss"))
    if len(index_path) == 0:
        raise ValueError("No index found in path")

    index_path = index_path[0]
    index_name = index_path.name.split(".")[0]

    with open(os.path.join(STORE_DIR, f"{index_name}_doc_idx.json"), "r") as f:
        index_to_id = json.load(f, object_hook=keys_to_int)

    with open(os.path.join(STORE_DIR, f"{index_name}_docs.json"), "r") as f:
        docs = json.load(f)

    embeddings = OpenAIEmbeddings()
    return FAISS(
        embedding_function=embeddings.embed_query,
        index=_read_index(index_path),
        docstore=InMemoryDocstore(
            {index_to_id[i]: Document(**doc) for i, doc in enumerate(docs.values())}
        ),
        index_to_docstore_id=index_to_id,
    )


def set_openai_api_key(api_key, agent):
    """Build the QA chain when a key is pasted; the key is cleared from the env afterwards."""
    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key
        vstore = load_store()
        qa_chain = get_chain(vstore)
        os.environ["OPENAI_API_KEY"] = ""
        return qa_chain


def _to_embed(link):
    # Convert a YouTube watch link (with an optional &t= offset) into an embeddable URL.
    return link.replace("watch?v=", "embed/").replace("&t=", "?start=")


def chat(inp, history, agent):
    history = history or []
    if agent is None:
        history.append((inp, "Please paste your OpenAI key to use"))
        # Return an empty sources pane as well, since this handler is wired to three outputs.
        return history, history, ""
    output = agent({"question": inp, "chat_history": history})
    answer = output["answer"]
    history.append((inp, answer))
    source_iframes = []
    for source in output["sources"]:
        if "youtube.com" in source:
            source_iframes.append(
                YOUTUBE_EMBED_TEMPLATE.format(source=_to_embed(source))
            )
    source_html = f"""<div style='min-height:200px;display:flex;align-items:center;justify-content:space-around;'>
    {''.join(source_iframes)}
    </div>"""
    return history, history, source_html


block = gr.Blocks(css=".gradio-container {background-color: lightgray}")
with block:
    gr.Markdown("<h3><center>ToKBot🤖 - Ask ToKCast Questions</center></h3>")
    openai_api_key_textbox = gr.Textbox(
        placeholder="Paste your OpenAI API key (sk-...)",
        show_label=False,
        lines=1,
        type="password",
    )

    chatbot = gr.Chatbot()
    gr.Markdown("<h3>Excerpts</h3>")
    sources = gr.HTML(
        """<div style="min-height:200px;display:flex;align-items:center;justify-content:center;">
        <h3 style="text-align:center;color:#555;font-size:2rem;">No videos</h3>
        </div>"""
    )
    with gr.Row():
        message = gr.Textbox(
            label="What's your question?",
            placeholder="Type your question here...",
            lines=1,
        )
        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)

    gr.Examples(
        examples=[
            "What is a beginning of infinity?",
            "How do memes differ from genes in how they replicate?",
            "What is the nature of knowledge and how does it grow?",
        ],
        inputs=message,
    )

    gr.HTML(
        """A GPT-3/LangChain bot that answers questions about the TokCast podcast and provides relevant video excerpts"""
    )

    gr.HTML(
        "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
    )

    state = gr.State()
    agent_state = gr.State()

    submit.click(
        chat,
        inputs=[message, state, agent_state],
        outputs=[chatbot, state, sources],
    )
    message.submit(
        chat,
        inputs=[message, state, agent_state],
        outputs=[chatbot, state, sources],
    )

    openai_api_key_textbox.change(
        set_openai_api_key,
        inputs=[openai_api_key_textbox, agent_state],
        outputs=[agent_state],
    )

block.launch(debug=True)
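Note on the store/ artifacts: load_store() above expects three prebuilt files under store/ — a *.faiss index, <name>_doc_idx.json (integer position to docstore id), and <name>_docs.json (serialized Document fields). This commit ships the JSON files but not the script that produced them. The sketch below is a hypothetical reconstruction of such a build step, assuming the store was created with LangChain's FAISS.from_texts; the build_store name and its arguments are illustrative and not part of this repository.

# Hypothetical ingestion sketch (not part of this commit): writes the three
# artifacts that load_store() in app.py expects to find under store/.
import json

import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS


def build_store(texts, metadatas, name="tok", store_dir="store"):
    vstore = FAISS.from_texts(texts, OpenAIEmbeddings(), metadatas=metadatas)

    # <name>.faiss -- the raw FAISS index, read back via faiss.read_index()
    faiss.write_index(vstore.index, f"{store_dir}/{name}.faiss")

    # <name>_doc_idx.json -- position -> docstore id (int keys become strings in
    # JSON, which is why load_store() converts them back with keys_to_int)
    with open(f"{store_dir}/{name}_doc_idx.json", "w") as f:
        json.dump(vstore.index_to_docstore_id, f)

    # <name>_docs.json -- docstore id -> Document fields, reloaded as Document(**doc).
    # Insertion order matches the index order, which load_store() relies on when
    # it enumerates docs.values().
    with open(f"{store_dir}/{name}_docs.json", "w") as f:
        json.dump(
            {
                doc_id: {"page_content": d.page_content, "metadata": d.metadata}
                for doc_id, d in vstore.docstore._dict.items()
            },
            f,
        )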
chain.py
ADDED
@@ -0,0 +1,140 @@
from typing import Dict, List, Tuple

from langchain import OpenAI, PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.base import Chain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import FewShotPromptTemplate

# from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain.vectorstores import FAISS
from pydantic import BaseModel


class CustomChain(Chain, BaseModel):
    """Condense the follow-up question, retrieve transcript chunks, then answer with sources."""

    vstore: FAISS
    chain: BaseCombineDocumentsChain
    key_word_extractor: Chain

    @property
    def input_keys(self) -> List[str]:
        return ["question"]

    @property
    def output_keys(self) -> List[str]:
        return ["answer", "sources"]

    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        question = inputs["question"]
        chat_history_str = _get_chat_history(inputs["chat_history"])
        # Rewrite the follow-up into a standalone question when there is history.
        if chat_history_str:
            new_question = self.key_word_extractor.run(
                question=question, chat_history=chat_history_str
            )
        else:
            new_question = question
        docs = self.vstore.similarity_search(new_question, k=3)
        new_inputs = inputs.copy()
        new_inputs["question"] = new_question
        new_inputs["chat_history"] = chat_history_str
        answer, _ = self.chain.combine_docs(docs, **new_inputs)
        # Split the "SOURCES:" section off the generated answer, if present.
        sources = ""
        if "SOURCES:" in answer:
            answer, sources = answer.split("SOURCES:")
            sources = sources.split(", ")
            answer = answer.strip()
        return {"answer": answer, "sources": sources}


def get_chain(vectorstore: FAISS) -> Chain:
    _eg_template = """## Example:

Chat History:
{chat_history}
Follow Up question: {question}
Standalone question: {answer}"""
    _eg_prompt = PromptTemplate(
        template=_eg_template,
        input_variables=["chat_history", "question", "answer"],
    )

    _prefix = """Given the following Chat History and a Follow Up Question, rephrase the Follow Up Question to be a new Standalone Question that takes the Chat History and context into consideration. You should assume that the question is related to the TokCast podcast."""
    _suffix = """## Example:

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
    # example_selector = SemanticSimilarityExampleSelector(
    #     vectorstore=vectorstore,
    #     k=4,
    # )

    examples = [
        {
            "question": "What is the TokCast podcast?",
            "chat_history": [],
            "answer": "TokCast is a podcast about the philosophy of David Deutsch.",
        },
        {
            "question": "Who is that?",
            "chat_history": "Human: What is the TokCast podcast?\nAssistant: TokCast is a podcast about the philosophy of David Deutsch.",
            "answer": "Who is David Deutsch?",
        },
        {
            "question": "What is the worldview presented here?",
            "chat_history": "Human: What is the TokCast podcast?\nAssistant: TokCast is a podcast about the philosophy of David Deutsch.\nHuman: Who is that?\nAssistant: David Deutsch is a philosopher, physicist, and author. He is the author of The Beginning of Infinity, Fabric of Reality, and one of the pioneers of the field of quantum computing.",
            "answer": "What is David Deutsch's worldview?",
        },
    ]
    prompt = FewShotPromptTemplate(
        prefix=_prefix,
        suffix=_suffix,
        # example_selector=example_selector,
        examples=examples,
        example_prompt=_eg_prompt,
        input_variables=["question", "chat_history"],
    )
    llm = OpenAI(temperature=0, model_name="text-davinci-003")
    key_word_extractor = LLMChain(llm=llm, prompt=prompt, verbose=True)

    EXAMPLE_PROMPT = PromptTemplate(
        template="CONTENT:\n{page_content}\n----------\nSOURCE:\n{source}\n",
        input_variables=["page_content", "source"],
    )
    template = """You are an AI assistant for the TokCast Podcast. You're trained on all the transcripts of the podcast.
Given a QUESTION and a series of one or more CONTENT and SOURCE sections from a long document, provide a conversational answer as "ANSWER" and a "SOURCES" output which lists verbatim the SOURCEs used in generating the response.
You should only use SOURCEs that are explicitly listed as a SOURCE in the context.
ALWAYS include the "SOURCES" as part of the response. If you don't have any sources, just say "SOURCES:".
If you don't know the answer, just say "I'm not sure. Check out Brett's Channel." Don't try to make up an answer.
QUESTION: {question}
=========
{context}
=========
ANSWER:"""
    PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
    doc_chain = load_qa_chain(
        OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=-1),
        chain_type="stuff",
        prompt=PROMPT,
        document_prompt=EXAMPLE_PROMPT,
        verbose=True,
    )
    return CustomChain(
        chain=doc_chain,
        vstore=vectorstore,
        key_word_extractor=key_word_extractor,
        verbose=True,
    )


def _get_chat_history(chat_history: List[Tuple[str, str]]):
    buffer = ""
    for human_s, ai_s in chat_history:
        human = "Human: " + human_s
        ai = "Assistant: " + ai_s
        buffer += "\n" + "\n".join([human, ai])
    return buffer
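For reference, the chain returned by get_chain() is driven exactly as app.py's chat() handler does it: one call per turn with the raw question and the accumulated (human, assistant) history. Below is a minimal sketch of that calling convention; the ask() helper and its vstore parameter are illustrative, not part of this commit.

# Minimal usage sketch (not part of this commit), mirroring how app.py's chat()
# handler drives the chain. `vstore` is whatever FAISS store load_store() in
# app.py produced.
from typing import List, Tuple

from chain import get_chain


def ask(vstore, question: str, history: List[Tuple[str, str]]):
    qa_chain = get_chain(vstore)
    # The chain takes the question plus the (human, assistant) history and returns
    # {"answer": ..., "sources": ...}; sources is parsed from the "SOURCES:" section
    # of the completion ("" when none is present).
    output = qa_chain({"question": question, "chat_history": history})
    return output["answer"], output["sources"]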
requirements.txt
ADDED
@@ -0,0 +1,81 @@
aiofiles==22.1.0
aiohttp==3.8.3
aiosignal==1.3.1
altair==4.2.0
anyio==3.6.2
async-timeout==4.0.2
attrs==22.2.0
autopep8==2.0.1
black==22.12.0
blobfile==2.0.1
certifi==2022.12.7
charset-normalizer==2.1.1
click==8.1.3
contourpy==1.0.7
cycler==0.11.0
entrypoints==0.4
faiss-cpu==1.7.3
fastapi==0.89.1
ffmpy==0.3.0
filelock==3.9.0
flake8==6.0.0
fonttools==4.38.0
frozenlist==1.3.3
fsspec==2023.1.0
gradio==3.16.2
greenlet==2.0.1
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
idna==3.4
Jinja2==3.1.2
jsonschema==4.17.3
kiwisolver==1.4.4
langchain==0.0.65
linkify-it-py==1.0.3
lxml==4.9.2
markdown-it-py==2.1.0
MarkupSafe==2.1.2
matplotlib==3.6.3
mccabe==0.7.0
mdit-py-plugins==0.3.3
mdurl==0.1.2
multidict==6.0.4
mypy-extensions==0.4.3
numpy==1.24.1
openai==0.26.1
orjson==3.8.5
packaging==23.0
pandas==1.5.3
pathspec==0.10.3
Pillow==9.4.0
platformdirs==2.6.2
pycodestyle==2.10.0
pycryptodome==3.16.0
pycryptodomex==3.16.0
pydantic==1.10.4
pydub==0.25.1
pyflakes==3.0.1
pyparsing==3.0.9
pyrsistent==0.19.3
python-dateutil==2.8.2
python-multipart==0.0.5
pytz==2022.7.1
PyYAML==6.0
regex==2022.10.31
requests==2.28.2
rfc3986==1.5.0
six==1.16.0
sniffio==1.3.0
SQLAlchemy==1.4.46
starlette==0.22.0
tiktoken==0.1.2
tomli==2.0.1
toolz==0.12.0
tqdm==4.64.1
typing_extensions==4.4.0
uc-micro-py==1.0.1
urllib3==1.26.14
uvicorn==0.20.0
websockets==10.4
yarl==1.8.2
store/tok_doc_idx.json
ADDED
The diff for this file is too large to render.
See raw diff
store/tok_docs.json
ADDED
The diff for this file is too large to render.
See raw diff
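The two store/*.json diffs are too large to render here, but load_store() in app.py implies their shapes: tok_doc_idx.json maps integer positions (serialized as string keys) to docstore ids, and tok_docs.json maps ids to Document fields whose metadata carries the YouTube source link used for the embeds. The excerpt below is purely illustrative, with hypothetical values, and is not the actual file contents.

# Hypothetical shapes, inferred from load_store() in app.py and the
# document_prompt in chain.py; the ids and URL are placeholders.
doc_idx_example = {"0": "b2c4-...", "1": "7f9a-..."}  # position -> docstore id
docs_example = {
    "b2c4-...": {
        "page_content": "...transcript chunk...",
        "metadata": {"source": "https://www.youtube.com/watch?v=..."},
    },
}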