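"""Conversational question answering over the TokCast podcast transcripts.

Condenses follow-up questions using the chat history, retrieves matching
transcript chunks from a FAISS vector store, and answers with cited sources.
"""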
from typing import Dict, List, Tuple
from langchain import OpenAI, PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.base import Chain
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import FewShotPromptTemplate
# from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain.vectorstores import FAISS
from pydantic import BaseModel
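

# CustomChain wires together a FAISS vector store, a combine-documents QA
# chain, and a question-condensing chain into a two-step flow: rewrite the
# follow-up into a standalone question, then retrieve and answer over the
# matching transcript chunks.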
class CustomChain(Chain, BaseModel):
    vstore: FAISS
    chain: BaseCombineDocumentsChain
    key_word_extractor: Chain

    @property
    def input_keys(self) -> List[str]:
        # _call reads both keys, so both are required inputs.
        return ["question", "chat_history"]

    @property
    def output_keys(self) -> List[str]:
        return ["answer", "sources"]
    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        question = inputs["question"]
        chat_history_str = _get_chat_history(inputs["chat_history"])
        if chat_history_str:
            # Rewrite the follow-up into a standalone question so that
            # similarity search is not derailed by pronouns like "that".
            new_question = self.key_word_extractor.run(
                question=question, chat_history=chat_history_str
            )
        else:
            new_question = question
        docs = self.vstore.similarity_search(new_question, k=3)
        new_inputs = inputs.copy()
        new_inputs["question"] = new_question
        new_inputs["chat_history"] = chat_history_str
        answer, _ = self.chain.combine_docs(docs, **new_inputs)
        # The prompt instructs the model to append "SOURCES: ..."; split that
        # suffix off so the answer and the source list are returned separately.
        sources: List[str] = []
        if "SOURCES:" in answer:
            answer, sources_str = answer.split("SOURCES:", 1)
            sources = [s.strip() for s in sources_str.split(",")]
        answer = answer.strip()
        return {"answer": answer, "sources": sources}
def get_chain(vectorstore: FAISS) -> Chain:
    _eg_template = """## Example:
Chat History:
{chat_history}
Follow Up question: {question}
Standalone question: {answer}"""
    _eg_prompt = PromptTemplate(
        template=_eg_template,
        input_variables=["chat_history", "question", "answer"],
    )
_prefix = """Given the following Chat History and a Follow Up Question, rephrase the Follow Up Question to be a new Standalone Question that takes the Chat History and context in to consideration. You should assume that the question is related to the TokCast podcast."""
_suffix = """## Example:
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
# example_selector = SemanticSimilarityExampleSelector(
# vectorstore=vectorstore,
# k=4,
# )
    examples = [
        {
            "question": "What is the TokCast podcast?",
            "chat_history": [],
            "answer": "TokCast is a podcast about the philosophy of David Deutsch.",
        },
        {
            "question": "Who is that?",
            "chat_history": "Human: What is the TokCast podcast?\nAssistant: TokCast is a podcast about the philosophy of David Deutsch.",
            "answer": "Who is David Deutsch?",
        },
        {
            "question": "What is the worldview presented here?",
            "chat_history": "Human: What is the TokCast podcast?\nAssistant: TokCast is a podcast about the philosophy of David Deutsch.\nHuman: Who is that?\nAssistant: David Deutsch is a philosopher, physicist, and author. He is the author of The Beginning of Infinity, Fabric of Reality, and one of the pioneers of the field of quantum computing.",
            "answer": "What is David Deutsch's worldview?",
        },
    ]
    prompt = FewShotPromptTemplate(
        prefix=_prefix,
        suffix=_suffix,
        # example_selector=example_selector,
        examples=examples,
        example_prompt=_eg_prompt,
        input_variables=["question", "chat_history"],
    )
    llm = OpenAI(temperature=0, model_name="text-davinci-003")
    key_word_extractor = LLMChain(llm=llm, prompt=prompt, verbose=True)
    EXAMPLE_PROMPT = PromptTemplate(
        template="CONTENT:\n{page_content}\n----------\nSOURCE:\n{source}\n",
        input_variables=["page_content", "source"],
    )
template = """You are an AI assistant for the TokCast Podcast. You're trained on all the transcripts of the podcast.
Given a QUESTION and a series one or more CONTENT and SOURCE sections from a long document provide a conversational answer as "ANSWER" and a "SOURCES" output which lists verbatim the SOURCEs used in generating the response.
You should only use SOURCEs that are explicitly listed as a SOURCE in the context.
ALWAYS include the "SOURCES" as part of the response. If you don't have any sources, just say "SOURCES:"
If you don't know the answer, just say "I'm not sure. Check out Brett's Channel" Don't try to make up an answer.
QUESTION: {question}
=========
{context}
=========
ANSWER:"""
    PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
    doc_chain = load_qa_chain(
        # max_tokens=-1 lets the wrapper allot the remaining context window
        # to the completion instead of a fixed token budget.
        OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=-1),
        chain_type="stuff",  # "stuff" packs all retrieved chunks into one prompt
        prompt=PROMPT,
        document_prompt=EXAMPLE_PROMPT,
        verbose=True,
    )
    return CustomChain(
        chain=doc_chain,
        vstore=vectorstore,
        key_word_extractor=key_word_extractor,
        verbose=True,
    )
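

# Flattens (human, ai) message pairs into the "Human: ... / Assistant: ..."
# transcript format that the prompts above expect.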
def _get_chat_history(chat_history: List[Tuple[str, str]]) -> str:
    buffer = ""
    for human_s, ai_s in chat_history:
        human = "Human: " + human_s
        ai = "Assistant: " + ai_s
        buffer += "\n" + "\n".join([human, ai])
    return buffer
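

# Minimal usage sketch, assuming a FAISS index over the transcript chunks has
# been built and pickled elsewhere (e.g. by an ingest script). The
# "vectorstore.pkl" path and the example question are placeholders.
if __name__ == "__main__":
    import pickle

    with open("vectorstore.pkl", "rb") as f:
        vstore = pickle.load(f)
    qa = get_chain(vstore)
    result = qa({"question": "What is the TokCast podcast?", "chat_history": []})
    print(result["answer"])
    print(result["sources"])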