import json from collections import defaultdict import openai import re from config import CFG_APP from text_embedder import SentenceTransformersTextEmbedder from datetime import datetime import tiktoken doc_metadata = json.load(open(CFG_APP.DOC_METADATA_PATH, "r")) # Embedding Model if "sentence-transformers" in CFG_APP.EMBEDDING_MODEL: text_embedder = SentenceTransformersTextEmbedder( model_name=CFG_APP.EMBEDDING_MODEL, paragraphs_path=CFG_APP.DATA_FOLDER, device=CFG_APP.DEVICE, load_existing_index=True, ) else: raise ValueError("Embedding model not found !") # Util Functions def retrieve_doc_metadata(doc_metadata, doc_id): for meta in doc_metadata: if meta["id"] == doc_id: return meta def get_reformulation_prompt(query: str) -> list: return [ { "role": "user", "content": f"""{CFG_APP.REFORMULATION_PROMPT} --- query: {query} standalone question: """, } ] def get_hyde_prompt(query: str) -> list: return [ { "role": "user", "content": f"""{CFG_APP.HYDE_PROMPT} --- query: {query} output: """, } ] def make_pairs(lst): """From a list of even lenght, make tupple pairs Args: lst (list): a list of even lenght Returns: list: the list as tupple pairs """ assert not (l := len(lst) % 2), f"your list is of lenght {l} which is not even" return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)] def make_html_source(paragraph, meta_doc, i): content = paragraph["content"] meta_paragraph = paragraph["meta"] return f"""

Excerpts {i} - Document {meta_doc['num_doc']} - Page {meta_paragraph['page_number']}

{content}

""" def make_citations_source(citation_dic, query, Hyde: False): citation_list = [f'Doc {values[0]} - {keys} (excerpts {values[1]})' for keys, values in citation_dic.items()] html_output = '
\n' html_output += '
Sources
\n' if Hyde : html_output += f'
Query used for retrieval (with the HyDE technique after no response): {query}
\n' else : html_output += f'
Query used for retrieval: {query}
\n' html_output += '
\n' html_output += ' \n' html_output += '
\n' return html_output def preprocess_message(text: str, docs_url: dict) -> str: return re.sub( r"\[doc (\d+)\]", lambda match: f'{match.group(0)}', text, ) def parse_glossary(query): file = "glossary.json" glossary = json.load(open(file, "r")) words_query = query.split(" ") for i, word in enumerate(words_query): for key in glossary.keys(): if word.lower() == key.lower(): words_query[i] = words_query[i] + f" ({glossary[key]})" return " ".join(words_query) def num_tokens_from_string(string: str, encoding_name: str) -> int: encoding = tiktoken.encoding_for_model(encoding_name) num_tokens = len(encoding.encode(string)) return num_tokens def chat( query: str, history: list, threshold: float = CFG_APP.THRESHOLD, k_total: int = CFG_APP.K_TOTAL, ) -> tuple: """retrieve relevant documents in the document store then query gpt-turbo Args: query (str): user message. history (list, optional): history of the conversation. Defaults to [system_template]. report_type (str, optional): should be "All available" or "IPCC only". Defaults to "All available". threshold (float, optional): similarity threshold, don't increase more than 0.568. Defaults to 0.56. Yields: tuple: chat gradio format, chat openai format, sources used. """ reformulated_query = openai.ChatCompletion.create( model=CFG_APP.MODEL_NAME, messages=get_reformulation_prompt(parse_glossary(query)), temperature=0, max_tokens=CFG_APP.MAX_TOKENS_REF_QUESTION, ) reformulated_query = reformulated_query["choices"][0]["message"]["content"] if len(reformulated_query.split("\n")) == 2: reformulated_query, language = reformulated_query.split("\n") language = language.split(":")[1].strip() else: reformulated_query = reformulated_query.split("\n")[0] language = "English" sources, scores = text_embedder.retrieve_faiss( reformulated_query, k_total=k_total, threshold=threshold, ) if CFG_APP.DEBUG == True: print("Scores : \n", scores) messages = history + [{"role": "user", "content": query}] docs_url = defaultdict(str) if len(sources) > 0: docs_string = [] docs_html = [] citations = {} num_tokens = num_tokens_from_string(CFG_APP.SOURCES_PROMPT, CFG_APP.MODEL_NAME) num_doc = 1 for i, data in enumerate(sources, 1): meta_doc = retrieve_doc_metadata(doc_metadata, data["meta"]["document_id"]) doc_content = f"📃 Doc {i}: \n{data['content']}" num_tokens_doc = num_tokens_from_string(doc_content, CFG_APP.MODEL_NAME) if num_tokens + num_tokens_doc > CFG_APP.MAX_TOKENS_API: break num_tokens += num_tokens_doc docs_string.append(doc_content) if meta_doc['short_name'] in citations.keys(): citations[meta_doc['short_name']][1] += f', {i}' else : citations[meta_doc['short_name']] = [num_doc, f'{i}'] num_doc += 1 meta_doc["num_doc"] = citations[meta_doc['short_name']][0] docs_html.append(make_html_source(data, meta_doc, i)) url_doc = f'' docs_url[i] = url_doc html_cit = [make_citations_source(citations, reformulated_query, Hyde=False)] docs_string = "\n\n".join( [f"Query used for retrieval:\n{reformulated_query}"] + docs_string) docs_html = "\n\n".join(html_cit + docs_html) messages.append( { "role": "system", "content": f"{CFG_APP.SOURCES_PROMPT}\n\n{docs_string}\n\nAnswer in {language}:", } ) if CFG_APP.DEBUG == True: print(f" 👨‍💻 question asked by the user : {query}") print(f" 🕛 time : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") print(" 🔌 messages sent to the API :") api_messages = [ {"role": "system", "content": CFG_APP.INIT_PROMPT}, {"role": "user", "content": reformulated_query}, { "role": "system", "content": f"{CFG_APP.SOURCES_PROMPT}\n\n{docs_string}\n\nAnswer in {language}:", }, ] for message in api_messages: print( f"length : {len(message['content'])}, content : {message['content']}" ) response = openai.ChatCompletion.create( model=CFG_APP.MODEL_NAME, messages=[ {"role": "system", "content": CFG_APP.INIT_PROMPT}, {"role": "user", "content": reformulated_query}, { "role": "system", "content": f"{CFG_APP.SOURCES_PROMPT}\n\nVery important : Answer in {language}.\n\n{docs_string}:", }, ], temperature=0, # deterministic stream=True, max_tokens=CFG_APP.MAX_TOKENS_ANSWER, ) complete_response = "" messages.pop() messages.append({"role": "assistant", "content": complete_response}) for chunk in response: chunk_message = chunk["choices"][0]["delta"].get("content") if chunk_message: complete_response += chunk_message complete_response = preprocess_message(complete_response, docs_url) messages[-1]["content"] = complete_response gradio_format = make_pairs([a["content"] for a in messages[1:]]) yield gradio_format, messages, docs_html else: reformulated_query = openai.ChatCompletion.create( model=CFG_APP.MODEL_NAME, messages=get_hyde_prompt(parse_glossary(query)), temperature=0, max_tokens=CFG_APP.MAX_TOKENS_REF_QUESTION, ) reformulated_query = reformulated_query["choices"][0]["message"]["content"] if len(reformulated_query.split("\n")) == 2: reformulated_query, language = reformulated_query.split("\n") language = language.split(":")[1].strip() else: reformulated_query = reformulated_query.split("\n")[0] language = "English" sources, scores = text_embedder.retrieve_faiss( reformulated_query, k_total=k_total, threshold=threshold, ) if CFG_APP.DEBUG == True: print("Scores : \n", scores) if len(sources) > 0 : docs_string = [] docs_html = [] citations = {} num_tokens = num_tokens_from_string(CFG_APP.SOURCES_PROMPT, CFG_APP.MODEL_NAME) num_doc = 1 for i, data in enumerate(sources, 1): meta_doc = retrieve_doc_metadata(doc_metadata, data["meta"]["document_id"]) doc_content = f"📃 Doc {i}: \n{data['content']}" num_tokens_doc = num_tokens_from_string(doc_content, CFG_APP.MODEL_NAME) if num_tokens + num_tokens_doc > CFG_APP.MAX_TOKENS_API: break num_tokens += num_tokens_doc docs_string.append(doc_content) if meta_doc['short_name'] in citations.keys(): citations[meta_doc['short_name']][1] += f', {i}' else: citations[meta_doc['short_name']] = [num_doc, f'{i}'] num_doc += 1 meta_doc["num_doc"] = citations[meta_doc['short_name']][0] docs_html.append(make_html_source(data, meta_doc, i)) url_doc = f'' docs_url[i] = url_doc html_cit = [make_citations_source(citations, reformulated_query, Hyde=True)] docs_string = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_string) docs_html = "\n\n".join(html_cit + docs_html) messages.append( { "role": "system", "content": f"{CFG_APP.SOURCES_PROMPT}\n\n{docs_string}\n\nAnswer in {language}:", } ) if CFG_APP.DEBUG == True: print(f" 👨‍💻 question asked by the user : {query}") print(f" 🕛 time : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") print(" 🔌 messages sent to the API :") api_messages = [ {"role": "system", "content": CFG_APP.INIT_PROMPT}, {"role": "user", "content": reformulated_query}, { "role": "system", "content": f"{CFG_APP.SOURCES_PROMPT}\n\nVery important : Answer in {language}.\n\n{docs_string}:", }, ] for message in api_messages: print( f"length : {len(message['content'])}, content : {message['content']}" ) response = openai.ChatCompletion.create( model=CFG_APP.MODEL_NAME, messages=[ {"role": "system", "content": CFG_APP.INIT_PROMPT}, {"role": "user", "content": reformulated_query}, { "role": "system", "content": f"{CFG_APP.SOURCES_PROMPT}\n\nVery important : Answer in {language}.\n\n{docs_string}:", }, ], temperature=0, # deterministic stream=True, max_tokens=CFG_APP.MAX_TOKENS_ANSWER, ) complete_response = "" messages.pop() messages.append({"role": "assistant", "content": complete_response}) for chunk in response: chunk_message = chunk["choices"][0]["delta"].get("content") if chunk_message: complete_response += chunk_message complete_response = preprocess_message(complete_response, docs_url) messages[-1]["content"] = complete_response gradio_format = make_pairs([a["content"] for a in messages[1:]]) yield gradio_format, messages, docs_html else : docs_string = "⚠️ No relevant passages found in this report" complete_response = "**⚠️ No relevant passages found in this report, you may want to ask a more specific question.**" messages.append({"role": "assistant", "content": complete_response}) gradio_format = make_pairs([a["content"] for a in messages[1:]]) yield gradio_format, messages, docs_string