Spaces:

Nexialog
/

ESMA-GPT

Runtime error

App Files Files Community

Afritz committed on Dec 1, 2023

Commit

8f1e175

1 Parent(s): 17cae0c

Update utils.py

Browse files

Files changed (1) hide show

utils.py +168 -33

utils.py CHANGED Viewed

@@ -67,7 +67,7 @@ def make_html_source(paragraph, meta_doc, i):
     return f"""
 <div class="card" id="document-{i}">
     <div class="card-content">
-        <h2>Doc {i} - {meta_doc['short_name']} - Page {meta_paragraph['page_number']}</h2>
         <p>{content}</p>
     </div>
     <div class="card-footer">
@@ -79,6 +79,26 @@ def make_html_source(paragraph, meta_doc, i):
 </div>
 """
 def preprocess_message(text: str, docs_url: dict) -> str:
     return re.sub(
@@ -108,7 +128,6 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
 def chat(
     query: str,
     history: list,
-    query_mode : str = 'HYDE',
     threshold: float = CFG_APP.THRESHOLD,
     k_total: int = CFG_APP.K_TOTAL,
 ) -> tuple:
@@ -121,25 +140,16 @@ def chat(
     Yields:
         tuple: chat gradio format, chat openai format, sources used.
     """
-    if query_mode == 'Reformulation':
-        reformulated_query = openai.ChatCompletion.create(
-            model=CFG_APP.MODEL_NAME,
-            messages=get_reformulation_prompt(parse_glossary(query)),
-            temperature=0,
-            max_tokens=CFG_APP.MAX_TOKENS_REF_QUESTION,
-        )
-    else :
-        reformulated_query = openai.ChatCompletion.create(
-            model=CFG_APP.MODEL_NAME,
-            messages=get_hyde_prompt(parse_glossary(query)),
-            temperature=0,
-            max_tokens=CFG_APP.MAX_TOKENS_REF_QUESTION,
-        )
     reformulated_query = reformulated_query["choices"][0]["message"]["content"]
     if len(reformulated_query.split("\n")) == 2:
         reformulated_query, language = reformulated_query.split("\n")
         language = language.split(":")[1].strip()
@@ -152,21 +162,21 @@ def chat(
         k_total=k_total,
         threshold=threshold,
     )
     if CFG_APP.DEBUG == True:
         print("Scores : \n", scores)
     messages = history + [{"role": "user", "content": query}]
-    if query_mode is None or query_mode == 'HYDE' :
-        reformulated_query = reformulated_query.split("?")[0] + '?'
     docs_url = defaultdict(str)
     if len(sources) > 0:
         docs_string = []
         docs_html = []
         num_tokens = num_tokens_from_string(CFG_APP.SOURCES_PROMPT, CFG_APP.MODEL_NAME)
         for i, data in enumerate(sources, 1):
             meta_doc = retrieve_doc_metadata(doc_metadata, data["meta"]["document_id"])
@@ -176,17 +186,26 @@ def chat(
                 break
             num_tokens += num_tokens_doc
             docs_string.append(doc_content)
             docs_html.append(make_html_source(data, meta_doc, i))
             url_doc = f'<a href="{meta_doc["url"]}#page={data["meta"]["page_number"]}" target="_blank" class="pdf-link">'
             docs_url[i] = url_doc
-        docs_string = "\n\n".join(
-            [f"Query used for retrieval:\n{reformulated_query}"] + docs_string
-        )
-        docs_html = "\n\n".join(
-            [f"Query used for retrieval:\n{reformulated_query}"] + docs_html
-        )
         messages.append(
             {
                 "role": "system",
@@ -219,7 +238,7 @@ def chat(
                 {"role": "user", "content": reformulated_query},
                 {
                     "role": "system",
-                    "content": f"{CFG_APP.SOURCES_PROMPT}\n\n{docs_string}\n\nAnswer in {language}:",
                 },
             ],
             temperature=0,  # deterministic
@@ -239,8 +258,124 @@ def chat(
                 yield gradio_format, messages, docs_html
     else:
-        docs_string = "⚠️ No relevant passages found in this report"
-        complete_response = "**⚠️ No relevant passages found in this report, you may want to ask a more specific question.**"
-        messages.append({"role": "assistant", "content": complete_response})
-        gradio_format = make_pairs([a["content"] for a in messages[1:]])
-        yield gradio_format, messages, docs_string

     return f"""
 <div class="card" id="document-{i}">
     <div class="card-content">
+        <h2>Excerpts {i} - Document {meta_doc['num_doc']} - Page {meta_paragraph['page_number']}</h2>
         <p>{content}</p>
     </div>
     <div class="card-footer">
 </div>
 """
+def make_citations_source(citation_dic, query, Hyde: bool = False):
+    """Build the HTML "Sources" summary listing the cited documents.
+
+    Args:
+        citation_dic: mapping whose keys are document names and whose values
+            are indexable pairs: ``values[0]`` is rendered as the document
+            number and ``values[1]`` as the excerpt list.
+        query: text displayed as the query used for retrieval; it is inserted
+            into the HTML as-is (no escaping is applied here).
+        Hyde: when truthy, the query line states that the HyDE technique was
+            used after no response. Defaults to False.
+
+    Returns:
+        str: the HTML fragment, one ``<li>`` per entry of ``citation_dic``.
+    """
+    if Hyde:
+        query_label = 'Query used for retrieval (with the HyDE technique after no response)'
+    else:
+        query_label = 'Query used for retrieval'
+    # The list items are intentionally joined without separators or newlines,
+    # so the generated markup stays identical to what callers already display.
+    rows = ''.join(
+        f'<li>Doc {values[0]} - {name} (excerpts {values[1]})</li>'
+        for name, values in citation_dic.items()
+    )
+    return (
+        '<div class="source">\n'
+        '  <div class="title">Sources</div>\n'
+        f'  <div>{query_label}: {query}</div>\n'
+        '  <br>\n'
+        '  <ul>\n'
+        f'{rows}'
+        '  </ul>\n'
+        '</div>\n'
+    )
 def preprocess_message(text: str, docs_url: dict) -> str:
     return re.sub(
 def chat(
     query: str,
     history: list,
     threshold: float = CFG_APP.THRESHOLD,
     k_total: int = CFG_APP.K_TOTAL,
 ) -> tuple:
     Yields:
         tuple: chat gradio format, chat openai format, sources used.
     """
+    reformulated_query = openai.ChatCompletion.create(
+        model=CFG_APP.MODEL_NAME,
+        messages=get_reformulation_prompt(parse_glossary(query)),
+        temperature=0,
+        max_tokens=CFG_APP.MAX_TOKENS_REF_QUESTION,
+    )
     reformulated_query = reformulated_query["choices"][0]["message"]["content"]
     if len(reformulated_query.split("\n")) == 2:
         reformulated_query, language = reformulated_query.split("\n")
         language = language.split(":")[1].strip()
         k_total=k_total,
         threshold=threshold,
     )
     if CFG_APP.DEBUG == True:
         print("Scores : \n", scores)
     messages = history + [{"role": "user", "content": query}]
     docs_url = defaultdict(str)
     if len(sources) > 0:
         docs_string = []
         docs_html = []
+        citations = {}
         num_tokens = num_tokens_from_string(CFG_APP.SOURCES_PROMPT, CFG_APP.MODEL_NAME)
+        num_doc = 1
         for i, data in enumerate(sources, 1):
             meta_doc = retrieve_doc_metadata(doc_metadata, data["meta"]["document_id"])
                 break
             num_tokens += num_tokens_doc
             docs_string.append(doc_content)
+            if meta_doc['short_name'] in citations.keys():
+                citations[meta_doc['short_name']][1] += f', {i}'
+            else :
+                citations[meta_doc['short_name']] = [num_doc, f'{i}']
+                num_doc += 1
+            meta_doc["num_doc"] = citations[meta_doc['short_name']][0]
             docs_html.append(make_html_source(data, meta_doc, i))
             url_doc = f'<a href="{meta_doc["url"]}#page={data["meta"]["page_number"]}" target="_blank" class="pdf-link">'
             docs_url[i] = url_doc
+        html_cit = [make_citations_source(citations, reformulated_query, Hyde=False)]
+        docs_string = "\n\n".join( [f"Query used for retrieval:\n{reformulated_query}"] + docs_string)
+        docs_html = "\n\n".join(html_cit + docs_html)
         messages.append(
             {
                 "role": "system",
                 {"role": "user", "content": reformulated_query},
                 {
                     "role": "system",
+                    "content": f"{CFG_APP.SOURCES_PROMPT}\n\nVery important : Answer in {language}.\n\n{docs_string}:",
                 },
             ],
             temperature=0,  # deterministic
                 yield gradio_format, messages, docs_html
     else:
+        reformulated_query = openai.ChatCompletion.create(
+            model=CFG_APP.MODEL_NAME,
+            messages=get_hyde_prompt(parse_glossary(query)),
+            temperature=0,
+            max_tokens=CFG_APP.MAX_TOKENS_REF_QUESTION,
+        )
+        reformulated_query = reformulated_query["choices"][0]["message"]["content"]
+        if len(reformulated_query.split("\n")) == 2:
+            reformulated_query, language = reformulated_query.split("\n")
+            language = language.split(":")[1].strip()
+        else:
+            reformulated_query = reformulated_query.split("\n")[0]
+            language = "English"
+        sources, scores = text_embedder.retrieve_faiss(
+            reformulated_query,
+            k_total=k_total,
+            threshold=threshold,
+        )
+        if CFG_APP.DEBUG == True:
+            print("Scores : \n", scores)
+        if len(sources) > 0 :
+            docs_string = []
+            docs_html = []
+            citations = {}
+            num_tokens = num_tokens_from_string(CFG_APP.SOURCES_PROMPT, CFG_APP.MODEL_NAME)
+            num_doc = 1
+            for i, data in enumerate(sources, 1):
+                meta_doc = retrieve_doc_metadata(doc_metadata, data["meta"]["document_id"])
+                doc_content = f"📃 Doc {i}: \n{data['content']}"
+                num_tokens_doc = num_tokens_from_string(doc_content, CFG_APP.MODEL_NAME)
+                if num_tokens + num_tokens_doc > CFG_APP.MAX_TOKENS_API:
+                    break
+                num_tokens += num_tokens_doc
+                docs_string.append(doc_content)
+                if meta_doc['short_name'] in citations.keys():
+                    citations[meta_doc['short_name']][1] += f', {i}'
+                else:
+                    citations[meta_doc['short_name']] = [num_doc, f'{i}']
+                    num_doc += 1
+                meta_doc["num_doc"] = citations[meta_doc['short_name']][0]
+                docs_html.append(make_html_source(data, meta_doc, i))
+                url_doc = f'<a href="{meta_doc["url"]}#page={data["meta"]["page_number"]}" target="_blank" class="pdf-link">'
+                docs_url[i] = url_doc
+            html_cit = [make_citations_source(citations, reformulated_query, Hyde=True)]
+            docs_string = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_string)
+            docs_html = "\n\n".join(html_cit + docs_html)
+            messages.append(
+                {
+                    "role": "system",
+                    "content": f"{CFG_APP.SOURCES_PROMPT}\n\n{docs_string}\n\nAnswer in {language}:",
+                }
+            )
+            if CFG_APP.DEBUG == True:
+                print(f" 👨‍💻 question asked by the user : {query}")
+                print(f" 🕛 time : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+                print(" 🔌 messages sent to the API :")
+                api_messages = [
+                    {"role": "system", "content": CFG_APP.INIT_PROMPT},
+                    {"role": "user", "content": reformulated_query},
+                    {
+                        "role": "system",
+                        "content": f"{CFG_APP.SOURCES_PROMPT}\n\nVery important : Answer in {language}.\n\n{docs_string}:",
+                    },
+                ]
+                for message in api_messages:
+                    print(
+                        f"length : {len(message['content'])}, content : {message['content']}"
+                    )
+            response = openai.ChatCompletion.create(
+                model=CFG_APP.MODEL_NAME,
+                messages=[
+                    {"role": "system", "content": CFG_APP.INIT_PROMPT},
+                    {"role": "user", "content": reformulated_query},
+                    {
+                        "role": "system",
+                        "content": f"{CFG_APP.SOURCES_PROMPT}\n\nVery important : Answer in {language}.\n\n{docs_string}:",
+                    },
+                ],
+                temperature=0,  # deterministic
+                stream=True,
+                max_tokens=CFG_APP.MAX_TOKENS_ANSWER,
+            )
+            complete_response = ""
+            messages.pop()
+            messages.append({"role": "assistant", "content": complete_response})
+            for chunk in response:
+                chunk_message = chunk["choices"][0]["delta"].get("content")
+                if chunk_message:
+                    complete_response += chunk_message
+                    complete_response = preprocess_message(complete_response, docs_url)
+                    messages[-1]["content"] = complete_response
+                    gradio_format = make_pairs([a["content"] for a in messages[1:]])
+                    yield gradio_format, messages, docs_html
+        else :
+            docs_string = "⚠️ No relevant passages found in this report"
+            complete_response = "**⚠️ No relevant passages found in this report, you may want to ask a more specific question.**"
+            messages.append({"role": "assistant", "content": complete_response})
+            gradio_format = make_pairs([a["content"] for a in messages[1:]])
+            yield gradio_format, messages, docs_string