.gitignore CHANGED
@@ -5,9 +5,3 @@ __pycache__/utils.cpython-38.pyc
 
  notebooks/
  *.pyc
-
- **/.ipynb_checkpoints/
- **/.flashrank_cache/
-
- data/
- sandbox/
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🌍
  colorFrom: blue
  colorTo: red
  sdk: gradio
- sdk_version: 5.0.2
+ sdk_version: 4.19.1
  app_file: app.py
  fullWidth: true
  pinned: false
app.py CHANGED
@@ -1,10 +1,10 @@
  from climateqa.engine.embeddings import get_embeddings_function
  embeddings_function = get_embeddings_function()
 
- from climateqa.knowledge.openalex import OpenAlex
+ from climateqa.papers.openalex import OpenAlex
  from sentence_transformers import CrossEncoder
 
- # reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
+ reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
  oa = OpenAlex()
 
  import gradio as gr
@@ -15,8 +15,6 @@ import time
  import re
  import json
 
- from gradio import ChatMessage
-
  # from gradio_modal import Modal
 
  from io import BytesIO
@@ -31,19 +29,16 @@ from utils import create_user_id
 
  # ClimateQ&A imports
  from climateqa.engine.llm import get_llm
+ from climateqa.engine.rag import make_rag_chain
  from climateqa.engine.vectorstore import get_pinecone_vectorstore
- from climateqa.knowledge.retriever import ClimateQARetriever
- from climateqa.engine.reranker import get_reranker
+ from climateqa.engine.retriever import ClimateQARetriever
  from climateqa.engine.embeddings import get_embeddings_function
- from climateqa.engine.chains.prompts import audience_prompts
+ from climateqa.engine.prompts import audience_prompts
  from climateqa.sample_questions import QUESTIONS
  from climateqa.constants import POSSIBLE_REPORTS
  from climateqa.utils import get_image_from_azure_blob_storage
  from climateqa.engine.keywords import make_keywords_chain
- # from climateqa.engine.chains.answer_rag import make_rag_papers_chain
- from climateqa.engine.graph import make_graph_agent,display_graph
-
- from front.utils import make_html_source,parse_output_llm_with_sources,serialize_docs,make_toolbox
+ from climateqa.engine.rag import make_rag_papers_chain
 
  # Load environment variables in local mode
  try:
@@ -86,21 +81,48 @@ user_id = create_user_id()
 
 
 
+ def parse_output_llm_with_sources(output):
+     # Split the content into a list of text and "[Doc X]" references
+     content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
+     parts = []
+     for part in content_parts:
+         if part.startswith("Doc"):
+             subparts = part.split(",")
+             subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
+             subparts = [f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>""" for subpart in subparts]
+             parts.append("".join(subparts))
+         else:
+             parts.append(part)
+     content_parts = "".join(parts)
+     return content_parts
+
+
  # Create vectorstore and retriever
  vectorstore = get_pinecone_vectorstore(embeddings_function)
  llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
- reranker = get_reranker("nano")
- agent = make_graph_agent(llm,vectorstore,reranker)
 
 
+ def make_pairs(lst):
+     """from a list of even lenght, make tupple pairs"""
+     return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]
+
+
+ def serialize_docs(docs):
+     new_docs = []
+     for doc in docs:
+         new_doc = {}
+         new_doc["page_content"] = doc.page_content
+         new_doc["metadata"] = doc.metadata
+         new_docs.append(new_doc)
+     return new_docs
+
 
 
  async def chat(query,history,audience,sources,reports):
      """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
      (messages in gradio format, messages in langchain format, source documents)"""
 
-     date_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-     print(f">> NEW QUESTION ({date_now}) : {query}")
+     print(f">> NEW QUESTION : {query}")
 
      if audience == "Children":
          audience_prompt = audience_prompts["children"]
@@ -115,79 +137,77 @@ async def chat(query,history,audience,sources,reports):
      if len(sources) == 0:
          sources = ["IPCC"]
 
-     # if len(reports) == 0: # TODO
-     reports = []
-
-     inputs = {"user_input": query,"audience": audience_prompt,"sources":sources}
-     result = agent.astream_events(inputs,version = "v1")
+     if len(reports) == 0:
+         reports = []
+
+     retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,min_size = 200,reports = reports,k_summary = 3,k_total = 15,threshold=0.5)
+     rag_chain = make_rag_chain(retriever,llm)
 
-     # path_reformulation = "/logs/reformulation/final_output"
-     # path_keywords = "/logs/keywords/final_output"
-     # path_retriever = "/logs/find_documents/final_output"
-     # path_answer = "/logs/answer/streamed_output_str/-"
+     inputs = {"query": query,"audience": audience_prompt}
+     result = rag_chain.astream_log(inputs) #{"callbacks":[MyCustomAsyncHandler()]})
+     # result = rag_chain.stream(inputs)
+
+     path_reformulation = "/logs/reformulation/final_output"
+     path_keywords = "/logs/keywords/final_output"
+     path_retriever = "/logs/find_documents/final_output"
+     path_answer = "/logs/answer/streamed_output_str/-"
 
-     docs = []
      docs_html = ""
      output_query = ""
      output_language = ""
      output_keywords = ""
      gallery = []
-     start_streaming = False
 
-     steps_display = {
-         "categorize_intent":("🔄️ Analyzing user message",True),
-         "transform_query":("🔄️ Thinking step by step to answer the question",True),
-         "retrieve_documents":("🔄️ Searching in the knowledge base",False),
-     }
-
-     used_documents = []
-     answer_message_content = ""
      try:
-         async for event in result:
-             if "langgraph_node" in event["metadata"]:
-                 node = event["metadata"]["langgraph_node"]
-
-                 if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents" :# when documents are retrieved
-                     try:
-                         docs = event["data"]["output"]["documents"]
-                         docs_html = []
-                         for i, d in enumerate(docs, 1):
-                             docs_html.append(make_html_source(d, i))
-
-                         used_documents = used_documents + [d.metadata["name"] for d in docs]
-                         history[-1].content = "Adding sources :\n\n - " + "\n - ".join(np.unique(used_documents))
-
-                         docs_html = "".join(docs_html)
-
-                     except Exception as e:
-                         print(f"Error getting documents: {e}")
-                         print(event)
-
-                 elif event["name"] in steps_display.keys() and event["event"] == "on_chain_start": #display steps
-                     event_description,display_output = steps_display[node]
-                     if not hasattr(history[-1], 'metadata') or history[-1].metadata["title"] != event_description: # if a new step begins
-                         history.append(ChatMessage(role="assistant", content = "", metadata={'title' :event_description}))
-
-                 elif event["name"] != "transform_query" and event["event"] == "on_chat_model_stream" and node in ["answer_rag", "answer_search"]:# if streaming answer
-                     if start_streaming == False:
-                         start_streaming = True
-                         history.append(ChatMessage(role="assistant", content = ""))
-                     answer_message_content += event["data"]["chunk"].content
-                     answer_message_content = parse_output_llm_with_sources(answer_message_content)
-                     history[-1] = ChatMessage(role="assistant", content = answer_message_content)
-                     # history.append(ChatMessage(role="assistant", content = new_message_content))
-
-                 if event["name"] == "transform_query" and event["event"] =="on_chain_end":
-                     if hasattr(history[-1],"content"):
-                         history[-1].content += "Decompose question into sub-questions: \n\n - " + "\n - ".join([q["question"] for q in event["data"]["output"]["remaining_questions"]])
-
-                 if event["name"] == "categorize_intent" and event["event"] == "on_chain_start":
-                     print("X")
+         async for op in result:
+
+             op = op.ops[0]
+
+             if op['path'] == path_reformulation: # reforulated question
+                 try:
+                     output_language = op['value']["language"] # str
+                     output_query = op["value"]["question"]
+                 except Exception as e:
+                     raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")
+
+             if op["path"] == path_keywords:
+                 try:
+                     output_keywords = op['value']["keywords"] # str
+                     output_keywords = " AND ".join(output_keywords)
+                 except Exception as e:
+                     pass
 
-             yield history,docs_html,output_query,output_language,gallery #,output_query,output_keywords
-
+             elif op['path'] == path_retriever: # documents
+                 try:
+                     docs = op['value']['docs'] # List[Document]
+                     docs_html = []
+                     for i, d in enumerate(docs, 1):
+                         docs_html.append(make_html_source(d, i))
+                     docs_html = "".join(docs_html)
+                 except TypeError:
+                     print("No documents found")
+                     print("op: ",op)
+                     continue
+
+             elif op['path'] == path_answer: # final answer
+                 new_token = op['value'] # str
+                 # time.sleep(0.01)
+                 previous_answer = history[-1][1]
+                 previous_answer = previous_answer if previous_answer is not None else ""
+                 answer_yet = previous_answer + new_token
+                 answer_yet = parse_output_llm_with_sources(answer_yet)
+                 history[-1] = (query,answer_yet)
+
+             else:
+                 continue
+
+             history = [tuple(x) for x in history]
+             yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
+
      except Exception as e:
-         print(event, "has failed")
          raise gr.Error(f"{e}")
 
 
@@ -196,7 +216,7 @@ async def chat(query,history,audience,sources,reports):
      if os.getenv("GRADIO_ENV") != "local":
          timestamp = str(datetime.now().timestamp())
          file = timestamp + ".json"
-         prompt = history[1]["content"]
+         prompt = history[-1][0]
          logs = {
              "user_id": str(user_id),
              "prompt": prompt,
@@ -204,7 +224,7 @@ async def chat(query,history,audience,sources,reports):
              "question":output_query,
              "sources":sources,
              "docs":serialize_docs(docs),
-             "answer": history[-1].content,
+             "answer": history[-1][1],
              "time": timestamp,
          }
          log_on_azure(file, logs, share_client)
@@ -232,24 +252,99 @@ async def chat(query,history,audience,sources,reports):
          except Exception as e:
              print(f"Skipped adding image {i} because of {e}")
 
-     ## temp removing
-     # if len(image_dict) > 0:
-
-     # gallery = [x["img"] for x in list(image_dict.values())]
-     # img = list(image_dict.values())[0]
-     # img_md = img["md"]
-     # img_caption = img["caption"]
-     # img_code = img["figure_code"]
-     # if img_code != "N/A":
-     #     img_name = f"{img['key']} - {img['figure_code']}"
-     # else:
-     #     img_name = f"{img['key']}"
-
-     # answer_yet = history[-1][1] + f"\n\n{img_md}\n<p class='chatbot-caption'><b>{img_name}</b> - {img_caption}</p>"
-     # history[-1] = (history[-1][0],answer_yet)
-     # history = [tuple(x) for x in history]
-
-     # yield history,docs_html,output_query,output_language,gallery#,output_query,output_keywords
+     if len(image_dict) > 0:
+
+         gallery = [x["img"] for x in list(image_dict.values())]
+         img = list(image_dict.values())[0]
+         img_md = img["md"]
+         img_caption = img["caption"]
+         img_code = img["figure_code"]
+         if img_code != "N/A":
+             img_name = f"{img['key']} - {img['figure_code']}"
+         else:
+             img_name = f"{img['key']}"
+
+         answer_yet = history[-1][1] + f"\n\n{img_md}\n<p class='chatbot-caption'><b>{img_name}</b> - {img_caption}</p>"
+         history[-1] = (history[-1][0],answer_yet)
+         history = [tuple(x) for x in history]
+
+     # gallery = [x.metadata["image_path"] for x in docs if (len(x.metadata["image_path"]) > 0 and "IAS" in x.metadata["image_path"])]
+     # if len(gallery) > 0:
+     #     gallery = list(set("|".join(gallery).split("|")))
+     #     gallery = [get_image_from_azure_blob_storage(x) for x in gallery]
+
+     yield history,docs_html,output_query,output_language,gallery,output_query,output_keywords
+
+
+ def make_html_source(source,i):
+     meta = source.metadata
+     # content = source.page_content.split(":",1)[1].strip()
+     content = source.page_content.strip()
+
+     toc_levels = []
+     for j in range(2):
+         level = meta[f"toc_level{j}"]
+         if level != "N/A":
+             toc_levels.append(level)
+         else:
+             break
+     toc_levels = " > ".join(toc_levels)
+
+     if len(toc_levels) > 0:
+         name = f"<b>{toc_levels}</b><br/>{meta['name']}"
+     else:
+         name = meta['name']
+
+     if meta["chunk_type"] == "text":
+
+         card = f"""
+         <div class="card" id="doc{i}">
+             <div class="card-content">
+                 <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
+                 <p>{content}</p>
+             </div>
+             <div class="card-footer">
+                 <span>{name}</span>
+                 <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                     <span role="img" aria-label="Open PDF">🔗</span>
+                 </a>
+             </div>
+         </div>
+         """
+
+     else:
+
+         if meta["figure_code"] != "N/A":
+             title = f"{meta['figure_code']} - {meta['short_name']}"
+         else:
+             title = f"{meta['short_name']}"
+
+         card = f"""
+         <div class="card card-image">
+             <div class="card-content">
+                 <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
+                 <p>{content}</p>
+                 <p class='ai-generated'>AI-generated description</p>
+             </div>
+             <div class="card-footer">
+                 <span>{name}</span>
+                 <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
+                     <span role="img" aria-label="Open PDF">🔗</span>
+                 </a>
+             </div>
+         </div>
+         """
+
+     return card
 
 
 
+ # else:
+ #     docs_string = "No relevant passages found in the climate science reports (IPCC and IPBES)"
+ #     complete_response = "**No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate issues).**"
+ #     messages.append({"role": "assistant", "content": complete_response})
+ #     gradio_format = make_pairs([a["content"] for a in messages[1:]])
+ #     yield gradio_format, messages, docs_string
 
 
  def save_feedback(feed: str, user_id):
@@ -295,6 +390,56 @@ papers_cols_widths = {
  papers_cols = list(papers_cols_widths.keys())
  papers_cols_widths = list(papers_cols_widths.values())
 
+ async def find_papers(query, keywords,after):
+
+     summary = ""
+
+     df_works = oa.search(keywords,after = after)
+     df_works = df_works.dropna(subset=["abstract"])
+     df_works = oa.rerank(query,df_works,reranker)
+     df_works = df_works.sort_values("rerank_score",ascending=False)
+     G = oa.make_network(df_works)
+
+     height = "750px"
+     network = oa.show_network(G,color_by = "rerank_score",notebook=False,height = height)
+     network_html = network.generate_html()
+
+     network_html = network_html.replace("'", "\"")
+     css_to_inject = "<style>#mynetwork { border: none !important; } .card { border: none !important; }</style>"
+     network_html = network_html + css_to_inject
+
+     network_html = f"""<iframe style="width: 100%; height: {height};margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
+     display-capture; encrypted-media;" sandbox="allow-modals allow-forms
+     allow-scripts allow-same-origin allow-popups
+     allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
+     allowpaymentrequest="" frameborder="0" srcdoc='{network_html}'></iframe>"""
+
+     docs = df_works["content"].head(15).tolist()
+
+     df_works = df_works.reset_index(drop = True).reset_index().rename(columns = {"index":"doc"})
+     df_works["doc"] = df_works["doc"] + 1
+     df_works = df_works[papers_cols]
+
+     yield df_works,network_html,summary
+
+     chain = make_rag_papers_chain(llm)
+     result = chain.astream_log({"question": query,"docs": docs,"language":"English"})
+     path_answer = "/logs/StrOutputParser/streamed_output/-"
+
+     async for op in result:
+
+         op = op.ops[0]
+
+         if op['path'] == path_answer: # reforulated question
+             new_token = op['value'] # str
+             summary += new_token
+         else:
+             continue
+         yield df_works,network_html,summary
+
+
 
  # --------------------------------------------------------------------
  # Gradio
@@ -324,21 +469,19 @@ def vote(data: gr.LikeData):
 
 
 
- with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme,elem_id = "main-component") as demo:
+ with gr.Blocks(title="Climate Q&A", css="style.css", theme=theme,elem_id = "main-component") as demo:
+     # user_id_state = gr.State([user_id])
 
      with gr.Tab("ClimateQ&A"):
 
          with gr.Row(elem_id="chatbot-row"):
              with gr.Column(scale=2):
+                 # state = gr.State([system_template])
                  chatbot = gr.Chatbot(
-                     value = [ChatMessage(role="assistant", content=init_prompt)],
-                     type = "messages",
-                     show_copy_button=True,
-                     show_label = False,
-                     elem_id="chatbot",
-                     layout = "panel",
+                     value=[(None,init_prompt)],
+                     show_copy_button=True,show_label = False,elem_id="chatbot",layout = "panel",
                      avatar_images = (None,"https://i.ibb.co/YNyd5W2/logo4.png"),
-                 )
+                 )#,avatar_images = ("assets/logo4.png",None))
 
                  # bot.like(vote,None,None)
 
@@ -346,7 +489,8 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme,elem_id = "main-component") as demo:
 
              with gr.Row(elem_id = "input-message"):
                  textbox=gr.Textbox(placeholder="Ask me anything here!",show_label=False,scale=7,lines = 1,interactive = True,elem_id="input-textbox")
-
+                 # submit = gr.Button("",elem_id = "submit-button",scale = 1,interactive = True,icon = "https://static-00.iconduck.com/assets.00/settings-icon-2048x2046-cw28eevx.png")
+
 
          with gr.Column(scale=1, variant="panel",elem_id = "right-panel"):
 
@@ -416,6 +560,9 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme,elem_id = "main-component") as demo:
 
 
 
+
+
+
      #---------------------------------------------------------------------------------------
      # OTHER TABS
      #---------------------------------------------------------------------------------------
@@ -424,25 +571,25 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme,elem_id = "main-component") as demo:
      with gr.Tab("Figures",elem_id = "tab-images",elem_classes = "max-height other-tabs"):
          gallery_component = gr.Gallery()
 
-     # with gr.Tab("Papers (beta)",elem_id = "tab-papers",elem_classes = "max-height other-tabs"):
+     with gr.Tab("Papers (beta)",elem_id = "tab-papers",elem_classes = "max-height other-tabs"):
 
-     #     with gr.Row():
-     #         with gr.Column(scale=1):
-     #             query_papers = gr.Textbox(placeholder="Question",show_label=False,lines = 1,interactive = True,elem_id="query-papers")
-     #             keywords_papers = gr.Textbox(placeholder="Keywords",show_label=False,lines = 1,interactive = True,elem_id="keywords-papers")
-     #             after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers")
-     #             search_papers = gr.Button("Search",elem_id="search-papers",interactive=True)
+         with gr.Row():
+             with gr.Column(scale=1):
+                 query_papers = gr.Textbox(placeholder="Question",show_label=False,lines = 1,interactive = True,elem_id="query-papers")
+                 keywords_papers = gr.Textbox(placeholder="Keywords",show_label=False,lines = 1,interactive = True,elem_id="keywords-papers")
+                 after = gr.Slider(minimum=1950,maximum=2023,step=1,value=1960,label="Publication date",show_label=True,interactive=True,elem_id="date-papers")
+                 search_papers = gr.Button("Search",elem_id="search-papers",interactive=True)
 
-     #     with gr.Column(scale=7):
+             with gr.Column(scale=7):
 
-     #         with gr.Tab("Summary",elem_id="papers-summary-tab"):
-     #             papers_summary = gr.Markdown(visible=True,elem_id="papers-summary")
+                 with gr.Tab("Summary",elem_id="papers-summary-tab"):
+                     papers_summary = gr.Markdown(visible=True,elem_id="papers-summary")
 
-     #         with gr.Tab("Relevant papers",elem_id="papers-results-tab"):
-     #             papers_dataframe = gr.Dataframe(visible=True,elem_id="papers-table",headers = papers_cols)
+                 with gr.Tab("Relevant papers",elem_id="papers-results-tab"):
+                     papers_dataframe = gr.Dataframe(visible=True,elem_id="papers-table",headers = papers_cols)
 
-     #         with gr.Tab("Citations network",elem_id="papers-network-tab"):
-     #             citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
+                 with gr.Tab("Citations network",elem_id="papers-network-tab"):
+                     citations_network = gr.HTML(visible=True,elem_id="papers-citations-network")
 
 
 
@@ -453,9 +600,8 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme,elem_id = "main-component") as demo:
 
 
      def start_chat(query,history):
-         # history = history + [(query,None)]
-         # history = [tuple(x) for x in history]
-         history = history + [ChatMessage(role="user", content=query)]
+         history = history + [(query,None)]
+         history = [tuple(x) for x in history]
          return (gr.update(interactive = False),gr.update(selected=1),history)
 
      def finish_chat():
@@ -463,13 +609,13 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme,elem_id = "main-component") as demo:
 
      (textbox
          .submit(start_chat, [textbox,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
-         .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_textbox")
+         .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_textbox")
          .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
      )
 
      (examples_hidden
          .change(start_chat, [examples_hidden,chatbot], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
-         .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component],concurrency_limit = 8,api_name = "chat_examples")
+         .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports], [chatbot,sources_textbox,output_query,output_language,gallery_component,query_papers,keywords_papers],concurrency_limit = 8,api_name = "chat_examples")
          .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
      )
 
@@ -484,7 +630,48 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=theme,elem_id = "main-component") as demo:
 
      dropdown_samples.change(change_sample_questions,dropdown_samples,samples)
 
+     query_papers.submit(generate_keywords,[query_papers], [keywords_papers])
+     search_papers.click(find_papers,[query_papers,keywords_papers,after], [papers_dataframe,citations_network,papers_summary])
+
+     # # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
+     # (textbox
+     #     .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
+     #     .success(change_tab,None,tabs)
+     #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
+     #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
+     #     .success(lambda x : textbox,[textbox],[textbox])
+     # )
+
+     # (examples_hidden
+     #     .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
+     #     .success(change_tab,None,tabs)
+     #     .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
+     #     .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
+     #     .success(lambda x : textbox,[textbox],[textbox])
+     # )
+     # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
+     #     answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
+     # )
+
+
+     # with Modal(visible=True) as first_modal:
+     #     gr.Markdown("# Welcome to ClimateQ&A !")
+
+     #     gr.Markdown("### Examples")
+
+     #     examples = gr.Examples(
+     #         ["Yo ça roule","ça boume"],
+     #         [examples_hidden],
+     #         examples_per_page=8,
+     #         run_on_click=False,
+     #         elem_id="examples",
+     #         api_name="examples",
+     #     )
+
+
+     # submit.click(lambda: Modal(visible=True), None, config_modal)
+
 
  demo.queue()
 
- demo.launch(ssr_mode=False)
+ demo.launch()
climateqa/engine/chains/__init__.py DELETED
File without changes
climateqa/engine/chains/answer_ai_impact.py DELETED
@@ -1,46 +0,0 @@
- from langchain.prompts import ChatPromptTemplate
- from langchain_core.output_parsers import StrOutputParser
-
-
- prompt_template = """
- You are ClimateQ&A, an helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
- Always stay true to climate and nature science and do not make up information.
- If you do not know the answer, just say you do not know.
-
- ## Guidelines
- - Explain that the environmental impact of AI is not covered by the IPCC or IPBES reports, but you can recommend info based on the sources below
- - Answer the question in the original language of the question
-
- ## Sources
- - You can propose to visit this page https://climateqa.com/docs/carbon-footprint/ to learn more about ClimateQ&A's own carbon footprint
- - You can recommend to look at the work of the AI & climate expert scientist Sasha Luccioni with in in particular those papers
-     - Power Hungry Processing: Watts Driving the Cost of AI Deployment? - https://arxiv.org/abs/2311.16863 - about the carbon footprint at the inference stage of AI models
-     - Counting Carbon: A Survey of Factors Influencing the Emissions of Machine Learning - https://arxiv.org/abs/2302.08476
-     - Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model - https://arxiv.org/abs/2211.02001 - about the carbon footprint of training a large language model
- - You can also recommend the following tools to calculate the carbon footprint of AI models
-     - CodeCarbon - https://github.com/mlco2/codecarbon to measure the carbon footprint of your code
-     - Ecologits - https://ecologits.ai/ to measure the carbon footprint of using LLMs APIs such
- """
-
-
- def make_ai_impact_chain(llm):
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", prompt_template),
-         ("user", "{question}")
-     ])
-
-     chain = prompt | llm | StrOutputParser()
-     chain = chain.with_config({"run_name":"ai_impact_chain"})
-
-     return chain
-
- def make_ai_impact_node(llm):
-
-     ai_impact_chain = make_ai_impact_chain(llm)
-
-     async def answer_ai_impact(state,config):
-         answer = await ai_impact_chain.ainvoke({"question":state["user_input"]},config)
-         return {"answer":answer}
-
-     return answer_ai_impact
climateqa/engine/chains/answer_chitchat.py DELETED
@@ -1,52 +0,0 @@
- from langchain.prompts import ChatPromptTemplate
- from langchain_core.output_parsers import StrOutputParser
-
-
- chitchat_prompt_template = """
- You are ClimateQ&A, an helpful AI Assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports.
- Always stay true to climate and nature science and do not make up information.
- If you do not know the answer, just say you do not know.
-
- ## Guidelines
- - If it's a conversational question, you can normally chat with the user
- - If the question is not related to any topic about the environment, refuse to answer and politely ask the user to ask another question about the environment
- - If the user ask if you speak any language, you can say you speak all languages :)
- - If the user ask about the bot itself "ClimateQ&A", you can explain that you are an AI assistant specialized in answering climate-related questions using info from the IPCC and/or IPBES reports and propose to visit the website here https://climateqa.com/docs/intro/ for more information
- - If the question is about ESG regulations, standards, or frameworks like the CSRD, TCFD, SASB, GRI, CDP, etc., you can explain that this is not a topic covered by the IPCC or IPBES reports.
- - Precise that you are specialized in finding trustworthy information from the scientific reports of the IPCC and IPBES and other scientific litterature
- - If relevant you can propose up to 3 example of questions they could ask from the IPCC or IPBES reports from the examples below
- - Always answer in the original language of the question
-
- ## Examples of questions you can suggest (in the original language of the question)
- "What evidence do we have of climate change?",
- "Are human activities causing global warming?",
- "What are the impacts of climate change?",
- "Can climate change be reversed?",
- "What is the difference between climate change and global warming?",
- """
-
-
- def make_chitchat_chain(llm):
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", chitchat_prompt_template),
-         ("user", "{question}")
-     ])
-
-     chain = prompt | llm | StrOutputParser()
-     chain = chain.with_config({"run_name":"chitchat_chain"})
-
-     return chain
-
-
-
- def make_chitchat_node(llm):
-
-     chitchat_chain = make_chitchat_chain(llm)
-
-     async def answer_chitchat(state,config):
-         answer = await chitchat_chain.ainvoke({"question":state["user_input"]},config)
-         return {"answer":answer}
-
-     return answer_chitchat
-
climateqa/engine/chains/answer_rag.py DELETED
@@ -1,99 +0,0 @@
- from operator import itemgetter
-
- from langchain_core.prompts import ChatPromptTemplate
- from langchain_core.output_parsers import StrOutputParser
- from langchain_core.prompts.prompt import PromptTemplate
- from langchain_core.prompts.base import format_document
-
- from climateqa.engine.chains.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
- from climateqa.engine.chains.prompts import papers_prompt_template
-
- DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
-
- def _combine_documents(
-     docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
- ):
-
-     doc_strings = []
-
-     for i,doc in enumerate(docs):
-         # chunk_type = "Doc" if doc.metadata["chunk_type"] == "text" else "Image"
-         chunk_type = "Doc"
-         if isinstance(doc,str):
-             doc_formatted = doc
-         else:
-             doc_formatted = format_document(doc, document_prompt)
-         doc_string = f"{chunk_type} {i+1}: " + doc_formatted
-         doc_string = doc_string.replace("\n"," ")
-         doc_strings.append(doc_string)
-
-     return sep.join(doc_strings)
-
-
- def get_text_docs(x):
-     return [doc for doc in x if doc.metadata["chunk_type"] == "text"]
-
- def get_image_docs(x):
-     return [doc for doc in x if doc.metadata["chunk_type"] == "image"]
-
- def make_rag_chain(llm):
-     prompt = ChatPromptTemplate.from_template(answer_prompt_template)
-     chain = ({
-         "context":lambda x : _combine_documents(x["documents"]),
-         "query":itemgetter("query"),
-         "language":itemgetter("language"),
-         "audience":itemgetter("audience"),
-     } | prompt | llm | StrOutputParser())
-     return chain
-
- def make_rag_chain_without_docs(llm):
-     prompt = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
-     chain = prompt | llm | StrOutputParser()
-     return chain
-
-
- def make_rag_node(llm,with_docs = True):
-
-     if with_docs:
-         rag_chain = make_rag_chain(llm)
-     else:
-         rag_chain = make_rag_chain_without_docs(llm)
-
-     async def answer_rag(state,config):
-         answer = await rag_chain.ainvoke(state,config)
-         return {"answer":answer}
-
-     return answer_rag
-
-
-
-
- # def make_rag_papers_chain(llm):
-
- #     prompt = ChatPromptTemplate.from_template(papers_prompt_template)
- #     input_documents = {
- #         "context":lambda x : _combine_documents(x["docs"]),
- #         **pass_values(["question","language"])
- #     }
-
- #     chain = input_documents | prompt | llm | StrOutputParser()
- #     chain = rename_chain(chain,"answer")
-
- #     return chain
-
-
-
-
-
-
- # def make_illustration_chain(llm):
-
- #     prompt_with_images = ChatPromptTemplate.from_template(answer_prompt_images_template)
-
- #     input_description_images = {
- #         "images":lambda x : _combine_documents(get_image_docs(x["docs"])),
- #         **pass_values(["question","audience","language","answer"]),
- #     }
-
- #     illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
- #     return illustration_chain
climateqa/engine/chains/intent_categorization.py DELETED
@@ -1,86 +0,0 @@
-
- from langchain_core.pydantic_v1 import BaseModel, Field
- from typing import List
- from typing import Literal
- from langchain.prompts import ChatPromptTemplate
- from langchain_core.utils.function_calling import convert_to_openai_function
- from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-
-
- class IntentCategorizer(BaseModel):
-     """Analyzing the user message input"""
-
-     language: str = Field(
-         description="Find the language of the message input in full words (ex: French, English, Spanish, ...), defaults to English",
-         default="English",
-     )
-     intent: str = Field(
-         enum=[
-             "ai_impact",
-             "geo_info",
-             "esg",
-             "search",
-             "chitchat",
-         ],
-         description="""
-         Categorize the user input in one of the following category
-         Any question
-
-         Examples:
-         - ai_impact = Environmental impacts of AI: "What are the environmental impacts of AI", "How does AI affect the environment"
-         - geo_info = Geolocated info about climate change: Any question where the user wants to know localized impacts of climate change, eg: "What will be the temperature in Marseille in 2050"
-         - esg = Any question about the ESG regulation, frameworks and standards like the CSRD, TCFD, SASB, GRI, CDP, etc.
-         - search = Searching for any quesiton about climate change, energy, biodiversity, nature, and everything we can find the IPCC or IPBES reports or scientific papers,
-         - chitchat = Any general question that is not related to the environment or climate change or just conversational, or if you don't think searching the IPCC or IPBES reports would be relevant
-         """,
-     )
-
-
-
- def make_intent_categorization_chain(llm):
-
-     openai_functions = [convert_to_openai_function(IntentCategorizer)]
-     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"IntentCategorizer"})
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
-         ("user", "input: {input}")
-     ])
-
-     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
-     return chain
-
-
- def make_intent_categorization_node(llm):
-
-     categorization_chain = make_intent_categorization_chain(llm)
-
-     def categorize_message(state):
-         output = categorization_chain.invoke({"input":state["user_input"]})
-         if "language" not in output: output["language"] = "English"
-         output["query"] = state["user_input"]
-         return output
-
-     return categorize_message
-
-
-
-
- # SAMPLE_QUESTIONS = [
- #     "Est-ce que l'IA a un impact sur l'environnement ?",
- #     "Que dit le GIEC sur l'impact de l'IA",
- #     "Qui sont les membres du GIEC",
- #     "What is the impact of El Nino ?",
- #     "Yo",
- #     "Hello ça va bien ?",
- #     "Par qui as tu été créé ?",
- #     "What role do cloud formations play in modulating the Earth's radiative balance, and how are they represented in current climate models?",
- #     "Which industries have the highest GHG emissions?",
- #     "What are invasive alien species and how do they threaten biodiversity and ecosystems?",
- #     "Are human activities causing global warming?",
- #     "What is the motivation behind mining the deep seabed?",
- #     "Tu peux m'écrire un poème sur le changement climatique ?",
- #     "Tu peux m'écrire un poème sur les bonbons ?",
- #     "What will be the temperature in 2100 in Strasbourg?",
- #     "C'est quoi le lien entre biodiversity and changement climatique ?",
- # ]
climateqa/engine/chains/keywords_extraction.py DELETED
@@ -1,40 +0,0 @@
-
- from langchain_core.pydantic_v1 import BaseModel, Field
- from typing import List
- from typing import Literal
- from langchain.prompts import ChatPromptTemplate
- from langchain_core.utils.function_calling import convert_to_openai_function
- from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-
-
- class KeywordExtraction(BaseModel):
-     """
-     Analyzing the user query to extract keywords to feed a search engine
-     """
-
-     keywords: List[str] = Field(
-         description="""
-         Extract the keywords from the user query to feed a search engine as a list
-         Avoid adding super specific keywords to prefer general keywords
-         Maximum 3 keywords
-
-         Examples:
-         - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
-         - "How will El Nino be impacted by climate change" -> ["el nino","climate change"]
-         - "Is climate change a hoax" -> ["climate change","hoax"]
-         """
-     )
-
-
- def make_keywords_extraction_chain(llm):
-
-     openai_functions = [convert_to_openai_function(KeywordExtraction)]
-     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"KeywordExtraction"})
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", "You are a helpful assistant"),
-         ("user", "input: {input}")
-     ])
-
-     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
-     return chain
climateqa/engine/chains/query_transformation.py DELETED
@@ -1,193 +0,0 @@
-
-
- from langchain_core.pydantic_v1 import BaseModel, Field
- from typing import List
- from typing import Literal
- from langchain.prompts import ChatPromptTemplate
- from langchain_core.utils.function_calling import convert_to_openai_function
- from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
-
-
- ROUTING_INDEX = {
-     "Vector":["IPCC","IPBES","IPOS"],
-     "OpenAlex":["OpenAlex"],
- }
-
- POSSIBLE_SOURCES = [y for values in ROUTING_INDEX.values() for y in values]
-
- # Prompt from the original paper https://arxiv.org/pdf/2305.14283
- # Query Rewriting for Retrieval-Augmented Large Language Models
- class QueryDecomposition(BaseModel):
-     """
-     Decompose the user query into smaller parts to think step by step to answer this question
-     Act as a simple planning agent
-     """
-
-     questions: List[str] = Field(
-         description="""
-         Think step by step to answer this question, and provide one or several search engine questions in English for knowledge that you need.
-         Suppose that the user is looking for information about climate change, energy, biodiversity, nature, and everything we can find the IPCC reports and scientific literature
-         - If it's already a standalone and explicit question, just return the reformulated question for the search engine
-         - If you need to decompose the question, output a list of maximum 2 to 3 questions
-         """
-     )
-
-
- class Location(BaseModel):
-     country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, adresses), ex: France, USA, ...")
-     location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
-
- class QueryAnalysis(BaseModel):
-     """
-     Analyzing the user query to extract topics, sources and date
-     Also do query expansion to get alternative search queries
-     Also provide simple keywords to feed a search engine
-     """
-
-     # keywords: List[str] = Field(
-     #     description="""
-     #     Extract the keywords from the user query to feed a search engine as a list
-     #     Maximum 3 keywords
-
-     #     Examples:
-     #     - "What is the impact of deep sea mining ?" -> deep sea mining
-     #     - "How will El Nino be impacted by climate change" -> el nino;climate change
-     #     - "Is climate change a hoax" -> climate change;hoax
-     #     """
-     # )
-
-     # alternative_queries: List[str] = Field(
-     #     description="""
-     #     Generate alternative search questions from the user query to feed a search engine
-     #     """
-     # )
-
-     # step_back_question: str = Field(
-     #     description="""
-     #     You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.
-     #     This questions should help you get more context and information about the user query
-     #     """
-     # )
-
-     sources: List[Literal["IPCC", "IPBES", "IPOS","OpenAlex"]] = Field(
-         ...,
-         description="""
-         Given a user question choose which documents would be most relevant for answering their question,
-         - IPCC is for questions about climate change, energy, impacts, and everything we can find the IPCC reports
-         - IPBES is for questions about biodiversity and nature
-         - IPOS is for questions about the ocean and deep sea mining
-         - OpenAlex is for any other questions that are not in the previous categories but could be found in the scientific litterature
-         """,
-     )
-     # topics: List[Literal[
-     #     "Climate change",
-     #     "Biodiversity",
-     #     "Energy",
-     #     "Decarbonization",
-     #     "Climate science",
-     #     "Nature",
-     #     "Climate policy and justice",
-     #     "Oceans",
-     #     "Deep sea mining",
-     #     "ESG and regulations",
-     #     "CSRD",
-     # ]] = Field(
-     #     ...,
-     #     description = """
-     #     Choose the topics that are most relevant to the user query, ex: Climate change, Energy, Biodiversity, ...
-     #     """,
-     # )
-     # date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
-     # location:Location
-
-
- def make_query_decomposition_chain(llm):
-
-     openai_functions = [convert_to_openai_function(QueryDecomposition)]
-     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryDecomposition"})
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
-         ("user", "input: {input}")
-     ])
-
-     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
-     return chain
-
-
- def make_query_rewriter_chain(llm):
-
-     openai_functions = [convert_to_openai_function(QueryAnalysis)]
-     llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
-
-
-
-     prompt = ChatPromptTemplate.from_messages([
-         ("system", "You are a helpful assistant, you will analyze, translate and reformulate the user input message using the function provided"),
-         ("user", "input: {input}")
-     ])
-
-
-     chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
-     return chain
-
-
- def make_query_transform_node(llm,k_final=15):
-
-     decomposition_chain = make_query_decomposition_chain(llm)
-     rewriter_chain = make_query_rewriter_chain(llm)
-
-     def transform_query(state):
-
-         if "sources_auto" not in state or state["sources_auto"] is None or state["sources_auto"] is False:
-             auto_mode = False
-         else:
-             auto_mode = True
-
-         sources_input = state.get("sources_input")
-         if sources_input is None: sources_input = ROUTING_INDEX["Vector"]
-
-         new_state = {}
-
-         # Decomposition
-         decomposition_output = decomposition_chain.invoke({"input":state["query"]})
-         new_state.update(decomposition_output)
-
-         # Query Analysis
-         questions = []
-         for question in new_state["questions"]:
-             question_state = {"question":question}
-             analysis_output = rewriter_chain.invoke({"input":question})
-             question_state.update(analysis_output)
-             questions.append(question_state)
-
-         # Explode the questions into multiple questions with different sources
-         new_questions = []
-         for q in questions:
-             question,sources = q["question"],q["sources"]
-
-             # If not auto mode we take the configuration
-             if not auto_mode:
-                 sources = sources_input
-
-             for index,index_sources in ROUTING_INDEX.items():
-                 selected_sources = list(set(sources).intersection(index_sources))
-                 if len(selected_sources) > 0:
-                     new_questions.append({"question":question,"sources":selected_sources,"index":index})
-
-         # # Add the number of questions to search
-         # k_by_question = k_final // len(new_questions)
-         # for q in new_questions:
-         #     q["k"] = k_by_question
-
-         # new_state["questions"] = new_questions
-         # new_state["remaining_questions"] = new_questions
-
-         new_state = {
-             "remaining_questions":new_questions,
-             "n_questions":len(new_questions),
-         }
-
-         return new_state
-
-     return transform_query
climateqa/engine/chains/retrieve_documents.py DELETED
@@ -1,159 +0,0 @@
1
- import sys
2
- import os
3
- from contextlib import contextmanager
4
-
5
- from langchain_core.tools import tool
6
- from langchain_core.runnables import chain
7
- from langchain_core.runnables import RunnableParallel, RunnablePassthrough
8
- from langchain_core.runnables import RunnableLambda
9
-
10
- from ..reranker import rerank_docs
11
- from ...knowledge.retriever import ClimateQARetriever
12
- from ...knowledge.openalex import OpenAlexRetriever
13
- from .keywords_extraction import make_keywords_extraction_chain
14
- from ..utils import log_event
15
-
16
-
17
-
18
- def divide_into_parts(target, parts):
19
- # Base value for each part
20
- base = target // parts
21
- # Remainder to distribute
22
- remainder = target % parts
23
- # List to hold the result
24
- result = []
25
-
26
- for i in range(parts):
27
- if i < remainder:
28
- # These parts get base value + 1
29
- result.append(base + 1)
30
- else:
31
- # The rest get the base value
32
- result.append(base)
33
-
34
- return result
35
-
36
-
37
- @contextmanager
38
- def suppress_output():
39
- # Open a null device
40
- with open(os.devnull, 'w') as devnull:
41
- # Store the original stdout and stderr
42
- old_stdout = sys.stdout
43
- old_stderr = sys.stderr
44
- # Redirect stdout and stderr to the null device
45
- sys.stdout = devnull
46
- sys.stderr = devnull
47
- try:
48
- yield
49
- finally:
50
- # Restore stdout and stderr
51
- sys.stdout = old_stdout
52
- sys.stderr = old_stderr
53
-
54
-
55
- @tool
56
- def query_retriever(question):
57
- """Just a dummy tool to simulate the retriever query"""
58
- return question
59
-
60
-
61
-
62
-
63
-
64
-
65
-
66
- def make_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
67
-
68
- # The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
69
- @chain
70
- async def retrieve_documents(state,config):
71
-
72
- keywords_extraction = make_keywords_extraction_chain(llm)
73
-
74
- current_question = state["remaining_questions"][0]
75
- remaining_questions = state["remaining_questions"][1:]
76
-
77
- # ToolMessage(f"Retrieving documents for question: {current_question['question']}",tool_call_id = "retriever")
78
-
79
-
80
- # # There are several options to get the final top k
81
- # # Option 1 - Get 100 documents by question and rerank by question
82
- # # Option 2 - Get 100/n documents by question and rerank the total
83
- # if rerank_by_question:
84
- # k_by_question = divide_into_parts(k_final,len(questions))
85
-
86
- # docs = state["documents"]
87
- # if docs is None: docs = []
88
-
89
- docs = []
90
- k_by_question = k_final // state["n_questions"]
91
-
92
- sources = current_question["sources"]
93
- question = current_question["question"]
94
- index = current_question["index"]
95
-
96
-
97
- await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
98
-
99
-
100
- if index == "Vector":
101
-
102
- # Search the document store using the retriever
103
- # Configure high top k for further reranking step
104
- retriever = ClimateQARetriever(
105
- vectorstore=vectorstore,
106
- sources = sources,
107
- min_size = 200,
108
- k_summary = k_summary,
109
- k_total = k_before_reranking,
110
- threshold = 0.5,
111
- )
112
- docs_question = await retriever.ainvoke(question,config)
113
-
114
- elif index == "OpenAlex":
115
-
116
- keywords = keywords_extraction.invoke(question)["keywords"]
117
- openalex_query = " AND ".join(keywords)
118
-
119
- print(f"... OpenAlex query: {openalex_query}")
120
-
121
- retriever_openalex = OpenAlexRetriever(
122
- min_year = state.get("min_year",1960),
123
- max_year = state.get("max_year",None),
124
- k = k_before_reranking
125
- )
126
- docs_question = await retriever_openalex.ainvoke(openalex_query,config)
127
-
128
- else:
129
- raise Exception(f"Index {index} not found in the routing index")
130
-
131
- # Rerank
132
- if reranker is not None:
133
- with suppress_output():
134
- docs_question = rerank_docs(reranker,docs_question,question)
135
- else:
136
- # Add a default reranking score
137
- for doc in docs_question:
138
- doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
139
-
140
- # If rerank by question we select the top documents for each question
141
- if rerank_by_question:
142
- docs_question = docs_question[:k_by_question]
143
-
144
- # Add sources used in the metadata
145
- for doc in docs_question:
146
- doc.metadata["sources_used"] = sources
147
- doc.metadata["question_used"] = question
148
- doc.metadata["index_used"] = index
149
-
150
- # Add to the list of docs
151
- docs.extend(docs_question)
152
-
153
- # Sort the list in descending order by reranking_score
154
- docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
155
- new_state = {"documents":docs,"remaining_questions":remaining_questions}
156
- return new_state
157
-
158
- return retrieve_documents
159
-
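
For reference, divide_into_parts is what spread the final top-k across questions before this node was deleted; a minimal, self-contained sketch reproducing the helper from the diff above:

    def divide_into_parts(target, parts):
        # Split `target` into `parts` integers that differ by at most 1 and sum to `target`
        base = target // parts
        remainder = target % parts
        return [base + 1 if i < remainder else base for i in range(parts)]

    print(divide_into_parts(15, 4))  # [4, 4, 4, 3]: 15 documents spread over 4 questions
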
climateqa/engine/chains/sample_router.py DELETED
@@ -1,66 +0,0 @@
1
-
2
- # from typing import List
3
- # from typing import Literal
4
- # from langchain.prompts import ChatPromptTemplate
5
- # from langchain_core.utils.function_calling import convert_to_openai_function
6
- # from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
7
-
8
- # # https://livingdatalab.com/posts/2023-11-05-openai-function-calling-with-langchain.html
9
-
10
- # class Location(BaseModel):
11
- # country:str = Field(...,description="The country if directly mentioned or inferred from the location (cities, regions, addresses), ex: France, USA, ...")
12
- # location:str = Field(...,description="The specific place if mentioned (cities, regions, addresses), ex: Marseille, New York, Wisconsin, ...")
13
-
14
- # class QueryAnalysis(BaseModel):
15
- # """Analyzing the user query"""
16
-
17
- # language: str = Field(
18
- # description="Find the language of the query in full words (ex: French, English, Spanish, ...), defaults to English"
19
- # )
20
- # intent: str = Field(
21
- # enum=[
22
- # "Environmental impacts of AI",
23
- # "Geolocated info about climate change",
24
- # "Climate change",
25
- # "Biodiversity",
26
- # "Deep sea mining",
27
- # "Chitchat",
28
- # ],
29
- # description="""
30
- # Categorize the user query in one of the following category,
31
-
32
- # Examples:
33
- # - Geolocated info about climate change: "What will be the temperature in Marseille in 2050"
34
- # - Climate change: "What is radiative forcing", "How much will
35
- # """,
36
- # )
37
- # sources: List[Literal["IPCC", "IPBES", "IPOS"]] = Field(
38
- # ...,
39
- # description="""
40
- # Given a user question choose which documents would be most relevant for answering their question,
41
- # - IPCC is for questions about climate change, energy, impacts, and everything we can find the IPCC reports
42
- # - IPBES is for questions about biodiversity and nature
43
- # - IPOS is for questions about the ocean and deep sea mining
44
-
45
- # """,
46
- # )
47
- # date: str = Field(description="The date or period mentioned, ex: 2050, between 2020 and 2050")
48
- # location:Location
49
- # # query: str = Field(
50
- # # description = """
51
- # # Translate to english and reformulate the following user message to be a short standalone question, in the context of an educational discussion about climate change.
52
- # # The reformulated question will used in a search engine
53
- # # By default, assume that the user is asking information about the last century,
54
- # # Use the following examples
55
-
56
- # # ### Examples:
57
- # # La technologie nous sauvera-t-elle ? -> Can technology help humanity mitigate the effects of climate change?
58
- # # what are our reserves in fossil fuel? -> What are the current reserves of fossil fuels and how long will they last?
59
- # # what are the main causes of climate change? -> What are the main causes of climate change in the last century?
60
-
61
- # # Question in English:
62
- # # """
63
- # # )
64
-
65
- # openai_functions = [convert_to_openai_function(QueryAnalysis)]
66
- # llm2 = llm.bind(functions = openai_functions,function_call={"name":"QueryAnalysis"})
climateqa/engine/chains/translation.py DELETED
@@ -1,41 +0,0 @@
1
-
2
- from langchain_core.pydantic_v1 import BaseModel, Field
3
- from typing import List
4
- from typing import Literal
5
- from langchain.prompts import ChatPromptTemplate
6
- from langchain_core.utils.function_calling import convert_to_openai_function
7
- from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
8
-
9
-
10
- class Translation(BaseModel):
11
- """Analyzing the user message input"""
12
-
13
- translation: str = Field(
14
- description="Translate the message input to English",
15
- )
16
-
17
-
18
- def make_translation_chain(llm):
19
-
20
- openai_functions = [convert_to_openai_function(Translation)]
21
- llm_with_functions = llm.bind(functions = openai_functions,function_call={"name":"Translation"})
22
-
23
- prompt = ChatPromptTemplate.from_messages([
24
- ("system", "You are a helpful assistant, you will translate the user input message to English using the function provided"),
25
- ("user", "input: {input}")
26
- ])
27
-
28
- chain = prompt | llm_with_functions | JsonOutputFunctionsParser()
29
- return chain
30
-
31
-
32
- def make_translation_node(llm):
33
-
34
- translation_chain = make_translation_chain(llm)
35
-
36
- def translate_query(state):
37
- user_input = state["user_input"]
38
- translation = translation_chain.invoke({"input":user_input})
39
- return {"query":translation["translation"]}
40
-
41
- return translate_query
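
The deleted translation node was a thin wrapper over the function-calling chain above; a sketch of how it was driven (assumes a configured chat model from get_llm and valid API credentials, so this is illustrative rather than runnable as-is):

    from climateqa.engine.llm import get_llm

    llm = get_llm(provider="openai", temperature=0.0)
    translate_query = make_translation_node(llm)  # from the deleted module above

    state = {"user_input": "Quelles sont les causes du changement climatique ?"}
    print(translate_query(state))
    # expected shape: {"query": "What are the causes of climate change?"}
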
climateqa/engine/embeddings.py CHANGED
@@ -2,7 +2,7 @@
2
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
4
 
5
- def get_embeddings_function(version = "v1.2",query_instruction = "Represent this sentence for searching relevant passages: "):
6
 
7
  if version == "v1.2":
8
 
@@ -10,12 +10,12 @@ def get_embeddings_function(version = "v1.2",query_instruction = "Represent this
10
  # Best embedding model at a reasonable size at the moment (2023-11-22)
11
 
12
  model_name = "BAAI/bge-base-en-v1.5"
13
- encode_kwargs = {'normalize_embeddings': True,"show_progress_bar":False} # set True to compute cosine similarity
14
  print("Loading embeddings model: ", model_name)
15
  embeddings_function = HuggingFaceBgeEmbeddings(
16
  model_name=model_name,
17
  encode_kwargs=encode_kwargs,
18
- query_instruction=query_instruction,
19
  )
20
 
21
  else:
@@ -23,6 +23,3 @@ def get_embeddings_function(version = "v1.2",query_instruction = "Represent this
23
  embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
24
 
25
  return embeddings_function
26
-
27
-
28
-
 
2
  from langchain_community.embeddings import HuggingFaceBgeEmbeddings
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
4
 
5
+ def get_embeddings_function(version = "v1.2"):
6
 
7
  if version == "v1.2":
8
 
 
10
  # Best embedding model at a reasonable size at the moment (2023-11-22)
11
 
12
  model_name = "BAAI/bge-base-en-v1.5"
13
+ encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
14
  print("Loading embeddings model: ", model_name)
15
  embeddings_function = HuggingFaceBgeEmbeddings(
16
  model_name=model_name,
17
  encode_kwargs=encode_kwargs,
18
+ query_instruction="Represent this sentence for searching relevant passages: "
19
  )
20
 
21
  else:
 
23
  embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
24
 
25
  return embeddings_function
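
Both the old and the new signature are called the same way; a minimal sketch (downloads the BGE model from the Hugging Face Hub on first use):

    from climateqa.engine.embeddings import get_embeddings_function

    embeddings_function = get_embeddings_function(version="v1.2")
    vector = embeddings_function.embed_query("What is radiative forcing?")
    print(len(vector))  # 768 dimensions for BAAI/bge-base-en-v1.5
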
 
 
 
climateqa/engine/graph.py DELETED
@@ -1,149 +0,0 @@
1
- import sys
2
- import os
3
- from contextlib import contextmanager
4
-
5
- from langchain.schema import Document
6
- from langgraph.graph import END, StateGraph
7
- from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod
8
-
9
- from typing_extensions import TypedDict
10
- from typing import List
11
-
12
- from IPython.display import display, HTML, Image
13
-
14
- from .chains.answer_chitchat import make_chitchat_node
15
- from .chains.answer_ai_impact import make_ai_impact_node
16
- from .chains.query_transformation import make_query_transform_node
17
- from .chains.translation import make_translation_node
18
- from .chains.intent_categorization import make_intent_categorization_node
19
- from .chains.retrieve_documents import make_retriever_node
20
- from .chains.answer_rag import make_rag_node
21
-
22
- class GraphState(TypedDict):
23
- """
24
- Represents the state of our graph.
25
- """
26
- user_input : str
27
- language : str
28
- intent : str
29
- query: str
30
- remaining_questions : List[dict]
31
- n_questions : int
32
- answer: str
33
- audience: str = "experts"
34
- sources_input: List[str] = ["IPCC","IPBES"]
35
- sources_auto: bool = True
36
- min_year: int = 1960
37
- max_year: int = None
38
- documents: List[Document]
39
-
40
- def search(state): #TODO
41
- return state
42
-
43
- def answer_search(state):#TODO
44
- return state
45
-
46
- def route_intent(state):
47
- intent = state["intent"]
48
- if intent in ["chitchat","esg"]:
49
- return "answer_chitchat"
50
- # elif intent == "ai_impact":
51
- # return "answer_ai_impact"
52
- else:
53
- # Search route
54
- return "search"
55
-
56
- def route_translation(state):
57
- if state["language"].lower() == "english":
58
- return "transform_query"
59
- else:
60
- return "translate_query"
61
-
62
- def route_based_on_relevant_docs(state,threshold_docs=0.2):
63
- docs = [x for x in state["documents"] if x.metadata["reranking_score"] > threshold_docs]
64
- if len(docs) > 0:
65
- return "answer_rag"
66
- else:
67
- return "answer_rag_no_docs"
68
-
69
-
70
- def make_id_dict(values):
71
- return {k:k for k in values}
72
-
73
- def make_graph_agent(llm,vectorstore,reranker,threshold_docs = 0.2):
74
-
75
- workflow = StateGraph(GraphState)
76
-
77
- # Define the node functions
78
- categorize_intent = make_intent_categorization_node(llm)
79
- transform_query = make_query_transform_node(llm)
80
- translate_query = make_translation_node(llm)
81
- answer_chitchat = make_chitchat_node(llm)
82
- answer_ai_impact = make_ai_impact_node(llm)
83
- retrieve_documents = make_retriever_node(vectorstore,reranker,llm)
84
- answer_rag = make_rag_node(llm,with_docs=True)
85
- answer_rag_no_docs = make_rag_node(llm,with_docs=False)
86
-
87
- # Define the nodes
88
- workflow.add_node("categorize_intent", categorize_intent)
89
- workflow.add_node("search", search)
90
- workflow.add_node("answer_search", answer_search)
91
- workflow.add_node("transform_query", transform_query)
92
- workflow.add_node("translate_query", translate_query)
93
- workflow.add_node("answer_chitchat", answer_chitchat)
94
- # workflow.add_node("answer_ai_impact", answer_ai_impact)
95
- workflow.add_node("retrieve_documents",retrieve_documents)
96
- workflow.add_node("answer_rag",answer_rag)
97
- workflow.add_node("answer_rag_no_docs",answer_rag_no_docs)
98
-
99
- # Entry point
100
- workflow.set_entry_point("categorize_intent")
101
-
102
- # CONDITIONAL EDGES
103
- workflow.add_conditional_edges(
104
- "categorize_intent",
105
- route_intent,
106
- make_id_dict(["answer_chitchat","search"])
107
- )
108
-
109
- workflow.add_conditional_edges(
110
- "search",
111
- route_translation,
112
- make_id_dict(["translate_query","transform_query"])
113
- )
114
- workflow.add_conditional_edges(
115
- "retrieve_documents",
116
- lambda state : "retrieve_documents" if len(state["remaining_questions"]) > 0 else "answer_search",
117
- make_id_dict(["retrieve_documents","answer_search"])
118
- )
119
-
120
- workflow.add_conditional_edges(
121
- "answer_search",
122
- lambda x : route_based_on_relevant_docs(x,threshold_docs=threshold_docs),
123
- make_id_dict(["answer_rag","answer_rag_no_docs"])
124
- )
125
-
126
- # Define the edges
127
- workflow.add_edge("translate_query", "transform_query")
128
- workflow.add_edge("transform_query", "retrieve_documents")
129
- workflow.add_edge("answer_rag", END)
130
- workflow.add_edge("answer_rag_no_docs", END)
131
- workflow.add_edge("answer_chitchat", END)
132
- # workflow.add_edge("answer_ai_impact", END)
133
-
134
- # Compile
135
- app = workflow.compile()
136
- return app
137
-
138
-
139
-
140
-
141
- def display_graph(app):
142
-
143
- display(
144
- Image(
145
- app.get_graph(xray = True).draw_mermaid_png(
146
- draw_method=MermaidDrawMethod.API,
147
- )
148
- )
149
- )
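
Before its removal the compiled graph was driven through the standard runnable interface; a hedged sketch (llm, vectorstore and reranker are assumed to be configured as in the application setup):

    import asyncio

    agent = make_graph_agent(llm, vectorstore, reranker, threshold_docs=0.2)

    async def run():
        inputs = {"user_input": "What is radiative forcing?", "audience": "experts"}
        # astream_events yields intermediate node events as well as streamed tokens
        async for event in agent.astream_events(inputs, version="v1"):
            print(event["event"], event.get("name"))

    asyncio.run(run())
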
climateqa/engine/llm/__init__.py CHANGED
@@ -1,6 +1,5 @@
1
  from climateqa.engine.llm.openai import get_llm as get_openai_llm
2
  from climateqa.engine.llm.azure import get_llm as get_azure_llm
3
- from climateqa.engine.llm.ollama import get_llm as get_ollama_llm
4
 
5
 
6
  def get_llm(provider="openai",**kwargs):
@@ -9,8 +8,6 @@ def get_llm(provider="openai",**kwargs):
9
  return get_openai_llm(**kwargs)
10
  elif provider == "azure":
11
  return get_azure_llm(**kwargs)
12
- elif provider == "ollama":
13
- return get_ollama_llm(**kwargs)
14
  else:
15
  raise ValueError(f"Unknown provider: {provider}")
16
 
 
1
  from climateqa.engine.llm.openai import get_llm as get_openai_llm
2
  from climateqa.engine.llm.azure import get_llm as get_azure_llm
 
3
 
4
 
5
  def get_llm(provider="openai",**kwargs):
 
8
  return get_openai_llm(**kwargs)
9
  elif provider == "azure":
10
  return get_azure_llm(**kwargs)
 
 
11
  else:
12
  raise ValueError(f"Unknown provider: {provider}")
13
 
climateqa/engine/llm/ollama.py DELETED
@@ -1,6 +0,0 @@
1
-
2
-
3
- from langchain_community.llms import Ollama
4
-
5
- def get_llm(model="llama3", **kwargs):
6
- return Ollama(model=model, **kwargs)
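
If local inference is needed again, the removed provider can be recreated in one line; a sketch assuming a running Ollama server and langchain_community installed:

    from langchain_community.llms import Ollama

    llm = Ollama(model="llama3")
    print(llm.invoke("Say hello in one word."))
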
 
 
 
 
 
 
 
climateqa/engine/{chains/prompts.py → prompts.py} RENAMED
@@ -56,7 +56,7 @@ Passages:
56
  {context}
57
 
58
  -----------------------
59
- Question: {query} - Explained to {audience}
60
  Answer in {language} with the passages citations:
61
  """
62
 
@@ -137,7 +137,7 @@ Guidelines:
137
  - If the question is not related to environmental issues, never never answer it. Say it's not your role.
138
  - Make paragraphs by starting new lines to make your answers more readable.
139
 
140
- Question: {query}
141
  Answer in {language}:
142
  """
143
 
 
56
  {context}
57
 
58
  -----------------------
59
+ Question: {question} - Explained to {audience}
60
  Answer in {language} with the passages citations:
61
  """
62
 
 
137
  - If the question is not related to environmental issues, never never answer it. Say it's not your role.
138
  - Make paragraphs by starting new lines to make your answers more readable.
139
 
140
+ Question: {question}
141
  Answer in {language}:
142
  """
143
 
climateqa/engine/rag.py ADDED
@@ -0,0 +1,134 @@
1
+ from operator import itemgetter
2
+
3
+ from langchain_core.prompts import ChatPromptTemplate
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
6
+ from langchain_core.prompts.prompt import PromptTemplate
7
+ from langchain_core.prompts.base import format_document
8
+
9
+ from climateqa.engine.reformulation import make_reformulation_chain
10
+ from climateqa.engine.prompts import answer_prompt_template,answer_prompt_without_docs_template,answer_prompt_images_template
11
+ from climateqa.engine.prompts import papers_prompt_template
12
+ from climateqa.engine.utils import pass_values, flatten_dict,prepare_chain,rename_chain
13
+ from climateqa.engine.keywords import make_keywords_chain
14
+
15
+ DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
16
+
17
+ def _combine_documents(
18
+ docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
19
+ ):
20
+
21
+ doc_strings = []
22
+
23
+ for i,doc in enumerate(docs):
24
+ # chunk_type = "Doc" if doc.metadata["chunk_type"] == "text" else "Image"
25
+ chunk_type = "Doc"
26
+ if isinstance(doc,str):
27
+ doc_formatted = doc
28
+ else:
29
+ doc_formatted = format_document(doc, document_prompt)
30
+ doc_string = f"{chunk_type} {i+1}: " + doc_formatted
31
+ doc_string = doc_string.replace("\n"," ")
32
+ doc_strings.append(doc_string)
33
+
34
+ return sep.join(doc_strings)
35
+
36
+
37
+ def get_text_docs(x):
38
+ return [doc for doc in x if doc.metadata["chunk_type"] == "text"]
39
+
40
+ def get_image_docs(x):
41
+ return [doc for doc in x if doc.metadata["chunk_type"] == "image"]
42
+
43
+
44
+ def make_rag_chain(retriever,llm):
45
+
46
+ # Construct the prompt
47
+ prompt = ChatPromptTemplate.from_template(answer_prompt_template)
48
+ prompt_without_docs = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)
49
+
50
+ # ------- CHAIN 0 - Reformulation
51
+ reformulation = make_reformulation_chain(llm)
52
+ reformulation = prepare_chain(reformulation,"reformulation")
53
+
54
+ # ------- Find all keywords from the reformulated query
55
+ keywords = make_keywords_chain(llm)
56
+ keywords = {"keywords":itemgetter("question") | keywords}
57
+ keywords = prepare_chain(keywords,"keywords")
58
+
59
+ # ------- CHAIN 1
60
+ # Retrieved documents
61
+ find_documents = {"docs": itemgetter("question") | retriever} | RunnablePassthrough()
62
+ find_documents = prepare_chain(find_documents,"find_documents")
63
+
64
+ # ------- CHAIN 2
65
+ # Construct inputs for the llm
66
+ input_documents = {
67
+ "context":lambda x : _combine_documents(x["docs"]),
68
+ **pass_values(["question","audience","language","keywords"])
69
+ }
70
+
71
+ # ------- CHAIN 3
72
+ # Bot answer
73
+ llm_final = rename_chain(llm,"answer")
74
+
75
+ answer_with_docs = {
76
+ "answer": input_documents | prompt | llm_final | StrOutputParser(),
77
+ **pass_values(["question","audience","language","query","docs","keywords"]),
78
+ }
79
+
80
+ answer_without_docs = {
81
+ "answer": prompt_without_docs | llm_final | StrOutputParser(),
82
+ **pass_values(["question","audience","language","query","docs","keywords"]),
83
+ }
84
+
85
+ # def has_images(x):
86
+ # image_docs = [doc for doc in x["docs"] if doc.metadata["chunk_type"]=="image"]
87
+ # return len(image_docs) > 0
88
+
89
+ def has_docs(x):
90
+ return len(x["docs"]) > 0
91
+
92
+ answer = RunnableBranch(
93
+ (lambda x: has_docs(x), answer_with_docs),
94
+ answer_without_docs,
95
+ )
96
+
97
+
98
+ # ------- FINAL CHAIN
99
+ # Build the final chain
100
+ rag_chain = reformulation | keywords | find_documents | answer
101
+
102
+ return rag_chain
103
+
104
+
105
+ def make_rag_papers_chain(llm):
106
+
107
+ prompt = ChatPromptTemplate.from_template(papers_prompt_template)
108
+
109
+ input_documents = {
110
+ "context":lambda x : _combine_documents(x["docs"]),
111
+ **pass_values(["question","language"])
112
+ }
113
+
114
+ chain = input_documents | prompt | llm | StrOutputParser()
115
+ chain = rename_chain(chain,"answer")
116
+
117
+ return chain
118
+
119
+
120
+
121
+
122
+
123
+
124
+ def make_illustration_chain(llm):
125
+
126
+ prompt_with_images = ChatPromptTemplate.from_template(answer_prompt_images_template)
127
+
128
+ input_description_images = {
129
+ "images":lambda x : _combine_documents(get_image_docs(x["docs"])),
130
+ **pass_values(["question","audience","language","answer"]),
131
+ }
132
+
133
+ illustration_chain = input_description_images | prompt_with_images | llm | StrOutputParser()
134
+ return illustration_chain
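
The new rag_chain is a plain LCEL runnable; a hypothetical invocation (retriever and llm are assumed to be configured, and the input keys are assumed to match what the reformulation chain expects):

    rag_chain = make_rag_chain(retriever, llm)

    result = rag_chain.invoke({
        "query": "Is recent warming caused by human activity?",
        "audience": "the general public",
    })
    print(result["answer"])     # generated answer with [Doc i] citations
    print(len(result["docs"]))  # retrieved source documents
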
climateqa/engine/{chains/reformulation.py → reformulation.py} RENAMED
@@ -3,7 +3,7 @@ from langchain.output_parsers.structured import StructuredOutputParser, Response
3
  from langchain_core.prompts import PromptTemplate
4
  from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
5
 
6
- from climateqa.engine.chains.prompts import reformulation_prompt_template
7
  from climateqa.engine.utils import pass_values, flatten_dict
8
 
9
 
 
3
  from langchain_core.prompts import PromptTemplate
4
  from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
5
 
6
+ from climateqa.engine.prompts import reformulation_prompt_template
7
  from climateqa.engine.utils import pass_values, flatten_dict
8
 
9
 
climateqa/engine/reranker.py DELETED
@@ -1,40 +0,0 @@
1
- import os
2
- from scipy.special import expit, logit
3
- from rerankers import Reranker
4
-
5
-
6
- def get_reranker(model = "nano",cohere_api_key = None):
7
-
8
- assert model in ["nano","tiny","small","large"]
9
-
10
- if model == "nano":
11
- reranker = Reranker('ms-marco-TinyBERT-L-2-v2', model_type='flashrank')
12
- elif model == "tiny":
13
- reranker = Reranker('ms-marco-MiniLM-L-12-v2', model_type='flashrank')
14
- elif model == "small":
15
- reranker = Reranker("mixedbread-ai/mxbai-rerank-xsmall-v1", model_type='cross-encoder')
16
- elif model == "large":
17
- if cohere_api_key is None:
18
- cohere_api_key = os.environ["COHERE_API_KEY"]
19
- reranker = Reranker("cohere", lang='en', api_key = cohere_api_key)
20
- return reranker
21
-
22
-
23
-
24
- def rerank_docs(reranker,docs,query):
25
-
26
- # Get a list of texts from langchain docs
27
- input_docs = [x.page_content for x in docs]
28
-
29
- # Rerank using rerankers library
30
- results = reranker.rank(query=query, docs=input_docs)
31
-
32
- # Prepare langchain list of docs
33
- docs_reranked = []
34
- for result in results.results:
35
- doc_id = result.document.doc_id
36
- doc = docs[doc_id]
37
- doc.metadata["reranking_score"] = result.score
38
- doc.metadata["query_used_for_retrieval"] = query
39
- docs_reranked.append(doc)
40
- return docs_reranked
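
The deleted pair composed as follows; a sketch assuming the rerankers/flashrank packages from the old requirements plus LangChain documents:

    from langchain.schema import Document

    reranker = get_reranker("nano")  # flashrank ms-marco-TinyBERT-L-2-v2
    docs = [
        Document(page_content="CO2 is a greenhouse gas.", metadata={}),
        Document(page_content="Pizza recipes for beginners.", metadata={}),
    ]
    for doc in rerank_docs(reranker, docs, query="What drives global warming?"):
        print(round(doc.metadata["reranking_score"], 3), doc.page_content)
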
climateqa/{knowledge → engine}/retriever.py RENAMED
@@ -66,7 +66,6 @@ class ClimateQARetriever(BaseRetriever):
66
  # Add score to metadata
67
  results = []
68
  for i,(doc,score) in enumerate(docs):
69
- doc.page_content = doc.page_content.replace("\r\n"," ")
70
  doc.metadata["similarity_score"] = score
71
  doc.metadata["content"] = doc.page_content
72
  doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
@@ -79,3 +78,86 @@ class ClimateQARetriever(BaseRetriever):
79
  return results
80
 
81
 
66
  # Add score to metadata
67
  results = []
68
  for i,(doc,score) in enumerate(docs):
 
69
  doc.metadata["similarity_score"] = score
70
  doc.metadata["content"] = doc.page_content
71
  doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
 
78
  return results
79
 
80
 
81
+
82
+
83
+ # def filter_summaries(df,k_summary = 3,k_total = 10):
84
+ # # assert source in ["IPCC","IPBES","ALL"], "source arg should be in (IPCC,IPBES,ALL)"
85
+
86
+ # # # Filter by source
87
+ # # if source == "IPCC":
88
+ # # df = df.loc[df["source"]=="IPCC"]
89
+ # # elif source == "IPBES":
90
+ # # df = df.loc[df["source"]=="IPBES"]
91
+ # # else:
92
+ # # pass
93
+
94
+ # # Separate summaries and full reports
95
+ # df_summaries = df.loc[df["report_type"].isin(["SPM","TS"])]
96
+ # df_full = df.loc[~df["report_type"].isin(["SPM","TS"])]
97
+
98
+ # # Find passages from summaries dataset
99
+ # passages_summaries = df_summaries.head(k_summary)
100
+
101
+ # # Find passages from full reports dataset
102
+ # passages_fullreports = df_full.head(k_total - len(passages_summaries))
103
+
104
+ # # Concatenate passages
105
+ # passages = pd.concat([passages_summaries,passages_fullreports],axis = 0,ignore_index = True)
106
+ # return passages
107
+
108
+
109
+
110
+
111
+ # def retrieve_with_summaries(query,retriever,k_summary = 3,k_total = 10,sources = ["IPCC","IPBES"],max_k = 100,threshold = 0.555,as_dict = True,min_length = 300):
112
+ # assert max_k > k_total
113
+
114
+ # validated_sources = ["IPCC","IPBES"]
115
+ # sources = [x for x in sources if x in validated_sources]
116
+ # filters = {
117
+ # "source": { "$in": sources },
118
+ # }
119
+ # print(filters)
120
+
121
+ # # Retrieve documents
122
+ # docs = retriever.retrieve(query,top_k = max_k,filters = filters)
123
+
124
+ # # Filter by score
125
+ # docs = [{**x.meta,"score":x.score,"content":x.content} for x in docs if x.score > threshold]
126
+
127
+ # if len(docs) == 0:
128
+ # return []
129
+ # res = pd.DataFrame(docs)
130
+ # passages_df = filter_summaries(res,k_summary,k_total)
131
+ # if as_dict:
132
+ # contents = passages_df["content"].tolist()
133
+ # meta = passages_df.drop(columns = ["content"]).to_dict(orient = "records")
134
+ # passages = []
135
+ # for i in range(len(contents)):
136
+ # passages.append({"content":contents[i],"meta":meta[i]})
137
+ # return passages
138
+ # else:
139
+ # return passages_df
140
+
141
+
142
+
143
+ # def retrieve(query,sources = ["IPCC"],threshold = 0.555,k = 10):
144
+
145
+
146
+ # print("hellooooo")
147
+
148
+ # # Reformulate queries
149
+ # reformulated_query,language = reformulate(query)
150
+
151
+ # print(reformulated_query)
152
+
153
+ # # Retrieve documents
154
+ # passages = retrieve_with_summaries(reformulated_query,retriever,k_total = k,k_summary = 3,as_dict = True,sources = sources,threshold = threshold)
155
+ # response = {
156
+ # "query":query,
157
+ # "reformulated_query":reformulated_query,
158
+ # "language":language,
159
+ # "sources":passages,
160
+ # "prompts":{"init_prompt":init_prompt,"sources_prompt":sources_prompt},
161
+ # }
162
+ # return response
163
+
climateqa/engine/utils.py CHANGED
@@ -1,15 +1,8 @@
1
  from operator import itemgetter
2
  from typing import Any, Dict, Iterable, Tuple
3
- import tiktoken
4
  from langchain_core.runnables import RunnablePassthrough
5
 
6
 
7
- def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
8
- encoding = tiktoken.get_encoding(encoding_name)
9
- num_tokens = len(encoding.encode(string))
10
- return num_tokens
11
-
12
-
13
  def pass_values(x):
14
  if not isinstance(x, list):
15
  x = [x]
@@ -74,13 +67,3 @@ def flatten_dict(
74
  """
75
  flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
76
  return flat_dict
77
-
78
-
79
-
80
- async def log_event(info,name,config):
81
- """Helper function that will run a dummy chain with the given info
82
- The astream_event function will catch this chain and stream the dict info to the logger
83
- """
84
-
85
- chain = RunnablePassthrough().with_config(run_name=name)
86
- _ = await chain.ainvoke(info,config)
 
1
  from operator import itemgetter
2
  from typing import Any, Dict, Iterable, Tuple
 
3
  from langchain_core.runnables import RunnablePassthrough
4
 
5
 
 
 
 
 
 
 
6
  def pass_values(x):
7
  if not isinstance(x, list):
8
  x = [x]
 
67
  """
68
  flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
69
  return flat_dict
 
 
 
 
 
 
 
 
 
 
climateqa/knowledge/__init__.py DELETED
File without changes
climateqa/papers/__init__.py ADDED
@@ -0,0 +1,43 @@
1
+ import pandas as pd
2
+
3
+ from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
4
+ import pyalex
5
+
6
+ pyalex.config.email = "[email protected]"
7
+
8
+ class OpenAlex():
9
+ def __init__(self):
10
+ pass
11
+
12
+
13
+
14
+ def search(self,keywords,n_results = 100,after = None,before = None):
15
+ works = Works().search(keywords)
16
+
17
+ for page in works.paginate(per_page=n_results):
18
+ break
19
+
20
+ df_works = pd.DataFrame(page)
21
+
22
+ return df_works
23
+
24
+
25
+ def make_network(self):
26
+ pass
27
+
28
+
29
+ def get_abstract_from_inverted_index(self,index):
30
+
31
+ # Determine the maximum index to know the length of the reconstructed array
32
+ max_index = max([max(positions) for positions in index.values()])
33
+
34
+ # Initialize a list with placeholders for all positions
35
+ reconstructed = [''] * (max_index + 1)
36
+
37
+ # Iterate through the inverted index and place each token at its respective position(s)
38
+ for token, positions in index.items():
39
+ for position in positions:
40
+ reconstructed[position] = token
41
+
42
+ # Join the tokens to form the reconstructed sentence(s)
43
+ return ' '.join(reconstructed)
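
OpenAlex ships abstracts as inverted indexes (token -> positions); a tiny worked example of the reconstruction above:

    oa = OpenAlex()
    index = {"climate": [0], "change": [1], "is": [2], "measurable": [3]}
    print(oa.get_abstract_from_inverted_index(index))
    # -> "climate change is measurable"
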
climateqa/{knowledge → papers}/openalex.py RENAMED
@@ -3,32 +3,18 @@ import networkx as nx
3
  import matplotlib.pyplot as plt
4
  from pyvis.network import Network
5
 
6
- from langchain_core.retrievers import BaseRetriever
7
- from langchain_core.vectorstores import VectorStoreRetriever
8
- from langchain_core.documents.base import Document
9
- from langchain_core.vectorstores import VectorStore
10
- from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
11
-
12
- from ..engine.utils import num_tokens_from_string
13
-
14
- from typing import List
15
- from pydantic import Field
16
-
17
  from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
18
  import pyalex
19
 
20
  pyalex.config.email = "[email protected]"
21
 
22
-
23
- def replace_nan_with_empty_dict(x):
24
- return x if pd.notna(x) else {}
25
-
26
  class OpenAlex():
27
  def __init__(self):
28
  pass
29
 
30
 
31
- def search(self,keywords:str,n_results = 100,after = None,before = None):
 
32
 
33
  if isinstance(keywords,str):
34
  works = Works().search(keywords)
@@ -41,21 +27,18 @@ class OpenAlex():
41
  break
42
 
43
  df_works = pd.DataFrame(page)
44
- df_works = df_works.dropna(subset = ["title"])
45
- df_works["primary_location"] = df_works["primary_location"].map(replace_nan_with_empty_dict)
46
- df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x)).fillna("")
47
  df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
48
  df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
49
- df_works["url"] = df_works["id"]
50
- df_works["content"] = (df_works["title"] + "\n" + df_works["abstract"]).map(lambda x : x.strip())
51
- df_works["num_tokens"] = df_works["content"].map(lambda x : num_tokens_from_string(x))
52
-
53
- df_works = df_works.drop(columns = ["abstract_inverted_index"])
54
- # df_works["subtitle"] = df_works["title"] + " - " + df_works["primary_location"]["source"]["display_name"] + " - " + df_works["publication_year"]
55
-
56
- return df_works
57
  else:
58
- raise Exception("Keywords must be a string")
 
 
 
 
 
59
 
60
 
61
  def rerank(self,query,df,reranker):
@@ -156,36 +139,4 @@ class OpenAlex():
156
  reconstructed[position] = token
157
 
158
  # Join the tokens to form the reconstructed sentence(s)
159
- return ' '.join(reconstructed)
160
-
161
-
162
-
163
- class OpenAlexRetriever(BaseRetriever):
164
- min_year:int = 1960
165
- max_year:int = None
166
- k:int = 100
167
-
168
- def _get_relevant_documents(
169
- self, query: str, *, run_manager: CallbackManagerForRetrieverRun
170
- ) -> List[Document]:
171
-
172
- openalex = OpenAlex()
173
-
174
- # Search for documents
175
- df_docs = openalex.search(query,n_results=self.k,after = self.min_year,before = self.max_year)
176
-
177
- docs = []
178
- for i,row in df_docs.iterrows():
179
- num_tokens = row["num_tokens"]
180
-
181
- if num_tokens < 50 or num_tokens > 1000:
182
- continue
183
-
184
- doc = Document(
185
- page_content = row["content"],
186
- metadata = row.to_dict()
187
- )
188
- docs.append(doc)
189
- return docs
190
-
191
-
 
3
  import matplotlib.pyplot as plt
4
  from pyvis.network import Network
5
 
 
 
 
 
 
 
 
 
 
 
 
6
  from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
7
  import pyalex
8
 
9
  pyalex.config.email = "[email protected]"
10
 
 
 
 
 
11
  class OpenAlex():
12
  def __init__(self):
13
  pass
14
 
15
 
16
+
17
+ def search(self,keywords,n_results = 100,after = None,before = None):
18
 
19
  if isinstance(keywords,str):
20
  works = Works().search(keywords)
 
27
  break
28
 
29
  df_works = pd.DataFrame(page)
30
+ df_works["abstract"] = df_works["abstract_inverted_index"].apply(lambda x: self.get_abstract_from_inverted_index(x))
 
 
31
  df_works["is_oa"] = df_works["open_access"].map(lambda x : x.get("is_oa",False))
32
  df_works["pdf_url"] = df_works["primary_location"].map(lambda x : x.get("pdf_url",None))
33
+ df_works["content"] = df_works["title"] + "\n" + df_works["abstract"]
34
+
 
 
 
 
 
 
35
  else:
36
+ df_works = []
37
+ for keyword in keywords:
38
+ df_keyword = self.search(keyword,n_results = n_results,after = after,before = before)
39
+ df_works.append(df_keyword)
40
+ df_works = pd.concat(df_works,ignore_index=True,axis = 0)
41
+ return df_works
42
 
43
 
44
  def rerank(self,query,df,reranker):
 
139
  reconstructed[position] = token
140
 
141
  # Join the tokens to form the reconstructed sentence(s)
142
+ return ' '.join(reconstructed)
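
search now also accepts a list of keywords and concatenates the per-keyword result frames; a sketch (performs live calls against the OpenAlex API):

    oa = OpenAlex()
    df = oa.search(["sea level rise", "ocean acidification"], n_results=20)
    print(df[["title", "is_oa", "pdf_url"]].head())
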
front/__init__.py DELETED
File without changes
front/callbacks.py DELETED
File without changes
front/utils.py DELETED
@@ -1,142 +0,0 @@
1
-
2
- import re
3
-
4
- def make_pairs(lst):
5
- """from a list of even lenght, make tupple pairs"""
6
- return [(lst[i], lst[i + 1]) for i in range(0, len(lst), 2)]
7
-
8
-
9
- def serialize_docs(docs):
10
- new_docs = []
11
- for doc in docs:
12
- new_doc = {}
13
- new_doc["page_content"] = doc.page_content
14
- new_doc["metadata"] = doc.metadata
15
- new_docs.append(new_doc)
16
- return new_docs
17
-
18
-
19
-
20
- def parse_output_llm_with_sources(output):
21
- # Split the content into a list of text and "[Doc X]" references
22
- content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
23
- parts = []
24
- for part in content_parts:
25
- if part.startswith("Doc"):
26
- subparts = part.split(",")
27
- subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
28
- subparts = [f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup>{subpart}</sup></span></a>""" for subpart in subparts]
29
- parts.append("".join(subparts))
30
- else:
31
- parts.append(part)
32
- content_parts = "".join(parts)
33
- return content_parts
34
-
35
-
36
- def make_html_source(source,i):
37
- meta = source.metadata
38
- # content = source.page_content.split(":",1)[1].strip()
39
- content = source.page_content.strip()
40
-
41
- toc_levels = []
42
- for j in range(2):
43
- level = meta[f"toc_level{j}"]
44
- if level != "N/A":
45
- toc_levels.append(level)
46
- else:
47
- break
48
- toc_levels = " > ".join(toc_levels)
49
-
50
- if len(toc_levels) > 0:
51
- name = f"<b>{toc_levels}</b><br/>{meta['name']}"
52
- else:
53
- name = meta['name']
54
-
55
- score = meta['reranking_score']
56
- if score > 0.8:
57
- color = "score-green"
58
- elif score > 0.4:
59
- color = "score-orange"
60
- else:
61
- color = "score-red"
62
-
63
- relevancy_score = f"<p class=relevancy-score>Relevancy score: <span class='{color}'>{score:.1%}</span></p>"
64
-
65
- if meta["chunk_type"] == "text":
66
-
67
- card = f"""
68
- <div class="card" id="doc{i}">
69
- <div class="card-content">
70
- <h2>Doc {i} - {meta['short_name']} - Page {int(meta['page_number'])}</h2>
71
- <p>{content}</p>
72
- {relevancy_score}
73
- </div>
74
- <div class="card-footer">
75
- <span>{name}</span>
76
- <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
77
- <span role="img" aria-label="Open PDF">🔗</span>
78
- </a>
79
- </div>
80
- </div>
81
- """
82
-
83
- else:
84
-
85
- if meta["figure_code"] != "N/A":
86
- title = f"{meta['figure_code']} - {meta['short_name']}"
87
- else:
88
- title = f"{meta['short_name']}"
89
-
90
- card = f"""
91
- <div class="card card-image">
92
- <div class="card-content">
93
- <h2>Image {i} - {title} - Page {int(meta['page_number'])}</h2>
94
- <p>{content}</p>
95
- <p class='ai-generated'>AI-generated description</p>
96
- {relevancy_score}
97
- </div>
98
- <div class="card-footer">
99
- <span>{name}</span>
100
- <a href="{meta['url']}#page={int(meta['page_number'])}" target="_blank" class="pdf-link">
101
- <span role="img" aria-label="Open PDF">🔗</span>
102
- </a>
103
- </div>
104
- </div>
105
- """
106
-
107
- return card
108
-
109
-
110
-
111
- def make_toolbox(tool_name,description = "",checked = False,elem_id = "toggle"):
112
-
113
- if checked:
114
- span = "<span class='checkmark'>&#10003;</span>"
115
- else:
116
- span = "<span class='loader'></span>"
117
-
118
- # toolbox = f"""
119
- # <div class="dropdown">
120
- # <label for="{elem_id}" class="dropdown-toggle">
121
- # {span}
122
- # {tool_name}
123
- # <span class="caret"></span>
124
- # </label>
125
- # <input type="checkbox" id="{elem_id}" hidden/>
126
- # <div class="dropdown-content">
127
- # <p>{description}</p>
128
- # </div>
129
- # </div>
130
- # """
131
-
132
-
133
- toolbox = f"""
134
- <div class="dropdown">
135
- <label for="{elem_id}" class="dropdown-toggle">
136
- {span}
137
- {tool_name}
138
- </label>
139
- </div>
140
- """
141
-
142
- return toolbox
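
Of the deleted helpers, parse_output_llm_with_sources did the citation rewriting; a worked example using the function reproduced above:

    text = "Warming is unequivocal [Doc 1, Doc 3]."
    print(parse_output_llm_with_sources(text))
    # each Doc reference becomes an <a href="#docN" ...><sup>N</sup></a> link to its source card
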
requirements.txt CHANGED
@@ -1,20 +1,13 @@
1
- gradio==5.0.2
2
  azure-storage-file-share==12.11.1
3
  azure-storage-blob
4
  python-dotenv==1.0.0
5
- langchain==0.2.1
6
- langchain_openai==0.1.7
7
- langgraph==0.0.55
8
- pinecone-client==4.1.0
9
  sentence-transformers==2.6.0
10
  huggingface-hub
 
11
  pyalex==0.13
12
  networkx==3.2.1
13
- pyvis==0.3.2
14
- flashrank==0.2.5
15
- rerankers==0.3.0
16
- torch==2.3.0
17
- nvidia-cudnn-cu12==8.9.2.26
18
- langchain-community==0.2
19
- msal==1.31
20
- matplotlib==3.9.2
 
1
+ gradio==4.19.1
2
  azure-storage-file-share==12.11.1
3
  azure-storage-blob
4
  python-dotenv==1.0.0
5
+ langchain==0.1.4
6
+ langchain_openai==0.0.6
7
+ pinecone-client==3.0.2
 
8
  sentence-transformers==2.6.0
9
  huggingface-hub
10
+ msal
11
  pyalex==0.13
12
  networkx==3.2.1
13
+ pyvis==0.3.2
 
 
 
 
 
 
 
sandbox/20240310 - CQA - Semantic Routing 1.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
style.css CHANGED
@@ -2,14 +2,6 @@
2
  /* :root {
3
  --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
4
  } */
5
- .avatar-container.svelte-1x5p6hu:not(.thumbnail-item) img {
6
- width: 100%;
7
- height: 100%;
8
- object-fit: cover;
9
- border-radius: 50%;
10
- padding: 0px;
11
- margin: 0px;
12
- }
13
 
14
  .warning-box {
15
  background-color: #fff3cd;
@@ -65,7 +57,6 @@ body.dark .tip-box * {
65
 
66
  .message{
67
  font-size:14px !important;
68
-
69
  }
70
 
71
 
@@ -74,10 +65,6 @@ a {
74
  color: inherit;
75
  }
76
 
77
- .doc-ref sup{
78
- color:#dc2626!important;
79
- /* margin-right:1px; */
80
- }
81
  .card {
82
  background-color: white;
83
  border-radius: 10px;
@@ -376,108 +363,3 @@ span.chatbot > p > img{
376
  .a-doc-ref{
377
  text-decoration: none !important;
378
  }
379
-
380
-
381
- .dropdown {
382
- position: relative;
383
- display:inline-block;
384
- margin-bottom: 10px;
385
- }
386
-
387
- .dropdown-toggle {
388
- background-color: #f2f2f2;
389
- color: black;
390
- padding: 10px;
391
- font-size: 16px;
392
- cursor: pointer;
393
- display: block;
394
- width: 400px; /* Adjust width as needed */
395
- position: relative;
396
- display: flex;
397
- align-items: center; /* Vertically center the contents */
398
- justify-content: left;
399
- }
400
-
401
- .dropdown-toggle .caret {
402
- content: "";
403
- position: absolute;
404
- right: 10px;
405
- top: 50%;
406
- border-left: 5px solid transparent;
407
- border-right: 5px solid transparent;
408
- border-top: 5px solid black;
409
- transform: translateY(-50%);
410
- }
411
-
412
- input[type="checkbox"] {
413
- display: none !important;
414
- }
415
-
416
- input[type="checkbox"]:checked + .dropdown-content {
417
- display: block;
418
- }
419
-
420
- .dropdown-content {
421
- display: none;
422
- position: absolute;
423
- background-color: #f9f9f9;
424
- min-width: 300px;
425
- box-shadow: 0 8px 16px 0 rgba(0,0,0,0.2);
426
- z-index: 1;
427
- padding: 12px;
428
- border: 1px solid #ccc;
429
- }
430
-
431
- input[type="checkbox"]:checked + .dropdown-toggle + .dropdown-content {
432
- display: block;
433
- }
434
-
435
- input[type="checkbox"]:checked + .dropdown-toggle .caret {
436
- border-top: 0;
437
- border-bottom: 5px solid black;
438
- }
439
-
440
- .loader {
441
- border: 1px solid #d0d0d0 !important; /* Light grey background */
442
- border-top: 1px solid #db3434 !important; /* Blue color */
443
- border-right: 1px solid #3498db !important; /* Blue color */
444
- border-radius: 50%;
445
- width: 20px;
446
- height: 20px;
447
- animation: spin 2s linear infinite;
448
- display:inline-block;
449
- margin-right:10px !important;
450
- }
451
-
452
- .checkmark{
453
- color:green !important;
454
- font-size:18px;
455
- margin-right:10px !important;
456
- }
457
-
458
- @keyframes spin {
459
- 0% { transform: rotate(0deg); }
460
- 100% { transform: rotate(360deg); }
461
- }
462
-
463
-
464
- .relevancy-score{
465
- margin-top:10px !important;
466
- font-size:10px !important;
467
- font-style:italic;
468
- }
469
-
470
- .score-green{
471
- color:green !important;
472
- }
473
-
474
- .score-orange{
475
- color:orange !important;
476
- }
477
-
478
- .score-red{
479
- color:red !important;
480
- }
481
- .message-buttons-left.panel.message-buttons.with-avatar {
482
- display: none;
483
- }
 
2
  /* :root {
3
  --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
4
  } */
 
 
 
 
 
 
 
 
5
 
6
  .warning-box {
7
  background-color: #fff3cd;
 
57
 
58
  .message{
59
  font-size:14px !important;
 
60
  }
61
 
62
 
 
65
  color: inherit;
66
  }
67
 
 
 
 
 
68
  .card {
69
  background-color: white;
70
  border-radius: 10px;
 
363
  .a-doc-ref{
364
  text-decoration: none !important;
365
  }
test.json DELETED
File without changes