Spaces:

anirudhs
/

web-researcher

Sleeping

App Files Files Community

anirudhs committed on Jan 2

Commit

8fd59af

1 Parent(s): dab3f26

added researcher files

Browse files

Files changed (7) hide show

app.py +72 -4
improve_content.py +233 -0
llm_config.py +146 -0
prompts.py +282 -0
research_manager.py +475 -0
search.py +45 -0
ui.py +74 -0

app.py CHANGED Viewed

@@ -1,11 +1,79 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!"
-demo = gr.Interface(fn=greet, inputs="textbox", outputs="textbox")
 if __name__ == "__main__":
-    demo.launch()

+from colorama import init, Fore, Style
+from research_manager import ResearchManager
 import gradio as gr
+import os
+def run_research(topic_input, questions_input, section_length, groq_key):
+    research_task = {
+        "topic" : topic_input,
+        "key_questions" : questions_input.split(','),
+        "report_type" : "market study",
+        'section_length' : section_length,
+         "guidelines": [
+            "The report MUST fully answer all the questions",
+            "The report MUST only contain information that can be cited from a URL content",
+            "The report DOES NOT contain unverified information and contains only facts",
+        ],
+    }
+    os.environ['GEMINI_API_KEY'] = groq_key
+    # Initialize Research Manager
+    research_manager = ResearchManager(research_task)
+    yield from research_manager.start_research()
+def main():
+    with gr.Blocks(css="""
+        .report-container {
+            height: 90vh;
+            overflow-y: auto;
+            border: 1px solid #ddd;
+            border-radius: 4px;
+            padding: 1rem;
+        }
+    """) as demo:
+        with gr.Row():
+            with gr.Column():
+                topic_input = gr.Textbox(
+                    label="Research Topic",
+                    placeholder="Enter your research topic...",
+                    value="How is MS copilot performing in the enterprise search market ?"
+                )
+                questions_input = gr.Textbox(
+                    label="Key Questions (comma-separated)",
+                    placeholder="Enter key questions...",
+                    value="What are the user reviews?,  How is the pricing structure?, how does it compare against glean?"
+                )
+                section_length = gr.Slider(
+                    label="Section Length (words)",
+                    minimum=300,
+                    maximum=500,
+                    step=100,
+                    value=300)
+                groq_key = gr.Textbox(
+                    label="Cerebras API Key",
+                    info="#### Get your free Cerebras key from [cloud.cerebras.ai/](https://cloud.cerebras.ai/)",
+                    placeholder="Enter your Cerebras key...",
+                    value="your_groq_key"
+                )
+                start_btn = gr.Button("Start Research", variant="primary")
+                progress_output = gr.HTML(label="Progress Bar")
+            with gr.Column():
+                with gr.Column(elem_classes="report-container"):
+                    report_outline = gr.Markdown(label="Report Outline")
+        start_btn.click(
+            fn=run_research,
+            inputs=[topic_input, questions_input, section_length, groq_key],
+            outputs=[start_btn, progress_output, report_outline],
+            show_progress="bar"
+        )
+    demo.queue().launch()
 if __name__ == "__main__":
+    main()

improve_content.py ADDED Viewed

	@@ -0,0 +1,233 @@

+from typing import List, TypedDict
+from llm_config import get_llm_instructor, call_llm
+from pydantic import BaseModel, Field
+import ui
+import prompts
+from search import fetch_search_results, format_search_results
+import random
+import time
+from dotenv import load_dotenv
+import re
+load_dotenv()
+# Structured LLM output for one roundtable turn (the response_model used by
+# warm_start_discussion). Documented with comments rather than a docstring so
+# nothing is added to the schema pydantic generates for the model.
+class RoundtableMessage(BaseModel):
+    response: str = Field(..., title="Your response")
+    follow_up: str = Field(..., title="Your follow-up question")
+    # Compared against persona names to pick the next speaker.
+    next_persona: str = Field(..., title="Who you are asking the question to")
+class ContentState(TypedDict):
+    previous_messages: List[dict]
+    content: str
+    expert_question: str
+    iteration: int
+    full_messages: List[str]
+    refernces : str
+# Structured LLM output: the search queries generated in answer_question.
+class Queries(BaseModel):
+    queries : List[str] = Field(..., title="List of queries to search for")
+# Structured LLM output: the single question produced by expert_question_generator.
+class PersonaQuestion(BaseModel):
+    question: str = Field(..., title="Your question for the expert")
+# Structured LLM output for the answer step in answer_question.
+# NOTE(review): the class name is misspelled ("Strucutred"); it is referenced by
+# that spelling elsewhere in this file, so any rename must be done in one pass.
+class StrucutredAnswer(BaseModel):
+    answer_response: str = Field(..., title="The response to the question with citations")
+    # Used by answer_question as 1-based indices into the search result list.
+    references_used: List[int] = Field(..., title="The references used to answer the question")
+class ImproveContent:
+    def __init__(self, section_topic, section_description, section_key_questions, personas):
+        """Store the section metadata and personas and set the loop parameters.
+
+        ``task_status`` and ``update_ui_fn`` are not set here; they are assigned
+        by ``create_and_run_interview`` before ``answer_question`` reads them.
+        """
+        self.section_topic = section_topic
+        self.section_description = section_description
+        self.section_key_questions = section_key_questions
+        # NOTE(review): not used by any method visible in this class; call_llm
+        # builds its own client.
+        self.client = get_llm_instructor()
+        # Despite the name, this is the next reference number to hand out when
+        # citations are renumbered; it starts at 1 and only ever increases.
+        self.num_search_result = 1
+        self.num_interview_rounds = 3
+        self.personas = personas
+        # Number of turns generated by warm_start_discussion.
+        self.warm_start_rounds = 10
+    # Define the initial state
+    def create_initial_state(self) -> ContentState:
+        return {
+            "expert_question": "",
+            "iteration": 0,
+            'previous_messages': [],
+            'full_messages': [],
+            'references' : ''
+        }
+    def expert_question_generator(self, persona, state: ContentState) -> ContentState:
+        """Ask the LLM for the persona's next question and record it in *state*.
+
+        The conversation so far (``state['previous_messages']``) is sent along,
+        so the model sees its earlier questions and the answers they received.
+        The question is stored in ``expert_question`` and appended to both
+        ``previous_messages`` (as an assistant turn) and ``full_messages``.
+
+        Returns:
+            The same *state* dict, mutated in place.
+        """
+        response = call_llm(
+            instructions=prompts.QUALITY_CHECKER_INSTRUCTIONS,
+            additional_messages= state['previous_messages'],
+            context={
+                "title_description": self.section_description + ":" + self.section_topic,
+                # NOTE(review): the QUALITY_CHECKER_INSTRUCTIONS template shown
+                # in this commit has no {{key_questions}} placeholder.
+                "key_questions": self.section_key_questions,
+                'persona': persona.persona
+            },
+            response_model=PersonaQuestion,
+            logging_fn="quality_checker"
+        )
+        ui.system_sub_update("-------------------")
+        ui.system_sub_update(f'{persona.name} ({persona.role},{persona.affiliation}):')
+        ui.system_sub_update(response.question)
+        ui.system_sub_update("-------------------")
+        state["expert_question"] = response.question
+        state['previous_messages'].append({'role' : 'assistant', 'content': response.question})
+        state['full_messages'].append(response.question)
+        return state
+    def replace_references(self, text: str, references_list: List[int]) -> str:
+        """Helper method to replace bracketed references with unique numbering."""
+        for idx in references_list:
+            text = text.replace(f"[{idx}]", f"[{self.num_search_result}]")
+            self.num_search_result += 1
+        return text
+    def answer_question(self, persona, state: ContentState):
+        queries = call_llm(
+            instructions=prompts.IMPROVE_CONTENT_CREATE_QUERY_INSTRUCTIONS,
+            model_type='fast',
+            context={
+                "section_topic": self.section_topic,
+                "expert_question": state["expert_question"],
+                'persona': persona.persona
+            },
+            response_model=Queries,
+            logging_fn="improve_content_create_query"
+        )
+        search_results, search_results_list = yield from fetch_search_results(queries.queries, self.task_status, self.section_topic, self.update_ui_fn)
+        # Hit the search engine to fetch relevant documents
+        if search_results_list == []:
+            queries = call_llm(
+                instructions=prompts.IMPROVE_CONTENT_CREATE_QUERY_INSTRUCTIONS,
+                model_type='fast',
+                context={
+                    "section_topic": self.section_topic,
+                    "expert_question": state["expert_question"],
+                    'persona': persona.persona
+                },
+                response_model=Queries,
+                logging_fn="improve_content_create_query_fallback"
+                )
+            search_results, search_results_list = yield from fetch_search_results(queries.queries, self.task_status,self.section_topic, self.update_ui_fn)
+        response = call_llm(
+            instructions=prompts.IMPORVE_CONTENT_ANSWER_QUERY_INSTRUCTION,
+            model_type='rag',
+            context={
+                "section_topic": self.section_topic,
+                "expert_question": state["expert_question"],
+                "search_results": search_results,
+                'persona' : persona.persona
+            },
+            response_model=StrucutredAnswer,
+            logging_fn="improve_content_answer_query"
+        )
+        state["content"] =response.answer_response
+        references_used = format_search_results([search_results_list[i-1] for i in response.references_used])
+        # Find all unique bracketed references in the search results
+        bracketed_refs = re.findall(r'\[(\d+)\](?=\s*Title:)', search_results)
+        #Replace citations[2,3,4] with [2][3][4]
+        cited_references_raw = re.findall(r'\[(\d+(?:,\s*\d+)*)\]', response.answer_response)
+        for group in cited_references_raw:
+            nums_list = group.split(',')
+            new_string = ''.join(f'[{n.strip()}]' for n in nums_list)
+            old_string = f'[{group}]'
+            response.answer_response = response.answer_response.replace(old_string, new_string)
+        # Replace each reference number with its a unique search number
+        for ref in bracketed_refs:
+            search_results = search_results.replace(f'[{ref}]', f"[{self.num_search_result}]")
+            response.answer_response = response.answer_response.replace(f'[{ref}]', f"[{self.num_search_result}]")
+            self.num_search_result += 1
+        ui.system_sub_update("-------------------")
+        ui.system_sub_update('Content:')
+        ui.system_sub_update(response.answer_response)
+        ui.system_sub_update("-------------------")
+        state['previous_messages'].append({'role' : 'user', 'content' : response.answer_response})
+        state['full_messages'].append(response.answer_response)
+        state['references'] = state['references'] +  '\n\n' + search_results
+        state["iteration"] += 1
+        return state
+    def create_and_run_interview(self, task_status, update_ui_fn):
+        """Runs an iterative process of generating questions and answers
+           until the iteration limit is reached."""
+        self.task_status = task_status
+        self.update_ui_fn = update_ui_fn
+        discussion_messages = []
+        for persona in self.personas:
+            ui.system_update(f"Starting discussion with : {persona.name}: {persona.role}, {persona.affiliation}")
+            state = self.create_initial_state()
+            while state["iteration"] <= self.num_interview_rounds:
+                state = self.expert_question_generator(persona, state)
+                state = yield from self.answer_question(persona, state)
+            discussion_messages.extend(state['previous_messages'])
+        self.final_state = state
+        return discussion_messages
+    def generate_final_section(self, synopsis):
+        return '\n\n'.join(self.final_state['full_messages']), self.final_state['references']
+    def warm_start_discussion(self):
+        """Warm start the discussion with existing personas"""
+        messages = [f"{self.personas[0].name}: Hi! Let's get started!"]
+        selected_persona = random.choice(self.personas)
+        for _ in range(self.warm_start_rounds):
+            # Get the last 5 messages if there are more than 5
+            recent_messages = messages[-5:] if len(messages) > 5 else messages
+            message = call_llm(
+                instructions=prompts.ROUNDTABLE_DISCUSSION_INSTRUCTIONS,
+                model_type='fast',
+                context={
+                   "persona_name" : selected_persona.name,
+                   "persona_role" : selected_persona.role,
+                   "persona_affiliation" : selected_persona.affiliation,
+                   "persona_focus" : selected_persona.focus,
+                   "personas" :
+                        "\n\n".join([p.name + '\n' + p.persona for p in self.personas if p != selected_persona]),
+                   "discussion" : "\n\n".join(recent_messages)
+                        },
+                response_model=RoundtableMessage,
+                logging_fn="roundtable_discussion"
+                  )
+            ui.system_sub_update("\n\n" + selected_persona.name + ": " + message.response + '\n' + message.follow_up)
+            messages.append(selected_persona.name + ": " + message.response + '\n' + message.follow_up)
+            selected_persona = [p for p in self.personas if p.name == message.next_persona][0]
+            time.sleep(3)
+        return messages
+# Manual smoke test for ImproveContent.
+# NOTE(review): this block does not match the class as written above:
+#   - personas are plain strings, but the methods read .name/.role/.affiliation/.persona;
+#   - create_and_run_interview requires (task_status, update_ui_fn) and is a
+#     generator, so calling it only creates a generator object;
+#   - generate_final_section requires a `synopsis` argument.
+if __name__ == "__main__":
+    section_name = 'Glean Search in the Enterprise Search Market'
+    section_description = 'Positioning and Competition'
+    section_key_questions = ['how is glean positioned in the enterprise search market?', "who are the main competitors in this space?"]
+    personas = ['\nRole: Business Analyst\nAffiliation: Enterprise Software Consultant\nDescription: Specializes in helping organizations implement and optimize AI-powered tools for improved productivity and knowledge management. Will analyze Glean and Copilot from a business user perspective.\n']
+    improve_content = ImproveContent(section_name, section_description, section_key_questions, personas)
+    improved_content = improve_content.create_and_run_interview()
+    improve_content.generate_final_section()
+    print(improved_content)

llm_config.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import os
+from tenacity import wait_exponential, Retrying, stop_after_attempt
+from dotenv import load_dotenv
+import google.generativeai as genai
+from groq import Groq
+import instructor
+from openai import OpenAI
+from cerebras.cloud.sdk import Cerebras
+from limits import storage, strategies, parse
+from typing import List, TypedDict, Union, Annotated, Dict, Any, Tuple
+import time
+from instructor.exceptions import InstructorRetryException
+# In-memory moving-window rate limiter consulted by call_llm before each request.
+memory_storage = storage.MemoryStorage()
+moving_window = strategies.MovingWindowRateLimiter(memory_storage)
+rate_limit = parse("10/minute")
+# Model name per tier ('slow' / 'fast' / 'rag'); all three currently name the same model.
+MODEL = 'gemini-1.5-flash-latest'
+MODEL_FAST = 'gemini-1.5-flash-latest'
+MODEL_RAG = 'gemini-1.5-flash-latest'
+# Global variable to track LLM usage
+# NOTE: the three keys below are the same string, so this literal collapses to
+# a single entry.
+_LLM_USAGE = {  MODEL: {"input_tokens": 0, "output_tokens": 0},
+                MODEL_FAST: {"input_tokens": 0, "output_tokens": 0},
+                MODEL_RAG: {"input_tokens": 0, "output_tokens": 0}}
+# One record per call_llm invocation: {'function', 'input_usage', 'output_usage'}.
+_LLM_USAGE_SPLIT = []
+def get_llm_usage():
+    print(_LLM_USAGE)
+    print(_LLM_USAGE_SPLIT)
+    # Calculate total usage per function
+    function_totals = {}
+    for entry in _LLM_USAGE_SPLIT:
+        fn = entry['function']
+        if fn not in function_totals:
+            function_totals[fn] = {'total_input': 0, 'total_output': 0}
+        function_totals[fn]['total_input'] += entry['input_usage']
+        function_totals[fn]['total_output'] += entry['output_usage']
+    return _LLM_USAGE, _LLM_USAGE_SPLIT, function_totals
+# Load variables from a .env file (if present) before any API key is read.
+load_dotenv()
+# Selects the provider branch in get_llm_instructor and call_llm:
+# 'groq', 'openrouter', 'cerebras' or 'google'.
+LLM_TYPE = 'google'
+def get_llm_instructor():
+    if LLM_TYPE == 'groq':
+        return instructor.from_groq(Groq(api_key=os.environ["GROQ_API_KEY"]), mode=instructor.Mode.TOOLS)
+    elif LLM_TYPE == 'openrouter':
+        return instructor.from_openai(OpenAI(api_key=os.getenv("OPENROUTER_API_KEY"), base_url="https://openrouter.ai/api/v1"), mode=instructor.Mode.MD_JSON)
+    elif LLM_TYPE == 'cerebras':
+        return instructor.from_cerebras(Cerebras(api_key = os.environ['CEREBRAS_API_KEY']), mode = instructor.Mode.CEREBRAS_JSON)
+    elif LLM_TYPE == 'google':
+        return instructor.from_gemini(client=genai.GenerativeModel(model_name="models/gemini-1.5-flash-latest",
+                                                                   generation_config=genai.configure(api_key= os.environ['GEMINI_API_KEY'])),
+                                                                    mode=instructor.Mode.GEMINI_JSON)
+def call_llm(instructions: str, context: dict, response_model: Any, model_type:str = 'slow', additional_messages: List[Dict[str, str]] = None, logging_fn = 'default') -> Any:
+        """Standardizes LLM calls with optional retries."""
+        messages = [{"role": "system", "content": instructions}]
+        if additional_messages:
+            messages.extend(additional_messages)
+        while not moving_window.test(rate_limit):
+            time.sleep(0.1)
+        model = MODEL_RAG if model_type == 'rag' else (MODEL if model_type == 'slow' else MODEL_FAST)
+        try:
+            client = get_llm_instructor()
+            if LLM_TYPE == 'google':
+                response, completion = client.chat.completions.create_with_completion(
+                            messages=messages,
+                            context=context,
+                            response_model=response_model
+                        )
+            else:
+                response, completion = client.chat.completions.create_with_completion(
+                            model=model,
+                            messages=messages,
+                            temperature=0.5,
+                            context=context,
+                            max_retries=Retrying(stop = stop_after_attempt(2), wait= wait_exponential(multiplier=1.5, min=10, max=60)),
+                            response_model=response_model
+                        )
+        except InstructorRetryException as e:
+            print(e)
+            while not moving_window.test(rate_limit):
+                time.sleep(0.1)
+            def retry_callback(retry_state):
+                # Increase temperature on each retry
+                print('retrying....')
+                new_temp = 0.1 + (retry_state.attempt_number * 0.2)
+                return max(0.1, min(0.9, new_temp))  # Keep between 0.1 and 0.9
+            if LLM_TYPE == 'google':
+                response, completion = client.chat.completions.create_with_completion(
+                                messages=messages,
+                                context=context,
+                                response_model=response_model,
+                                max_retries=Retrying(
+                                    stop=stop_after_attempt(3),
+                                    wait=wait_exponential(multiplier=1.5, min=10, max=60),
+                                    before=retry_callback
+                                )
+                            )
+            else:
+                response, completion = client.chat.completions.create_with_completion(
+                        model=model,
+                        messages=messages,
+                        context=context,
+                        response_model=response_model,
+                        max_retries=3
+                    )
+            # Update usage statistics
+        usage = completion.usage_metadata if LLM_TYPE == 'google' else completion.usage
+        input_tokens = usage.prompt_token_count if LLM_TYPE == 'google' else usage.prompt_tokens
+        output_tokens = usage.candidates_token_count if LLM_TYPE == 'google' else usage.completion_tokens
+        _LLM_USAGE[model]['input_tokens'] += input_tokens
+        _LLM_USAGE[model]['output_tokens'] += output_tokens
+        _LLM_USAGE_SPLIT.append({
+            'function': logging_fn,
+            'input_usage': input_tokens,
+            'output_usage': output_tokens
+        })
+        return response
+# Manual smoke test: one call_llm round trip with the default ('slow') model.
+# NOTE(review): ResponseModel is a TypedDict, whereas every other caller in
+# this commit passes a pydantic BaseModel as response_model - confirm the
+# client accepts a TypedDict here.
+if __name__ ==  "__main__":
+    class ResponseModel(TypedDict):
+        answer: str
+    instructions = "What are the key differences between Glean Search and MS Copilot?"
+    context = {}
+    response_model = ResponseModel
+    print(call_llm(instructions, context, response_model))

prompts.py ADDED Viewed

	@@ -0,0 +1,282 @@

+# Prompt asking for exactly 5 search queries for a report.
+# Placeholders: report_type, original_query, report_synopsis.
+FIND_SEARCH_TERMS_INSTRUCTIONS = """
+                    You are writing a {{report_type}} report on the following topic:
+                    {{original_query}}
+                    Report synopsis:
+                    {{report_synopsis}}
+                    You MUST provide exactly 5 search queries to search for information to write this report.
+                    The search queries should allow you to get a breadth of information related to the topic.
+                    Make sure the queries are specific enough to find high-quality, relevant sources."""
+# Prompt for generating the report outline from a roundtable discussion.
+# Placeholders: report_type, num_sections, topic, discussion, context.
+GENERATE_REPORT_OUTLINE_INSTRUCTIONS = """You are an expert technical writer, helping to plan a {{report_type}} report.
+                        Your goal is to generate the outline of the sections of the report with a maximum of {{num_sections}} sections.
+                        The overall topic of the report is:
+                        {{topic}}
+                        Use the following roundtable discussion to generate the outline of the report.
+                        {{discussion}}
+                        This is the expectation of the reader from the report:
+                        {{context}}
+                        Now, generate the {{num_sections}} sections of the report. Each section should have the following fields:
+                        - Name - Name for this section of the report.
+                        - Description - what needs to be covered in this section?
+                        - Subsections - titles of the subsections if any.
+                        - Content - The content of the section, which you will leave blank for now.
+                        Ignore the Introduction and Conclusion sections. Respond in JSON format"""
+# Prompt that makes a persona ask the expert one question at a time.
+# Placeholders: title_description, persona.
+# NOTE(review): callers also pass key_questions, which this template does not reference.
+QUALITY_CHECKER_INSTRUCTIONS = """You are an experienced Wikipedia writer and want to edit a specific section of a page titled:
+                        {{title_description}}
+                        Besides your identity as a Wikipedia writer, you have a specific focus when researching the topic. \
+                        Now, you are chatting with an expert to get information. Ask good questions, one at a time, to get more useful information.
+                        Please **ONLY** ask one question at a time and don't ask what you have asked before.\
+                        Your questions should be related to {{title_description}}
+                        Be comprehensive and curious, gaining as much unique insight from the expert as possible.\
+                        Stay true to your specific perspective:
+                        {{persona}}
+                        Guidelines:
+                        - Do not introduce yourself or your role in the conversation
+                        - No need to thank the expert for their answers, just ask your next question.
+                        - Respond in JSON format only
+                    """
+WARM_START_DISCUSSION_INSTRUCTIONS = """You are an experienced Wikipedia writer and want to edit a specific section of a page titled:
+                        {{title_description}}
+                        Besides your identity as a Wikipedia writer, you have a specific focus when researching the topic. \
+                        Now, you are chatting with an expert to get information.
+                        Your questions should be related to {{title_description}}
+                        Be comprehensive and curious, gaining as much unique insight from the expert as possible.
+                        Stay true to your specific perspective:
+                        {{persona}}
+                        Contiue the following discussion:
+                        Guidelines:
+                        - Do not introduce yourself or your role in the conversation
+                        - No need to thank the expert for their answers, just ask your next question.
+                        - Respond in JSON format only
+                    """
+#                        Based on your focus, frame your questions so that you get info on the following:
+#                        {{key_questions}}
+# Prompt asking for 3 search queries that answer the expert's question.
+# Placeholders: persona, section_topic, expert_question.
+IMPROVE_CONTENT_CREATE_QUERY_INSTRUCTIONS = """You are an expert wikipedia writer who can use information effectively.
+                     Besides your identity as a Wikipedia writer, you have a specific focus when researching the topic. \
+                     Stay true to your persona and perspective:
+                        {{persona}}
+                     You are chatting with an expert who wants\
+                     to write a report on the topic you know : {{section_topic}}
+                     Experts Question : {{expert_question}}
+                     Generate 3 google search queries to find content that answers the experts question."""
+# Prompt asking for a 5-6 sentence synopsis of the report.
+# Placeholders: report_type, topic, key_questions.
+CREATE_SYNOPSIS_INSTRUCTIONS = """ You are a marketer for a publishing company and you are tasked with creating a synopsis for a {{report_type}} report.
+                                 Topic: {{topic}}
+                                 The reader wants some key questions answered : {{key_questions}}
+                                 Write a synopsis of the report in 5-6 sentences, so the reader knows what to expect.
+                              """
+# Prompt for answering the expert's question from search results with inline
+# [n] citations; includes a worked example.
+# Placeholders: persona, section_topic, expert_question, search_results.
+# NOTE(review): the constant name is misspelled ("IMPORVE"); improve_content.py
+# references it by this spelling, so any rename must touch both files.
+IMPORVE_CONTENT_ANSWER_QUERY_INSTRUCTION = """You are an expert wikipedia writer who can use information effectively.
+                     Besides your identity as a Wikipedia writer, you have a specific focus when researching the topic. \
+                     Stay true to your persona and perspective:
+                        {{persona}}
+                     You are chatting with an expert who wants\
+                     to write a report on the topic you know : {{section_topic}}
+                     Question : {{expert_question}}
+                     Search Results : {{search_results}}
+                     Response Guidelines:
+                     Make your response as informative as possible and make sure every sentence is supported by the gathered information.
+                     If the search results  is not directly related to the [Topic] and [Question], provide the most relevant answer you can based on the available information, and explain any limitations or gaps.
+                     You MUST use [1], [2], ..., [n] in line (for example, "The capital of the United States is Washington, D.C.[1][3].") referring to the search results.
+                     Do NOT list the sources at the end, but you need to cite the search results in your response.
+                     Your response should not exceed 150-200 words.
+                     Here's an example of how you must respond:
+                     <example>:
+                     Response: The James Webb Space Telescope (JWST) has revolutionized our understanding of the universe by capturing infrared light, allowing scientists to see the earliest galaxies formed after the Big Bang [1][2][5]. It also provides unparalleled clarity for studying exoplanet atmospheres and stellar formation [3][4].
+                     Search Results:
+                     [1] Title: Webb’s First Images Unveil the Cosmos in Unprecedented Detail
+                     Snippet: NASA's James Webb Space Telescope captures images of ancient galaxies formed just 200 million years after the Big Bang. Its ability to detect faint infrared light gives scientists new insights into the early universe.
+                     URL: XYZ
+                     [2] Title: The Science Behind Webb: Seeing the Unseen
+                     Snippet: Webb’s infrared instruments allow it to pierce through cosmic dust and gas, providing detailed views of star and planet formation that were previously obscured.
+                     URL: XYZ
+                     [3] Title: A Closer Look at Exoplanets with JWST
+                     Snippet: The James Webb Space Telescope offers an unprecedented ability to analyze exoplanet atmospheres, identifying key molecules like water vapor and methane that could indicate potential habitability.
+                     URL: XYZ
+                     [4] Title: Stellar Nurseries Revealed: Webb’s Role in Understanding Star Formation
+                     Snippet: Webb has provided high-resolution images of stellar nurseries, helping scientists understand how stars form and evolve in various cosmic environments.
+                     URL: XYZ
+                     [5] Title: How Webb's Infrared Technology Changes Our View of Space
+                     Snippet: Unlike the Hubble, Webb operates primarily in the infrared spectrum, which is crucial for detecting the faintest and most distant objects in the universe.
+                     URL: XYZ
+                     .......
+                     </example>
+                     Respond in JSON format without markdown
+                     """
+# Prompt for selecting the experts who take part in the roundtable discussion.
+# Placeholders: num_personas, topic, context, report_synopsis.
+GENERATE_ROUNDTABLE_PERSONAS_INSTRUCTIONS = """You need to select a diverse (and distinct) group of max {{num_personas}} experts who will participate in a roundtable discussion on the topic : {{topic}}
+                        The experts will help the audience understand unique perspectives that need to be covered in the report.
+                        Its important to go broad so that you can get different perspectives on the topic.\
+                        For example, if the discussion focus is about a recent event at a specific university, consider inviting students, faculty members, journalists covering the event, university officials, and local community members.
+                        You can use the provided context for inspiration. For each expert, add a 2-3 line description of what they will focus on and how they will make the roundtable discussion interesting for the audience.
+                        Search Engine snippets of the topic:
+                        {{context}}
+                        Base your personas on the following expectations from the audience:
+                        {{report_synopsis}}
+                      """
+# Prompt for one roundtable turn spoken as a given persona.
+# Placeholders: persona_name, persona_role, persona_affiliation, topic,
+# persona_focus, personas, discussion.
+# NOTE(review): warm_start_discussion in improve_content.py does not pass `topic`.
+ROUNDTABLE_DISCUSSION_INSTRUCTIONS = """You are {{persona_name}}, a {{persona_role}} working at {{persona_affiliation}}. You are participating in a roundtable discussion on the topic: {{topic}}
+                     Your focus area  is : {{persona_focus}}
+                     Along with you, the following experts are participating in the roundtable discussion:
+                     {{personas}}
+                     Here is the discussion so far:
+                     {{discussion}}
+                     Its your turn to contribute to the discussion, response with a short answer not exceeding 200 words. Also ask the next expert a question to keep the discussion going.
+                     """
+# Prompt for selecting the experts who help create the report.
+# Placeholders: num_personas, type_of_report, topic, report_synopsis.
+# NOTE(review): the text mentions "the provided context" but has no {{context}} placeholder.
+GENERATE_PERSONAS_INSTRUCTIONS = """You need to select a diverse (and distinct) group of max {{num_personas}} experts who will work together to create a comprehensive {{type_of_report}} report on the topic : {{topic}}
+                        The experts will help you understand the unique questions and perspectives that need to be covered in the report.
+                        Its important to go broad so that you can get different perspectives on the topic.\
+                        For example, if the discussion focus is about a recent event at a specific university, consider inviting students, faculty members, journalists covering the event, university officials, and local community members.
+                        You can use the provided context for inspiration. For each expert, add a 2-3 line description of what they will focus on and how they will help.
+                        The expectation from the reader is a report with the following synopsis:
+                        {{report_synopsis}}
+                      """
+# Prompt for condensing a discussion into a mindmap, by creating a new topic
+# or inserting into an existing one.
+# Placeholders: topic, mindmap, discussion.
+ORGANIZE_MINDMAP_INSTRUCTIONS = """You are a seasoned research assistant tasked with organizing the key concepts and ideas as a mindmap on the following topic:
+                     {{topic}}
+                     Here is your mindmap so far:
+                     {{mindmap}}
+                     Condense this discussion into the mindmap.
+                     {{discussion}}
+                     You can choose to create_new_topic or insert_into_existing_topic.
+                     If creating a new topic, provide a name for the new topic along with the subtopics
+                     If inserting into an existing topic, provide the name of the existing topic and the subtopics to insert.
+                     Respond in JSON format only.
+                      """
+# Prompt template for rebalancing an overgrown mindmap.
+# Placeholders used below: max_subtopics, mindmap.
+# NOTE(review): the example mapping on the last line uses single braces; that is
+# only safe if the template engine treats single braces as literal text --
+# confirm against the code that renders these templates.
+REORGANIZE_TOPIC_INSTRUCTIONS =  """
+                     Given the current mindmap structure, reorganize it into a more balanced and coherent structure.
+                     Each topic should have no more than {{max_subtopics}} subtopics.
+                     {{mindmap}}
+                     Organize these subtopics into 2-3 new topics that are more focused and manageable.
+                     Return:
+                     1. The new topics with their subtopics
+                     2. Mapping between the old structure to the new structure. Eg: {old_topic/old_sub_topic : new_topic/new_sub_topic}
+                     """
+# Prompt template for writing one titled, citation-backed subsection from a
+# discussion. Placeholders used below: discussion, references, section_title,
+# synopsis.
+# The model is told to cite inline as [1], [2], ... and not to append a source
+# list at the end.
+WRITE_TOPIC_SUMMARY_INSTRUCTIONS = """You are a seasoned research assistant tasked with writing a subsection based on the following discussion:
+                     {{discussion}}
+                     References:
+                     {{references}}
+                     Section Title: {{section_title}}
+                     Here's a synoposis of the full report (you are only writing one section of it). {{synopsis}} Keep this theme in mind when writing the summary.
+                     Give an title to the summary you generate.
+                     Make your response as informative as possible and make sure every sentence is supported by the gathered information.
+                     You MUST use [1], [2], ..., [n] in line (for example, "The capital of the United States is Washington, D.C.[1][3].") referring to the search results.
+                     Do NOT list the sources at the end, but you need to cite the search results in your response.
+                     """
+WRITE_SECTION_INSTRUCTIONS = """
+                              You are a seasoned wikipedia writer tasked with writing a section of a {{report_type}} report on: {{topic}}
+                              You have gathered information on multiple topics each with citations.
+                              {{gathered_info}}
+                              Now you need to write the section on :
+                              Section Title: {{section_title}}
+                              Section Description: {{section_description}}
+                              Readers expect to answer the following questions after reading this section: {{section_questions}}
+                              Here's a synoposis of the full report, of which you are writing the section: {{section_title}}:
+                              {{synopsis}}
+                              Keep this theme in mind when writing the summary.
+                              Guidelines for Section Writing:
+                              1. Keep the citations and reference numbers as-is. Do NOT change the reference numbers.
+                              2. In case you want to merge information, merge the citations as well (eg: sentenceA[2], sentenceB[3] --> the citation should be [2][3])
+                              3. Keep the langugage simple and easy to understand for a layman.
+                              4. Create subsections as required.
+                              5. The whole section should not be more than {{section_length}} words, including the subsections.
+                              5. Maintain markdown format with headers, bold, italics, newline and bullet points - visual styling is important for the reader.
+                              Write the content for the section : {{section_title}}
+                              """

research_manager.py ADDED Viewed

	@@ -0,0 +1,475 @@

+import ui
+from typing import List
+from pydantic import BaseModel, Field
+import time
+import gradio as gr
+from llm_config import call_llm, get_llm_usage
+import prompts
+from colorama import Fore, Style
+# Add these imports at the top
+from search import fetch_search_results
+from improve_content import ImproveContent
+import re
+# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# logger = logging.getLogger(__name__)
+class Section(BaseModel):
+    name: str = Field(
+        description="Name for this section of the report.",
+    )
+    description: str = Field(
+        description="Brief overview of the main topics and concepts to be covered in this section.",
+    )
+    questions: List[str] = Field(
+        description="Key Questions to answer in this section."
+    )
+    content: str = Field(
+        description="The content of the section."
+    )
+class Sections(BaseModel):
+    sections: List[Section] = Field(
+        description="Sections of the report.",
+    )
+    @property
+    def as_str(self) -> str:
+        subsections = "\n\n".join(
+            f"## {section.name}\n\n-{section.description}\n\n- Questions: {'\n\n'.join(section.questions)}\n\n- Content: {section.content}\n"
+            for section in self.sections or []
+        )
+        return subsections
+    def print_sections(self) -> str:
+        return '\n\n'.join([s.content for s in self.sections])
+class ResearchArea(BaseModel):
+    area : str = Field(..., title="Research Area")
+    search_terms : str = Field(..., title = "Search Term", description =  "Search query that will help you find information")
+class ResearchFocus(BaseModel):
+    areas : List[ResearchArea] = Field(..., title="Research Areas")
+class RelevantSearchResults(BaseModel):
+    relevant_search_results : List[int] = Field(..., title="Relevant Search Results", description="The position of the search result in the search results list")
+    reasoning : List[str] = Field(..., title="Reasoning", description="Reasoning for selecting the search results")
+class SearchTerm(BaseModel):
+    query : str = Field(..., title="Search Query")
+    #time_range : str = Field(..., title="Time Range", description="d/w/m/y/none")
+class SearchTermsList(BaseModel):
+    queries : List[str] = Field(..., title="Search Terms as a list")
+class Editor(BaseModel):
+    name: str = Field(
+        description="Name of the editor.",
+    )
+    affiliation: str = Field(
+        description="Primary affiliation of the editor.",
+    )
+    role: str = Field(
+        description="Role of the editor in the context of the topic.",
+    )
+    focus: str = Field(
+        description="Description of the editor's focus area, concerns and how they will help.",
+    )
+    @property
+    def persona(self) -> str:
+        return f"\nRole: {self.role}\nAffiliation: {self.affiliation}\nDescription: {self.focus}\n"
+class Perspectives(BaseModel):
+    editors: List[Editor] = Field(
+        description="Comprehensive list of editors with their roles and affiliations.",
+    )
+class ReportSynopsis(BaseModel):
+    synopsis: str= Field(..., title="Report Synopsis", description="A synopsis talking about what the reader can expect")
+class SectionContent(BaseModel):
+    content: str = Field(..., title="Section Content", description="The content of the section")
+class ResearchManager:
+    """Manages the research process including analysis, search, and documentation"""
+    def __init__(self, research_task):
+        self.use_existing_outline = True
+        self.research_task = research_task
+        self.report_synopsis = ''
+        self.personas = ''
+        self.gradio_report_outline = ''
+        self.task_status = {
+            'synopsis_draft' : {"name": "Creating synopsis of the report...", "status": "pending"},
+            'gathering_info' : {"name": "Gathering Info on the topic...", "status": "pending"},
+            'running_searches' : {"name": "Run search...", "status": "pending"},
+            'mock_discussion' : {"name": "Conducting mock discussions...", "status": "pending"},
+            'generating_outline': {"name": "Generating a draft outline...", "status": "pending"},
+        }
+    def extract_citation_info(self,text):
+        """
+        Extract citation number and URL from citation text
+        """
+        references = {}
+        for ref in text:
+            # Find citation number
+            citation_match = re.search(r'\[(\d+)\]', ref)
+            citation_number = citation_match.group(1) if citation_match else None
+            # Find URL
+            url_match = re.search(r'URL: (https?://\S+)', ref)
+            url = url_match.group(1) if url_match else None
+            references[citation_number] = {
+                'url': url,
+                'reference_text': ref
+            }
+        return references
+    def section_writer(self, section: Section):
+        """Research and write one section, yielding UI updates along the way.
+
+        Steps, in order: run the ImproveContent interview for this section,
+        ask it for the gathered content and references, have the LLM write the
+        section, then post-process the citations in the written text.
+
+        This is a generator: it yields whatever `update_gradio` yields and
+        finally returns the same `section` object with `content` filled in.
+        """
+        improve_content = ImproveContent(section.name,
+                                         section.description,
+                                         section.questions,
+                                         self.personas.editors
+                                         )
+        # The interview's return value is stored but not used further in this method.
+        improved_content = yield from improve_content.create_and_run_interview(self.task_status, self.update_gradio)
+        # NOTE(review): `content` is joined with '\n\n' below and `references` is
+        # split on '\n\n', so they are presumably a list of strings and a single
+        # string respectively -- confirm against ImproveContent.
+        content, references = improve_content.generate_final_section(self.report_synopsis)
+        self.task_status[section.name]["name"] = "Writing Section: " + section.name
+        yield from self.update_gradio()
+        ui.system_update(f"Writing Section: {section.name}")
+        section_content = call_llm(
+                instructions=prompts.WRITE_SECTION_INSTRUCTIONS,
+                model_type='slow',
+                context={"section_description": section.description,
+                         "gathered_info" : '\n\n'.join(content),
+                        "topic": self.research_task['topic'],
+                        "section_title" : section.name,
+                        "synopsis" : self.report_synopsis,
+                        "section_questions" : '\n'.join(section.questions),
+                        'report_type': self.research_task['report_type'],
+                        'section_length': self.research_task['section_length']},
+                response_model=SectionContent,
+                logging_fn='write_section_instructions'
+            )
+        #references = '\n\n'.join(references)
+        # Map citation number (string) -> {'url', 'reference_text'}.
+        references_dict = self.extract_citation_info(references.split('\n\n'))
+        #Replacing citations with [2,3,4] format with [2][3][4]
+        cited_references_raw = re.findall(r'\[(\d+(?:,\s*\d+)*)\]', section_content.content)
+        for group in cited_references_raw:
+            nums_list = group.split(',')
+            new_string = ''.join(f'[{n.strip()}]' for n in nums_list)
+            old_string = f'[{group}]'
+            section_content.content = section_content.content.replace(old_string, new_string)
+        # Flatten the captured groups into individual citation numbers.
+        parsed_cited_references = []
+        for ref_group in cited_references_raw:
+            for ref_no in ref_group.split(','):
+                parsed_cited_references.append(ref_no.strip())
+        used_references = {}
+        # Despite the name, this collects cited numbers that have no matching reference.
+        uncited_sources= []
+        for reference_no in parsed_cited_references:
+            reference = references_dict.get(reference_no)
+            if reference:
+                used_references[reference_no] = reference
+            else:
+                print(f"Reference {reference_no} not found")
+                uncited_sources.append(reference_no)
+                # Mark citations that cannot be resolved instead of leaving a dangling number.
+                section_content.content = section_content.content.replace(f"[{reference_no}]", "[!]")
+        # Turn each resolved citation into a markdown link; used_references is a
+        # dict, so every number is rewritten once.
+        for ref_no, data in used_references.items():
+            if data["url"]:
+                section_content.content = section_content.content.replace(f"[{ref_no}]", f"[[{ref_no}]]({data['url']})")
+        section.content = section_content.content
+        print(section_content.content)
+        self.task_status[section.name]["status"] = "done"
+        # self.report_outline is assigned in start_research before this method is called.
+        yield from self.update_gradio(report_outline_str=self.report_outline.print_sections(), button_disable=False)
+        ui.system_update("Waiting for 5 seconds before next section")
+        # Blocking pause between sections.
+        time.sleep(5)
+        return section
+    def _generate_report_outline(self):
+        """Use LLM to generate focus areas for research based on the original query"""
+        ui.system_update(f"\nGathering Context..")
+        self.task_status['gathering_info']["status"] = "running"
+        yield from self.update_gradio()
+        queries = call_llm(
+                instructions=prompts.FIND_SEARCH_TERMS_INSTRUCTIONS,
+                model_type='fast',
+                context={
+                    "report_type": self.research_task['report_type'],
+                    "original_query": self.research_task['topic'],
+                    "report_synopsis": self.report_synopsis,
+                },
+                response_model=SearchTermsList,
+                logging_fn='find_search_terms_instructions'
+            )
+        self.task_status['running_searches']["status"] = "running"
+        yield from self.update_gradio()
+        formatted_results, results = yield from fetch_search_results(query=queries.queries,
+                                                                     task_status=self.task_status,
+                                                                     task_name = 'running_searches',
+                                                                    fn = self.update_gradio)
+        self.context = formatted_results
+        self.task_status['running_searches']["status"] = "done"
+        self.task_status['gathering_info']["status"] = "done"
+        self.task_status['mock_discussion']["status"] = "running"
+        yield from self.update_gradio()
+        personas = call_llm(
+                instructions=prompts.GENERATE_ROUNDTABLE_PERSONAS_INSTRUCTIONS,
+                model_type='slow',
+                context={"context": self.context,
+                        "topic": self.research_task['topic'],
+                        "report_synopsis": self.report_synopsis,
+                        'type_of_report': self.research_task['report_type'],
+                        'num_personas': 5},
+                response_model=Perspectives,
+                logging_fn='generate_roundtable_personas_instructions'
+            )
+        self.task_status['mock_discussion']["name"] = "Started discussions..."
+        print(personas)
+        yield from self.update_gradio()
+        improve_content = ImproveContent(self.research_task['topic'],
+                                         "This section will focus on a comprehensive overview of glean",
+                                         self.research_task['key_questions'],
+                                         personas.editors)
+        warm_start_discussion = improve_content.warm_start_discussion()
+        self.task_status['mock_discussion']["name"] = "Mock discussions complete"
+        self.task_status['mock_discussion']["status"] = "done"
+        self.task_status['generating_outline']["status"] = "running"
+        yield from self.update_gradio()
+        ui.system_update("\nGenerating Report Outline..")
+        report_outline = call_llm(
+                        instructions=prompts.GENERATE_REPORT_OUTLINE_INSTRUCTIONS,
+                        model_type='slow',
+                        context={
+                            "report_type": self.research_task['report_type'],
+                            "topic": self.research_task['topic'],
+                            "context": self.context,
+                            "discussion": '\n'.join(warm_start_discussion),
+                            'num_sections': 3
+                        },
+                        response_model=Sections,
+                        logging_fn='generate_report_outline_instructions'
+                    )
+        self.task_status['generating_outline']["status"] = "done"
+        yield from self.update_gradio(report_outline_str=report_outline.as_str)
+        print(report_outline.as_str)
+        return report_outline
+    def validate_outline_with_human(self, report_outline: Sections) -> Sections:
+        """Ask the human for feedback and improve the report outline until they say 'OK'.
+
+        Loops forever: reads multi-line feedback from the terminal, returns the
+        current outline when the feedback (case-insensitively) equals 'ok', and
+        otherwise asks the model for a revised outline and prints it.
+
+        NOTE(review): this method uses `self.llm`, which is not assigned in
+        `__init__`, and pipes a prompt into `with_structured_output`, unlike the
+        `call_llm` helper used by the other methods. The only call site visible
+        in this file is commented out. As written, the first non-'ok' feedback
+        would likely raise AttributeError -- confirm and port to `call_llm`.
+        """
+        while True:
+            ui.system_update("\nPlease provide feedback on the generated report outline")
+            feedback = ui.get_multiline_input()
+            # Exact match after lowercasing; surrounding whitespace is not stripped.
+            if feedback.lower() == 'ok':
+                return report_outline
+            ui.system_update("\nImproving the report outline based on your feedback")
+            extract_sections_chain = prompts.IMPROVE_REPORT_OUTLINE_PROMPT | self.llm.with_structured_output(Sections)
+            report_outline = extract_sections_chain.invoke({"topic": self.research_task['topic'], "feedback": feedback, "report_outline": report_outline.as_str})
+            ui.system_output(report_outline.as_str)
+    def create_report_synopsis(self):
+        return call_llm(
+                instructions=prompts.CREATE_SYNOPSIS_INSTRUCTIONS,
+                model_type='fast',
+                context={
+                    "report_type": self.research_task['report_type'],
+                    "topic": self.research_task['topic'],
+                    "key_questions": self.research_task['key_questions'],
+                },
+                response_model=ReportSynopsis,
+                logging_fn='create_synopsis_instructions'
+        )
+    def update_gradio(self, report_outline_str = '', button_disable = False):
+        if report_outline_str != '':
+            self.gradio_report_outline = report_outline_str
+        yield [gr.update(interactive=button_disable), self.update_ui(), self.gradio_report_outline]
+    def start_research(self):
+        """Run the whole research pipeline, yielding UI updates as it goes.
+
+        Order of work: create the synopsis, generate the report outline,
+        register one progress entry per outline section, generate the writer
+        personas, then write every section in outline order and finally print
+        each section's content.
+
+        This is a generator; every yielded value comes from `update_gradio`.
+        """
+        self.task_status['synopsis_draft']["status"] = "running"
+        yield from self.update_gradio()
+        ui.system_update(f"Starting research on: {self.research_task['topic']}")
+        ui.system_update("\nGenerating report outline")
+        # NOTE(review): this stores whatever create_report_synopsis returns
+        # (requested as a ReportSynopsis model), not a plain string like the ''
+        # set in __init__ -- confirm the prompt rendering handles that.
+        self.report_synopsis = self.create_report_synopsis()
+        self.task_status['synopsis_draft']["status"] = "done"
+        yield from self.update_gradio()
+        self.report_outline = yield from self._generate_report_outline()
+        #self.report_outline = self.validate_outline_with_human(self.report_outline)
+        # Add one progress entry per section, keyed by the section name.
+        for section in self.report_outline.sections:
+            self.task_status[section.name] = {"name": f"Starting Section: {section.name}", "status": "pending"}
+        yield from self.update_gradio()
+        ui.system_update("\nGenerating personas for writing sections")
+        self.personas = call_llm(
+                instructions=prompts.GENERATE_PERSONAS_INSTRUCTIONS,
+                model_type='slow',
+                context={
+                        "topic": self.research_task['topic'],
+                        "report_synopsis": self.report_synopsis,
+                        'type_of_report': self.research_task['report_type'],
+                        'num_personas': 2},
+                response_model=Perspectives,
+                logging_fn='generate_personas_instructions'
+            )
+        ui.system_update("\nWriting Sections....")
+        for section in self.report_outline.sections:
+                self.task_status[section.name]["status"] = "running"
+                yield from self.update_gradio()
+                ui.system_sub_update(f"\nWriting Section: {section.name}")
+                # section_writer fills in section.content and returns the same object.
+                section = yield from self.section_writer(section)
+        for section in self.report_outline.sections:
+            print(section.content)
+    def update_ui(self):
+        completed_tasks = sum(1 for _, task in self.task_status.items() if task["status"] == "done")
+        total_tasks = len(self.task_status)
+        progress_percentage = int((completed_tasks / total_tasks) * 100)
+        html_output = f"""
+        <style>
+        .progress-bar-container {{
+          width: 100%;
+          background-color: #f3f3f3;
+          border-radius: 5px;
+          overflow: hidden;
+          margin-bottom: 20px;
+        }}
+        .progress-bar {{
+          height: 20px;
+          width: {progress_percentage}%;
+          background-color: #3498db;
+          transition: width 0.3s;
+          display: flex;
+          align-items: center;
+          justify-content: center;
+          color: white;
+          font-weight: bold;
+          font-size: 12px;
+        }}
+        .progress-task {{
+          display: flex;
+          align-items: center;
+          gap: 10px;
+          font-family: 'Helvetica Neue', Arial, sans-serif;
+          margin: 5px 0;
+          font-size: 14px;
+          font-weight: 500;
+          color: #333;
+        }}
+        .progress-task .task-name {{
+          flex-grow: 1;
+        }}
+        .progress-task .icon {{
+          width: 20px;
+          height: 20px;
+        }}
+        .loading-circle {{
+          width: 15px;
+          height: 15px;
+          border: 3px solid #ccc;
+          border-top: 3px solid #3498db;
+          border-radius: 50%;
+          animation: spin 1s linear infinite;
+        }}
+        @keyframes spin {{
+          0% {{ transform: rotate(0deg); }}
+          100% {{ transform: rotate(360deg); }}
+        }}
+        .done-icon {{
+          color: #2ecc71;
+          font-size: 16px;
+        }}
+        .checkbox {{
+          width: 15px;
+          height: 15px;
+          border: 1px solid #ccc;
+          display: inline-block;
+          margin-right: 10px;
+        }}
+        .milestone {{
+          display: inline-block;
+          width: 10px;
+          height: 10px;
+          background-color: #ccc;
+          border-radius: 50%;
+          margin: 0 5px;
+        }}
+        .milestone.completed {{
+          background-color: #2ecc71;
+        }}
+        </style>
+        <div class='progress-bar-container'>
+          <div class='progress-bar'>{progress_percentage}%</div>
+        </div>
+        <div style='display: flex; justify-content: center; margin-bottom: 20px;'>
+        {''.join([f"<div class='milestone {'completed' if i < completed_tasks else ''}'></div>" for i in range(total_tasks)])}
+        </div>
+        """
+        for _, task in self.task_status.items():
+            if task["status"] == "running":
+                icon = "<div class='loading-circle'></div>"
+            elif task["status"] == "done":
+                icon = "<span class='done-icon'>&#10003;</span>"
+            else:
+                icon = "<div class='checkbox'></div>"
+            html_output += f"<div class='progress-task'><span class='icon'>{icon}</span><span class='task-name'>{task['name']}</span></div>"
+        return html_output

search.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from tavily import TavilyClient
+import ui
+from typing import List, Dict
+import os
+from dotenv import load_dotenv
+# Load variables from a local .env file (if present) before the key is read.
+load_dotenv()
+# Module-level Tavily client; the API key is read from the TAVILY_API_KEY
+# environment variable at import time.
+tavily_search = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
+def format_search_results(results: List[Dict]) -> str:
+    formatted_results = []
+    for result in results:
+        formatted_result = ''
+        formatted_result = f"[{result.get('number', '')}] Title: {result.get('title', 'N/A')}\n"
+        formatted_result += f"   Snippet: {result.get('body', '')}\n"
+        formatted_result += f"   URL: {result.get('href', 'N/A')}\n\n"
+        formatted_results.append(formatted_result)
+    return "\n".join(formatted_results)
+def fetch_search_results(query: List[str], task_status, task_name, fn):
+    results = []
+    for q in query:
+        task_status[task_name]["name"] = "Searching for: " + q
+        yield from fn()
+        ui.system_sub_update(f"Searching for: {q}")
+        try:
+            result = tavily_search.search(q)
+            result = result["results"][:3]
+            result = [{'title': res['title'], 'body': res['content'], 'href': res['url']} for res in result]
+            results.extend(result)
+        except:
+            result = tavily_search.search(q)
+            result = result["results"][:3]
+            result = [{'title': res['title'], 'body': res['content'], 'href': res['url']} for res in result]
+            results.extend(result)
+    results = [{'number' : i+1, **result } for i, result in enumerate(results)]
+    formatted_results = format_search_results(results)
+    return formatted_results, results
+if __name__ == "__main__":
+    fetch_search_results(['gitlab', 'github'])

ui.py ADDED Viewed

	@@ -0,0 +1,74 @@

+from colorama import Style, Fore
+import sys
+import gradio as gr
+import tty
+import termios
+def generate_progress_html(percent):
+    return f"""
+    <div class="progress-bar" style="width: 100%; background-color: #e0e0e0; border-radius: 10px; overflow: hidden; position: relative;">
+        <div class="progress-bar-inner" style="height: 20px; background: linear-gradient(90deg, #76c7c0, #4ca1af); width: {percent}%; border-radius: 10px; transition: width 0.3s ease;">
+            <div class="progress-text" style="position: absolute; width: 100%; top: 0; left: 0; height: 100%; display: flex; align-items: center; justify-content: center; color: #fff; font-weight: bold;">{percent}%</div>
+        </div>
+    </div>
+    """
+def get_char():
+    """Read a single character from standard input without echo."""
+    fd = sys.stdin.fileno()
+    old_settings = termios.tcgetattr(fd)
+    try:
+        tty.setraw(sys.stdin.fileno())
+        char = sys.stdin.read(1)
+    finally:
+        termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
+    return char
+def get_multiline_input():
+    """Ask the human user_input till they press Ctrl+B."""
+    print(Fore.BLUE + 'Enter your multiline text and press Ctrl+B when you are done:')
+    user_input = []
+    current_line = []
+    line_num = 1
+    print(f"{line_num}> ", end='', flush=True)
+    while True:
+        char = get_char()
+        if ord(char) == 2:  # Ctrl+B
+            break
+        elif char == '\r' or char == '\n':  # Enter key
+            line = ''.join(current_line)
+            user_input.append(line)
+            current_line = []
+            line_num += 1
+            print(f"\n{line_num}> ", end='', flush=True)
+        elif char == '\x7f':  # Backspace
+            if current_line:
+                current_line.pop()
+                # Move cursor back, overwrite with space, move back again
+                print('\b \b', end='', flush=True)
+        else:
+            current_line.append(char)
+            print(char, end='', flush=True)
+    print('\n\n')
+    return '\n'.join(user_input)
+def system_update(update_message):
+    print(Style.RESET_ALL+ Fore.YELLOW  + f"{update_message}")
+def system_sub_update(update_message):
+    print(Style.RESET_ALL+ Fore.CYAN  + f"{update_message}")
+def system_output(update_message):
+    print(Style.RESET_ALL+ Fore.GREEN +  + f"{update_message}")
+def system_error(update_message):
+    print(Style.RESET_ALL + Fore.RED + f"{update_message}")