import ui from typing import List from pydantic import BaseModel, Field import time import gradio as gr from llm_config import call_llm, get_llm_usage import prompts from colorama import Fore, Style # Add these imports at the top from search import fetch_search_results from improve_content import ImproveContent import re # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') # logger = logging.getLogger(__name__) class Section(BaseModel): name: str = Field( description="Name for this section of the report.", ) description: str = Field( description="Brief overview of the main topics and concepts to be covered in this section.", ) questions: List[str] = Field( description="Key Questions to answer in this section." ) content: str = Field( description="The content of the section." ) class Sections(BaseModel): sections: List[Section] = Field( description="Sections of the report.", ) @property def as_str(self) -> str: subsections = "\n\n".join( f"## {section.name}\n\n-{section.description}\n\n- Questions: {'\n\n'.join(section.questions)}\n\n- Content: {section.content}\n" for section in self.sections or [] ) return subsections def print_sections(self) -> str: return '\n\n'.join([s.content for s in self.sections]) class ResearchArea(BaseModel): area : str = Field(..., title="Research Area") search_terms : str = Field(..., title = "Search Term", description = "Search query that will help you find information") class ResearchFocus(BaseModel): areas : List[ResearchArea] = Field(..., title="Research Areas") class RelevantSearchResults(BaseModel): relevant_search_results : List[int] = Field(..., title="Relevant Search Results", description="The position of the search result in the search results list") reasoning : List[str] = Field(..., title="Reasoning", description="Reasoning for selecting the search results") class SearchTerm(BaseModel): query : str = Field(..., title="Search Query") #time_range : str = Field(..., title="Time Range", description="d/w/m/y/none") class SearchTermsList(BaseModel): queries : List[str] = Field(..., title="Search Terms as a list") class Editor(BaseModel): name: str = Field( description="Name of the editor.", ) affiliation: str = Field( description="Primary affiliation of the editor.", ) role: str = Field( description="Role of the editor in the context of the topic.", ) focus: str = Field( description="Description of the editor's focus area, concerns and how they will help.", ) @property def persona(self) -> str: return f"\nRole: {self.role}\nAffiliation: {self.affiliation}\nDescription: {self.focus}\n" class Perspectives(BaseModel): editors: List[Editor] = Field( description="Comprehensive list of editors with their roles and affiliations.", ) class ReportSynopsis(BaseModel): synopsis: str= Field(..., title="Report Synopsis", description="A synopsis talking about what the reader can expect") class SectionContent(BaseModel): content: str = Field(..., title="Section Content", description="The content of the section") class ResearchManager: """Manages the research process including analysis, search, and documentation""" def __init__(self, research_task): self.use_existing_outline = True self.research_task = research_task self.report_synopsis = '' self.personas = '' self.gradio_report_outline = '' self.task_status = { 'synopsis_draft' : {"name": "Creating synopsis of the report...", "status": "pending"}, 'gathering_info' : {"name": "Gathering Info on the topic...", "status": "pending"}, 'running_searches' : {"name": "Run search...", "status": "pending"}, 'mock_discussion' : {"name": "Conducting mock discussions...", "status": "pending"}, 'generating_outline': {"name": "Generating a draft outline...", "status": "pending"}, } def extract_citation_info(self,text): """ Extract citation number and URL from citation text """ references = {} for ref in text: # Find citation number citation_match = re.search(r'\[(\d+)\]', ref) citation_number = citation_match.group(1) if citation_match else None # Find URL url_match = re.search(r'URL: (https?://\S+)', ref) url = url_match.group(1) if url_match else None references[citation_number] = { 'url': url, 'reference_text': ref } return references def section_writer(self, section: Section): """Given an outline of a section, generate search queries, perform searches and generate the section content""" improve_content = ImproveContent(section.name, section.description, section.questions, self.personas.editors ) improved_content = yield from improve_content.create_and_run_interview(self.task_status, self.update_gradio) content, references = improve_content.generate_final_section(self.report_synopsis) self.task_status[section.name]["name"] = "Writing Section: " + section.name yield from self.update_gradio() ui.system_update(f"Writing Section: {section.name}") section_content = call_llm( instructions=prompts.WRITE_SECTION_INSTRUCTIONS, model_type='slow', context={"section_description": section.description, "gathered_info" : '\n\n'.join(content), "topic": self.research_task['topic'], "section_title" : section.name, "synopsis" : self.report_synopsis, "section_questions" : '\n'.join(section.questions), 'report_type': self.research_task['report_type'], 'section_length': self.research_task['section_length']}, response_model=SectionContent, logging_fn='write_section_instructions' ) #references = '\n\n'.join(references) references_dict = self.extract_citation_info(references.split('\n\n')) #Replacing citations with [2,3,4] format with [2][3][4] cited_references_raw = re.findall(r'\[(\d+(?:,\s*\d+)*)\]', section_content.content) for group in cited_references_raw: nums_list = group.split(',') new_string = ''.join(f'[{n.strip()}]' for n in nums_list) old_string = f'[{group}]' section_content.content = section_content.content.replace(old_string, new_string) parsed_cited_references = [] for ref_group in cited_references_raw: for ref_no in ref_group.split(','): parsed_cited_references.append(ref_no.strip()) used_references = {} uncited_sources= [] for reference_no in parsed_cited_references: reference = references_dict.get(reference_no) if reference: used_references[reference_no] = reference else: print(f"Reference {reference_no} not found") uncited_sources.append(reference_no) section_content.content = section_content.content.replace(f"[{reference_no}]", "[!]") for ref_no, data in used_references.items(): if data["url"]: section_content.content = section_content.content.replace(f"[{ref_no}]", f"[[{ref_no}]]({data['url']})") section.content = section_content.content print(section_content.content) self.task_status[section.name]["status"] = "done" yield from self.update_gradio(report_outline_str=self.report_outline.print_sections(), button_disable=False) ui.system_update("Waiting for 5 seconds before next section") time.sleep(5) return section def _generate_report_outline(self): """Use LLM to generate focus areas for research based on the original query""" ui.system_update(f"\nGathering Context..") self.task_status['gathering_info']["status"] = "running" yield from self.update_gradio() queries = call_llm( instructions=prompts.FIND_SEARCH_TERMS_INSTRUCTIONS, model_type='fast', context={ "report_type": self.research_task['report_type'], "original_query": self.research_task['topic'], "report_synopsis": self.report_synopsis, }, response_model=SearchTermsList, logging_fn='find_search_terms_instructions' ) self.task_status['running_searches']["status"] = "running" yield from self.update_gradio() formatted_results, results = yield from fetch_search_results(query=queries.queries, task_status=self.task_status, task_name = 'running_searches', fn = self.update_gradio) self.context = formatted_results self.task_status['running_searches']["status"] = "done" self.task_status['gathering_info']["status"] = "done" self.task_status['mock_discussion']["status"] = "running" yield from self.update_gradio() personas = call_llm( instructions=prompts.GENERATE_ROUNDTABLE_PERSONAS_INSTRUCTIONS, model_type='slow', context={"context": self.context, "topic": self.research_task['topic'], "report_synopsis": self.report_synopsis, 'type_of_report': self.research_task['report_type'], 'num_personas': 5}, response_model=Perspectives, logging_fn='generate_roundtable_personas_instructions' ) self.task_status['mock_discussion']["name"] = "Started discussions..." print(personas) yield from self.update_gradio() improve_content = ImproveContent(self.research_task['topic'], "This section will focus on a comprehensive overview of glean", self.research_task['key_questions'], personas.editors) warm_start_discussion = improve_content.warm_start_discussion() self.task_status['mock_discussion']["name"] = "Mock discussions complete" self.task_status['mock_discussion']["status"] = "done" self.task_status['generating_outline']["status"] = "running" yield from self.update_gradio() ui.system_update("\nGenerating Report Outline..") report_outline = call_llm( instructions=prompts.GENERATE_REPORT_OUTLINE_INSTRUCTIONS, model_type='slow', context={ "report_type": self.research_task['report_type'], "topic": self.research_task['topic'], "context": self.context, "discussion": '\n'.join(warm_start_discussion), 'num_sections': 3 }, response_model=Sections, logging_fn='generate_report_outline_instructions' ) self.task_status['generating_outline']["status"] = "done" yield from self.update_gradio(report_outline_str=report_outline.as_str) print(report_outline.as_str) return report_outline def validate_outline_with_human(self, report_outline: Sections) -> Sections: """Ask the human feedback and improve the report outline till they say 'OK' """ while True: ui.system_update("\nPlease provide feedback on the generated report outline") feedback = ui.get_multiline_input() if feedback.lower() == 'ok': return report_outline ui.system_update("\nImproving the report outline based on your feedback") extract_sections_chain = prompts.IMPROVE_REPORT_OUTLINE_PROMPT | self.llm.with_structured_output(Sections) report_outline = extract_sections_chain.invoke({"topic": self.research_task['topic'], "feedback": feedback, "report_outline": report_outline.as_str}) ui.system_output(report_outline.as_str) def create_report_synopsis(self): return call_llm( instructions=prompts.CREATE_SYNOPSIS_INSTRUCTIONS, model_type='fast', context={ "report_type": self.research_task['report_type'], "topic": self.research_task['topic'], "key_questions": self.research_task['key_questions'], }, response_model=ReportSynopsis, logging_fn='create_synopsis_instructions' ) def update_gradio(self, report_outline_str = '', button_disable = False): if report_outline_str != '': self.gradio_report_outline = report_outline_str yield [gr.update(interactive=button_disable), self.update_ui(), self.gradio_report_outline] def start_research(self): """Main research loop with comprehensive functionality""" self.task_status['synopsis_draft']["status"] = "running" yield from self.update_gradio() ui.system_update(f"Starting research on: {self.research_task['topic']}") ui.system_update("\nGenerating report outline") self.report_synopsis = self.create_report_synopsis() self.task_status['synopsis_draft']["status"] = "done" yield from self.update_gradio() self.report_outline = yield from self._generate_report_outline() #self.report_outline = self.validate_outline_with_human(self.report_outline) for section in self.report_outline.sections: self.task_status[section.name] = {"name": f"Starting Section: {section.name}", "status": "pending"} yield from self.update_gradio() ui.system_update("\nGenerating personas for writing sections") self.personas = call_llm( instructions=prompts.GENERATE_PERSONAS_INSTRUCTIONS, model_type='slow', context={ "topic": self.research_task['topic'], "report_synopsis": self.report_synopsis, 'type_of_report': self.research_task['report_type'], 'num_personas': 2}, response_model=Perspectives, logging_fn='generate_personas_instructions' ) ui.system_update("\nWriting Sections....") for section in self.report_outline.sections: self.task_status[section.name]["status"] = "running" yield from self.update_gradio() ui.system_sub_update(f"\nWriting Section: {section.name}") section = yield from self.section_writer(section) for section in self.report_outline.sections: print(section.content) def update_ui(self): completed_tasks = sum(1 for _, task in self.task_status.items() if task["status"] == "done") total_tasks = len(self.task_status) progress_percentage = int((completed_tasks / total_tasks) * 100) html_output = f"""