anirudhs committed on
Commit
8fd59af
·
1 Parent(s): dab3f26

added researcher files

Browse files
Files changed (7) hide show
  1. app.py +72 -4
  2. improve_content.py +233 -0
  3. llm_config.py +146 -0
  4. prompts.py +282 -0
  5. research_manager.py +475 -0
  6. search.py +45 -0
  7. ui.py +74 -0
app.py CHANGED
@@ -1,11 +1,79 @@
 
 
1
  import gradio as gr
 
2
 
 
3
 
4
- def greet(name):
5
- return "Hello " + name + "!"
 
 
 
 
 
 
 
 
 
 
6
 
 
 
 
7
 
8
- demo = gr.Interface(fn=greet, inputs="textbox", outputs="textbox")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  if __name__ == "__main__":
11
- demo.launch()
 
1
+ from colorama import init, Fore, Style
2
+ from research_manager import ResearchManager
3
  import gradio as gr
4
+ import os
5
 
6
def run_research(topic_input, questions_input, section_length, groq_key):
    """Run a research job and stream its updates to the Gradio UI.

    Builds the research task dictionary from the form values, publishes the
    supplied API key through the ``GEMINI_API_KEY`` environment variable and
    re-yields everything ``ResearchManager.start_research()`` yields.

    Args:
        topic_input: Research topic text.
        questions_input: Key questions as one comma-separated string.
        section_length: Target section length in words.
        groq_key: API key typed into the UI (stored as ``GEMINI_API_KEY``).

    Yields:
        Whatever ``ResearchManager.start_research()`` yields.
    """
    # "a, b,," must become ["a", "b"]: a bare split(',') keeps the leading
    # spaces and produces empty questions for stray commas.
    key_questions = [
        question.strip()
        for question in questions_input.split(',')
        if question.strip()
    ]

    research_task = {
        "topic": topic_input,
        "key_questions": key_questions,
        "report_type": "market study",
        'section_length': section_length,
        "guidelines": [
            "The report MUST fully answer all the questions",
            "The report MUST only contain information that can be cited from a URL content",
            "The report DOES NOT contain unverified information and contains only facts",
        ],
    }
    os.environ['GEMINI_API_KEY'] = groq_key

    # Initialize Research Manager
    research_manager = ResearchManager(research_task)
    yield from research_manager.start_research()
24
 
25
+
26
+
27
def main():
    """Build the Gradio research UI and launch it.

    The left column holds the topic, key questions, section-length slider,
    API key field, start button and progress area; the right column holds the
    scrollable report view. Clicking the start button streams ``run_research``
    into the button, the progress area and the report view.
    """
    report_css = """
        .report-container {
            height: 90vh;
            overflow-y: auto;
            border: 1px solid #ddd;
            border-radius: 4px;
            padding: 1rem;
        }
    """
    with gr.Blocks(css=report_css) as demo:
        with gr.Row():
            with gr.Column():
                topic_input = gr.Textbox(
                    label="Research Topic",
                    placeholder="Enter your research topic...",
                    value="How is MS copilot performing in the enterprise search market ?"
                )
                questions_input = gr.Textbox(
                    label="Key Questions (comma-separated)",
                    placeholder="Enter key questions...",
                    value="What are the user reviews?, How is the pricing structure?, how does it compare against glean?"
                )
                section_length = gr.Slider(
                    label="Section Length (words)",
                    minimum=300,
                    maximum=500,
                    step=100,
                    value=300)
                # NOTE(review): the label and link mention Cerebras, but
                # run_research stores this value as GEMINI_API_KEY - confirm
                # which provider the UI should advertise.
                groq_key = gr.Textbox(
                    label="Cerebras API Key",
                    info="#### Get your free Cerebras key from [cloud.cerebras.ai/](https://cloud.cerebras.ai/)",
                    placeholder="Enter your Cerebras key...",
                    value="your_groq_key",
                    # Mask the secret instead of echoing it in clear text.
                    type="password",
                )

                start_btn = gr.Button("Start Research", variant="primary")
                progress_output = gr.HTML(label="Progress Bar")
            with gr.Column():
                with gr.Column(elem_classes="report-container"):
                    report_outline = gr.Markdown(label="Report Outline")

        start_btn.click(
            fn=run_research,
            inputs=[topic_input, questions_input, section_length, groq_key],
            outputs=[start_btn, progress_output, report_outline],
            show_progress="bar"
        )

    demo.queue().launch()
76
+
77
 
78
  if __name__ == "__main__":
79
+ main()
improve_content.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, TypedDict
2
+ from llm_config import get_llm_instructor, call_llm
3
+ from pydantic import BaseModel, Field
4
+ import ui
5
+ import prompts
6
+ from search import fetch_search_results, format_search_results
7
+ import random
8
+ import time
9
+ from dotenv import load_dotenv
10
+ import re
11
+
12
+
13
+ load_dotenv()
14
+
15
+ class RoundtableMessage(BaseModel):
16
+ response: str = Field(..., title="Your response")
17
+ follow_up: str = Field(..., title="Your follow-up question")
18
+ next_persona: str = Field(..., title="Who you are asking the question to")
19
+
20
class ContentState(TypedDict):
    """State carried through one persona interview."""

    # Chat-style messages ({'role': ..., 'content': ...}) exchanged so far.
    previous_messages: List[dict]
    # Most recent answer text.
    content: str
    # Question currently waiting for an answer.
    expert_question: str
    # Number of completed question/answer rounds.
    iteration: int
    # Plain text of every question and answer, in order.
    full_messages: List[str]
    # Accumulated search-result text; the key is read and written as
    # 'references' everywhere, so the field name must match that spelling.
    references: str
27
+
28
+ class Queries(BaseModel):
29
+ queries : List[str] = Field(..., title="List of queries to search for")
30
+
31
+ class PersonaQuestion(BaseModel):
32
+ question: str = Field(..., title="Your question for the expert")
33
+
34
+ class StrucutredAnswer(BaseModel):
35
+ answer_response: str = Field(..., title="The response to the question with citations")
36
+ references_used: List[int] = Field(..., title="The references used to answer the question")
37
+
38
+
39
+
40
+
41
+
42
class ImproveContent:
    """Gather cited material for one report section by interviewing personas.

    For every persona the class alternates between generating a question
    (``expert_question_generator``) and answering it from search results
    (``answer_question``). Citation markers are renumbered with a counter
    shared across all rounds (``num_search_result``) so that numbers stay
    unique within the section.
    """

    def __init__(self, section_topic, section_description, section_key_questions, personas):
        """Store the section description and set the interview parameters.

        Args:
            section_topic: Topic of the section.
            section_description: Description of the section.
            section_key_questions: Questions the section should answer.
            personas: Persona objects; the methods read ``name``, ``role``,
                ``affiliation``, ``focus`` and ``persona`` from them.
        """
        self.section_topic = section_topic
        self.section_description = section_description
        self.section_key_questions = section_key_questions

        self.client = get_llm_instructor()
        # Next free citation number.
        self.num_search_result = 1
        self.num_interview_rounds = 3
        self.personas = personas
        self.warm_start_rounds = 10

    # Define the initial state
    def create_initial_state(self) -> "ContentState":
        """Return a fresh, empty interview state."""
        return {
            "expert_question": "",
            "iteration": 0,
            'previous_messages': [],
            'full_messages': [],
            'references': '',
            'content': '',
        }

    def expert_question_generator(self, persona, state: "ContentState") -> "ContentState":
        """Ask the LLM for the persona's next question and record it in state."""
        response = call_llm(
            instructions=prompts.QUALITY_CHECKER_INSTRUCTIONS,
            additional_messages=state['previous_messages'],
            context={
                "title_description": self.section_description + ":" + self.section_topic,
                "key_questions": self.section_key_questions,
                'persona': persona.persona
            },
            response_model=PersonaQuestion,
            logging_fn="quality_checker"
        )
        ui.system_sub_update("-------------------")
        ui.system_sub_update(f'{persona.name} ({persona.role},{persona.affiliation}):')
        ui.system_sub_update(response.question)
        ui.system_sub_update("-------------------")
        state["expert_question"] = response.question
        state['previous_messages'].append({'role': 'assistant', 'content': response.question})
        state['full_messages'].append(response.question)
        return state

    def _renumber_citations(self, texts, old_numbers):
        """Assign fresh citation numbers and rewrite ``[n]`` markers.

        Every distinct entry of ``old_numbers`` receives the next value of
        ``self.num_search_result``. All texts are then rewritten in a single
        regex pass, so a freshly assigned number can never be mistaken for an
        old number that is still waiting to be replaced.

        Args:
            texts: Strings whose ``[n]`` markers should be rewritten.
            old_numbers: Citation numbers (ints or digit strings) to replace.

        Returns:
            The rewritten strings, in the same order as ``texts``.
        """
        mapping = {}
        for old in dict.fromkeys(str(number) for number in old_numbers):
            mapping[old] = str(self.num_search_result)
            self.num_search_result += 1
        if not mapping:
            return list(texts)

        def swap(match):
            number = match.group(1)
            # Markers that are not being renumbered are left untouched.
            return f"[{mapping.get(number, number)}]"

        return [re.sub(r"\[(\d+)\]", swap, text) for text in texts]

    def replace_references(self, text: str, references_list: List[int]) -> str:
        """Helper method to replace bracketed references with unique numbering."""
        return self._renumber_citations([text], references_list)[0]

    def _create_queries(self, persona, state, logging_fn):
        """Ask the LLM for search queries that address the current question."""
        return call_llm(
            instructions=prompts.IMPROVE_CONTENT_CREATE_QUERY_INSTRUCTIONS,
            model_type='fast',
            context={
                "section_topic": self.section_topic,
                "expert_question": state["expert_question"],
                'persona': persona.persona
            },
            response_model=Queries,
            logging_fn=logging_fn
        )

    def answer_question(self, persona, state: "ContentState"):
        """Answer the pending question from search results (generator).

        Re-yields whatever ``fetch_search_results`` yields and returns the
        updated state, so callers must drive it with ``yield from``.
        Requires ``self.task_status`` and ``self.update_ui_fn``, which are
        set by ``create_and_run_interview``.
        """
        queries = self._create_queries(persona, state, "improve_content_create_query")
        search_results, search_results_list = yield from fetch_search_results(
            queries.queries, self.task_status, self.section_topic, self.update_ui_fn)

        # Nothing found: generate a second set of queries and search once more.
        if not search_results_list:
            queries = self._create_queries(persona, state, "improve_content_create_query_fallback")
            search_results, search_results_list = yield from fetch_search_results(
                queries.queries, self.task_status, self.section_topic, self.update_ui_fn)

        response = call_llm(
            instructions=prompts.IMPORVE_CONTENT_ANSWER_QUERY_INSTRUCTION,
            model_type='rag',
            context={
                "section_topic": self.section_topic,
                "expert_question": state["expert_question"],
                "search_results": search_results,
                'persona': persona.persona
            },
            response_model=StrucutredAnswer,
            logging_fn="improve_content_answer_query"
        )

        state["content"] = response.answer_response

        # Numbers of the search results, taken from "[n] Title:" headers.
        bracketed_refs = re.findall(r'\[(\d+)\](?=\s*Title:)', search_results)

        # Replace citations [2,3,4] with [2][3][4]
        answer = re.sub(
            r'\[(\d+(?:,\s*\d+)*)\]',
            lambda match: ''.join(f'[{n.strip()}]' for n in match.group(1).split(',')),
            response.answer_response,
        )

        # Give every search result a section-wide unique number, consistently
        # in both the reference text and the answer.
        search_results, answer = self._renumber_citations(
            [search_results, answer], bracketed_refs)

        ui.system_sub_update("-------------------")
        ui.system_sub_update('Content:')
        ui.system_sub_update(answer)
        ui.system_sub_update("-------------------")
        state['previous_messages'].append({'role': 'user', 'content': answer})
        state['full_messages'].append(answer)
        state['references'] = state['references'] + '\n\n' + search_results
        state["iteration"] += 1

        return state

    def create_and_run_interview(self, task_status, update_ui_fn):
        """Runs an iterative process of generating questions and answers
        until the iteration limit is reached.

        This is a generator: it re-yields the updates produced while
        searching and returns the list of all interview messages.
        """
        self.task_status = task_status
        self.update_ui_fn = update_ui_fn
        discussion_messages = []
        # Keeps generate_final_section usable even when there are no personas.
        self.final_state = self.create_initial_state()
        for persona in self.personas:
            ui.system_update(f"Starting discussion with : {persona.name}: {persona.role}, {persona.affiliation}")
            state = self.create_initial_state()
            # Strict comparison: exactly num_interview_rounds rounds.
            while state["iteration"] < self.num_interview_rounds:
                state = self.expert_question_generator(persona, state)
                state = yield from self.answer_question(persona, state)
            discussion_messages.extend(state['previous_messages'])
            # NOTE(review): only the last persona's state is kept here, so
            # generate_final_section reflects that persona alone - confirm
            # this is intended.
            self.final_state = state
        return discussion_messages

    def generate_final_section(self, synopsis=None):
        """Return ``(joined messages, references)`` from the final state.

        ``synopsis`` is accepted for compatibility and is not used.
        """
        return '\n\n'.join(self.final_state['full_messages']), self.final_state['references']

    def warm_start_discussion(self):
        """Warm start the discussion with existing personas"""
        messages = [f"{self.personas[0].name}: Hi! Let's get started!"]
        selected_persona = random.choice(self.personas)
        for _ in range(self.warm_start_rounds):

            # Only the five most recent messages are sent to the model.
            recent_messages = messages[-5:]

            message = call_llm(
                instructions=prompts.ROUNDTABLE_DISCUSSION_INSTRUCTIONS,
                model_type='fast',
                context={
                    "persona_name": selected_persona.name,
                    "persona_role": selected_persona.role,
                    "persona_affiliation": selected_persona.affiliation,
                    "persona_focus": selected_persona.focus,
                    # The prompt template refers to {{topic}}.
                    "topic": self.section_topic,
                    "personas":
                        "\n\n".join([p.name + '\n' + p.persona for p in self.personas if p != selected_persona]),
                    "discussion": "\n\n".join(recent_messages)
                },
                response_model=RoundtableMessage,
                logging_fn="roundtable_discussion"
            )
            ui.system_sub_update("\n\n" + selected_persona.name + ": " + message.response + '\n' + message.follow_up)
            messages.append(selected_persona.name + ": " + message.response + '\n' + message.follow_up)

            next_persona = next(
                (p for p in self.personas if p.name == message.next_persona), None)
            if next_persona is None:
                # The model named someone who is not at the table; pick
                # another participant instead of failing.
                others = [p for p in self.personas if p is not selected_persona]
                next_persona = random.choice(others or list(self.personas))
            selected_persona = next_persona
            time.sleep(3)
        return messages
222
+
223
+
224
+
225
+ if __name__ == "__main__":
226
+ section_name = 'Glean Search in the Enterprise Search Market'
227
+ section_description = 'Positioning and Competition'
228
+ section_key_questions = ['how is glean positioned in the enterprise search market?', "who are the main competitors in this space?"]
229
+ personas = ['\nRole: Business Analyst\nAffiliation: Enterprise Software Consultant\nDescription: Specializes in helping organizations implement and optimize AI-powered tools for improved productivity and knowledge management. Will analyze Glean and Copilot from a business user perspective.\n']
230
+ improve_content = ImproveContent(section_name, section_description, section_key_questions, personas)
231
+ improved_content = improve_content.create_and_run_interview()
232
+ improve_content.generate_final_section()
233
+ print(improved_content)
llm_config.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from tenacity import wait_exponential, Retrying, stop_after_attempt
3
+ from dotenv import load_dotenv
4
+ import google.generativeai as genai
5
+ from groq import Groq
6
+ import instructor
7
+ from openai import OpenAI
8
+ from cerebras.cloud.sdk import Cerebras
9
+ from limits import storage, strategies, parse
10
+ from typing import List, TypedDict, Union, Annotated, Dict, Any, Tuple
11
+ import time
12
+ from instructor.exceptions import InstructorRetryException
13
+
14
+ memory_storage = storage.MemoryStorage()
15
+ moving_window = strategies.MovingWindowRateLimiter(memory_storage)
16
+ rate_limit = parse("10/minute")
17
+
18
+
19
+ MODEL = 'gemini-1.5-flash-latest'
20
+ MODEL_FAST = 'gemini-1.5-flash-latest'
21
+ MODEL_RAG = 'gemini-1.5-flash-latest'
22
+
23
+ # Global variable to track LLM usage
24
+ _LLM_USAGE = { MODEL: {"input_tokens": 0, "output_tokens": 0},
25
+ MODEL_FAST: {"input_tokens": 0, "output_tokens": 0},
26
+ MODEL_RAG: {"input_tokens": 0, "output_tokens": 0}}
27
+ _LLM_USAGE_SPLIT = []
28
+
29
def get_llm_usage():
    """Print the usage counters and return them with per-function totals.

    Returns:
        A tuple ``(_LLM_USAGE, _LLM_USAGE_SPLIT, function_totals)`` where
        ``function_totals`` maps each logged function name to its summed
        ``total_input`` and ``total_output`` token counts.
    """
    print(_LLM_USAGE)
    print(_LLM_USAGE_SPLIT)

    # Sum the per-call records by the function that made the call.
    function_totals = {}
    for record in _LLM_USAGE_SPLIT:
        totals = function_totals.setdefault(
            record['function'], {'total_input': 0, 'total_output': 0})
        totals['total_input'] += record['input_usage']
        totals['total_output'] += record['output_usage']

    return _LLM_USAGE, _LLM_USAGE_SPLIT, function_totals
41
+
42
+
43
+
44
+ load_dotenv()
45
+
46
+ LLM_TYPE = 'google'
47
+
48
def get_llm_instructor():
    """Return an instructor-wrapped client for the provider in ``LLM_TYPE``.

    Returns:
        The instructor client for 'groq', 'openrouter', 'cerebras' or 'google'.

    Raises:
        KeyError: If the provider's API key environment variable is missing
            (groq, cerebras and google read it with ``os.environ[...]``).
        ValueError: If ``LLM_TYPE`` names no supported provider.
    """
    if LLM_TYPE == 'groq':
        return instructor.from_groq(Groq(api_key=os.environ["GROQ_API_KEY"]), mode=instructor.Mode.TOOLS)

    if LLM_TYPE == 'openrouter':
        return instructor.from_openai(
            OpenAI(api_key=os.getenv("OPENROUTER_API_KEY"), base_url="https://openrouter.ai/api/v1"),
            mode=instructor.Mode.MD_JSON)

    if LLM_TYPE == 'cerebras':
        return instructor.from_cerebras(
            Cerebras(api_key=os.environ['CEREBRAS_API_KEY']),
            mode=instructor.Mode.CEREBRAS_JSON)

    if LLM_TYPE == 'google':
        # configure() registers the key globally; it is a setup call, not a
        # generation config, so it must not be passed to the model.
        genai.configure(api_key=os.environ['GEMINI_API_KEY'])
        return instructor.from_gemini(
            client=genai.GenerativeModel(model_name="models/gemini-1.5-flash-latest"),
            mode=instructor.Mode.GEMINI_JSON)

    # Fail loudly rather than hand callers a None client.
    raise ValueError(f"Unsupported LLM_TYPE: {LLM_TYPE!r}")
62
+
63
+
64
def _wait_for_rate_limit() -> None:
    """Block until the shared moving-window limiter grants a slot."""
    # hit() checks AND consumes a slot; test() alone never records a call,
    # so the window would never fill up.
    while not moving_window.hit(rate_limit):
        time.sleep(0.1)


def _log_retry(retry_state) -> None:
    """Tenacity ``before`` hook: announce an attempt (return value is ignored)."""
    print('retrying....')


def _record_usage(completion, model: str, logging_fn: str) -> None:
    """Add the token counts of ``completion`` to the usage counters."""
    if LLM_TYPE == 'google':
        usage = completion.usage_metadata
        input_tokens = usage.prompt_token_count
        output_tokens = usage.candidates_token_count
    else:
        usage = completion.usage
        input_tokens = usage.prompt_tokens
        output_tokens = usage.completion_tokens

    _LLM_USAGE[model]['input_tokens'] += input_tokens
    _LLM_USAGE[model]['output_tokens'] += output_tokens
    _LLM_USAGE_SPLIT.append({
        'function': logging_fn,
        'input_usage': input_tokens,
        'output_usage': output_tokens
    })


def call_llm(instructions: str, context: dict, response_model: Any, model_type: str = 'slow', additional_messages: Union[List[Dict[str, str]], None] = None, logging_fn='default') -> Any:
    """Standardizes LLM calls with optional retries.

    Args:
        instructions: System prompt.
        context: Values passed to the client as ``context``.
        response_model: Model class the response is parsed into.
        model_type: 'rag', 'slow' or anything else (fast model).
        additional_messages: Extra chat messages appended after the system
            message.
        logging_fn: Label under which token usage is recorded.

    Returns:
        The parsed response object.

    Raises:
        InstructorRetryException: If the second round of attempts fails too.
    """
    messages = [{"role": "system", "content": instructions}]
    if additional_messages:
        messages.extend(additional_messages)

    model = MODEL_RAG if model_type == 'rag' else (MODEL if model_type == 'slow' else MODEL_FAST)

    # Arguments shared by every attempt; the google client receives no
    # explicit model argument.
    request = {
        "messages": messages,
        "context": context,
        "response_model": response_model,
    }
    if LLM_TYPE != 'google':
        request["model"] = model

    _wait_for_rate_limit()
    client = get_llm_instructor()
    try:
        if LLM_TYPE == 'google':
            first_attempt = {}
        else:
            first_attempt = {
                "temperature": 0.5,
                "max_retries": Retrying(
                    stop=stop_after_attempt(2),
                    wait=wait_exponential(multiplier=1.5, min=10, max=60)),
            }
        response, completion = client.chat.completions.create_with_completion(
            **request, **first_attempt)

    except InstructorRetryException as e:
        print(e)
        _wait_for_rate_limit()
        if LLM_TYPE == 'google':
            max_retries = Retrying(
                stop=stop_after_attempt(3),
                wait=wait_exponential(multiplier=1.5, min=10, max=60),
                before=_log_retry
            )
        else:
            max_retries = 3
        response, completion = client.chat.completions.create_with_completion(
            **request, max_retries=max_retries)

    # Update usage statistics
    _record_usage(completion, model, logging_fn)

    return response
137
+
138
+
139
+ if __name__ == "__main__":
140
+ class ResponseModel(TypedDict):
141
+ answer: str
142
+
143
+ instructions = "What are the key differences between Glean Search and MS Copilot?"
144
+ context = {}
145
+ response_model = ResponseModel
146
+ print(call_llm(instructions, context, response_model))
prompts.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FIND_SEARCH_TERMS_INSTRUCTIONS = """
2
+ You are writing a {{report_type}} report on the following topic:
3
+ {{original_query}}
4
+
5
+ Report synopsis:
6
+ {{report_synopsis}}
7
+
8
+ You MUST provide exactly 5 search queries to search for information to write this report.
9
+ The search queries should allow you to get a breadth of information related to the topic.
10
+ Make sure the queries are specific enough to find high-quality, relevant sources."""
11
+
12
+ GENERATE_REPORT_OUTLINE_INSTRUCTIONS = """You are an expert technical writer, helping to plan a {{report_type}} report.
13
+
14
+ Your goal is to generate the outline of the sections of the report with a maximum of {{num_sections}} sections.
15
+
16
+ The overall topic of the report is:
17
+
18
+ {{topic}}
19
+
20
+ Use the following roundtable discussion to generate the outline of the report.
21
+
22
+ {{discussion}}
23
+
24
+ This is the expectation of the reader from the report:
25
+
26
+ {{context}}
27
+
28
+ Now, generate the {{num_sections}} sections of the report. Each section should have the following fields:
29
+
30
+ - Name - Name for this section of the report.
31
+ - Description - what needs to be covered in this section?
32
+ - Subsections - titles of the subsections if any.
33
+ - Content - The content of the section, which you will leave blank for now.
34
+
35
+ Ignore the Introduction and Conclusion sections. Respond in JSON format"""
36
+
37
+
38
+
39
+ QUALITY_CHECKER_INSTRUCTIONS = """You are an experienced Wikipedia writer and want to edit a specific section of a page titled:
40
+ {{title_description}}
41
+
42
+ Besides your identity as a Wikipedia writer, you have a specific focus when researching the topic. \
43
+ Now, you are chatting with an expert to get information. Ask good questions, one at a time, to get more useful information.
44
+
45
+ Please **ONLY** ask one question at a time and don't ask what you have asked before.\
46
+ Your questions should be related to {{title_description}}
47
+ Be comprehensive and curious, gaining as much unique insight from the expert as possible.\
48
+
49
+ Stay true to your specific perspective:
50
+
51
+ {{persona}}
52
+
53
+
54
+ Guidelines:
55
+ - Do not introduce yourself or your role in the conversation
56
+ - No need to thank the expert for their answers, just ask your next question.
57
+ - Respond in JSON format only
58
+ """
59
+
60
+ WARM_START_DISCUSSION_INSTRUCTIONS = """You are an experienced Wikipedia writer and want to edit a specific section of a page titled:
61
+ {{title_description}}
62
+
63
+ Besides your identity as a Wikipedia writer, you have a specific focus when researching the topic. \
64
+ Now, you are chatting with an expert to get information.
65
+
66
+ Your questions should be related to {{title_description}}
67
+ Be comprehensive and curious, gaining as much unique insight from the expert as possible.
68
+
69
+ Stay true to your specific perspective:
70
+
71
+ {{persona}}
72
+
73
+ Contiue the following discussion:
74
+
75
+ Guidelines:
76
+ - Do not introduce yourself or your role in the conversation
77
+ - No need to thank the expert for their answers, just ask your next question.
78
+ - Respond in JSON format only
79
+ """
80
+
81
+ # Based on your focus, frame your questions so that you get info on the following:
82
+
83
+ # {{key_questions}}
84
+ IMPROVE_CONTENT_CREATE_QUERY_INSTRUCTIONS = """You are an expert wikipedia writer who can use information effectively.
85
+ Besides your identity as a Wikipedia writer, you have a specific focus when researching the topic. \
86
+ Stay true to your persona and perspective:
87
+
88
+ {{persona}}
89
+
90
+ You are chatting with an expert who wants\
91
+ to write a report on the topic you know : {{section_topic}}
92
+
93
+ Experts Question : {{expert_question}}
94
+
95
+ Generate 3 google search queries to find content that answers the experts question."""
96
+
97
+ CREATE_SYNOPSIS_INSTRUCTIONS = """ You are a marketer for a publishing company and you are tasked with creating a synopsis for a {{report_type}} report.
98
+ Topic: {{topic}}
99
+
100
+ The reader wants some key questions answered : {{key_questions}}
101
+
102
+ Write a synopsis of the report in 5-6 sentences, so the reader knows what to expect.
103
+ """
104
+
105
+ IMPORVE_CONTENT_ANSWER_QUERY_INSTRUCTION = """You are an expert wikipedia writer who can use information effectively.
106
+
107
+ Besides your identity as a Wikipedia writer, you have a specific focus when researching the topic. \
108
+ Stay true to your persona and perspective:
109
+
110
+ {{persona}}
111
+
112
+ You are chatting with an expert who wants\
113
+ to write a report on the topic you know : {{section_topic}}
114
+
115
+ Question : {{expert_question}}
116
+
117
+ Search Results : {{search_results}}
118
+
119
+
120
+ Response Guidelines:
121
+ Make your response as informative as possible and make sure every sentence is supported by the gathered information.
122
+ If the search results is not directly related to the [Topic] and [Question], provide the most relevant answer you can based on the available information, and explain any limitations or gaps.
123
+ You MUST use [1], [2], ..., [n] in line (for example, "The capital of the United States is Washington, D.C.[1][3].") referring to the search results.
124
+ Do NOT list the sources at the end, but you need to cite the search results in your response.
125
+ Your response should not exceed 150-200 words.
126
+
127
+ Here's an example of how you must respond:
128
+ <example>:
129
+
130
+ Response: The James Webb Space Telescope (JWST) has revolutionized our understanding of the universe by capturing infrared light, allowing scientists to see the earliest galaxies formed after the Big Bang [1][2][5]. It also provides unparalleled clarity for studying exoplanet atmospheres and stellar formation [3][4].
131
+
132
+ Search Results:
133
+ [1] Title: Webb’s First Images Unveil the Cosmos in Unprecedented Detail
134
+ Snippet: NASA's James Webb Space Telescope captures images of ancient galaxies formed just 200 million years after the Big Bang. Its ability to detect faint infrared light gives scientists new insights into the early universe.
135
+ URL: XYZ
136
+
137
+ [2] Title: The Science Behind Webb: Seeing the Unseen
138
+ Snippet: Webb’s infrared instruments allow it to pierce through cosmic dust and gas, providing detailed views of star and planet formation that were previously obscured.
139
+ URL: XYZ
140
+
141
+ [3] Title: A Closer Look at Exoplanets with JWST
142
+ Snippet: The James Webb Space Telescope offers an unprecedented ability to analyze exoplanet atmospheres, identifying key molecules like water vapor and methane that could indicate potential habitability.
143
+ URL: XYZ
144
+
145
+ [4] Title: Stellar Nurseries Revealed: Webb’s Role in Understanding Star Formation
146
+ Snippet: Webb has provided high-resolution images of stellar nurseries, helping scientists understand how stars form and evolve in various cosmic environments.
147
+ URL: XYZ
148
+
149
+ [5] Title: How Webb's Infrared Technology Changes Our View of Space
150
+ Snippet: Unlike the Hubble, Webb operates primarily in the infrared spectrum, which is crucial for detecting the faintest and most distant objects in the universe.
151
+ URL: XYZ
152
+
153
+
154
+
155
+ .......
156
+ </example>
157
+
158
+ Respond in JSON format without markdown
159
+
160
+ """
161
+
162
+ GENERATE_ROUNDTABLE_PERSONAS_INSTRUCTIONS = """You need to select a diverse (and distinct) group of max {{num_personas}} experts who will participate in a roundtable discussion on the topic : {{topic}}
163
+ The experts will help the audience understand unique perspectives that need to be covered in the report.
164
+ Its important to go broad so that you can get different perspectives on the topic.\
165
+
166
+ For example, if the discussion focus is about a recent event at a specific university, consider inviting students, faculty members, journalists covering the event, university officials, and local community members.
167
+ You can use the provided context for inspiration. For each expert, add a 2-3 line description of what they will focus on and how they will make the roundtable discussion interesting for the audience.
168
+
169
+ Search Engine snippets of the topic:
170
+ {{context}}
171
+
172
+ Base your personas on the following expectations from the audience:
173
+ {{report_synopsis}}
174
+ """
175
+
176
+ ROUNDTABLE_DISCUSSION_INSTRUCTIONS = """You are {{persona_name}}, a {{persona_role}} working at {{persona_affiliation}}. You are participating in a roundtable discussion on the topic: {{topic}}
177
+ Your focus area is : {{persona_focus}}
178
+
179
+ Along with you, the following experts are participating in the roundtable discussion:
180
+ {{personas}}
181
+
182
+ Here is the discussion so far:
183
+
184
+ {{discussion}}
185
+
186
+ Its your turn to contribute to the discussion, response with a short answer not exceeding 200 words. Also ask the next expert a question to keep the discussion going.
187
+ """
188
+
189
+
190
+
191
+ GENERATE_PERSONAS_INSTRUCTIONS = """You need to select a diverse (and distinct) group of max {{num_personas}} experts who will work together to create a comprehensive {{type_of_report}} report on the topic : {{topic}}
192
+
193
+ The experts will help you understand the unique questions and perspectives that need to be covered in the report.
194
+ Its important to go broad so that you can get different perspectives on the topic.\
195
+
196
+ For example, if the discussion focus is about a recent event at a specific university, consider inviting students, faculty members, journalists covering the event, university officials, and local community members.
197
+ You can use the provided context for inspiration. For each expert, add a 2-3 line description of what they will focus on and how they will help.
198
+
199
+ The expectation from the reader is a report with the following synopsis:
200
+ {{report_synopsis}}
201
+ """
202
+
203
+ ORGANIZE_MINDMAP_INSTRUCTIONS = """You are a seasoned research assistant tasked with organizing the key concepts and ideas as a mindmap on the following topic:
204
+ {{topic}}
205
+
206
+ Here is your mindmap so far:
207
+ {{mindmap}}
208
+
209
+ Condense this discussion into the mindmap.
210
+ {{discussion}}
211
+
212
+ You can choose to create_new_topic or insert_into_existing_topic.
213
+
214
+ If creating a new topic, provide a name for the new topic along with the subtopics
215
+
216
+ If inserting into an existing topic, provide the name of the existing topic and the subtopics to insert.
217
+
218
+ Respond in JSON format only.
219
+ """
220
+
221
+
222
+
223
+ REORGANIZE_TOPIC_INSTRUCTIONS = """
224
+ Given the current mindmap structure, reorganize it into a more balanced and coherent structure.
225
+ Each topic should have no more than {{max_subtopics}} subtopics.
226
+
227
+ {{mindmap}}
228
+
229
+ Organize these subtopics into 2-3 new topics that are more focused and manageable.
230
+ Return:
231
+ 1. The new topics with their subtopics
232
+ 2. Mapping between the old structure to the new structure. Eg: {old_topic/old_sub_topic : new_topic/new_sub_topic}
233
+ """
234
+
235
+
236
+ WRITE_TOPIC_SUMMARY_INSTRUCTIONS = """You are a seasoned research assistant tasked with writing a subsection based on the following discussion:
237
+ {{discussion}}
238
+
239
+ References:
240
+ {{references}}
241
+
242
+ Section Title: {{section_title}}
243
+ Here's a synoposis of the full report (you are only writing one section of it). {{synopsis}} Keep this theme in mind when writing the summary.
244
+
245
+
246
+ Give an title to the summary you generate.
247
+ Make your response as informative as possible and make sure every sentence is supported by the gathered information.
248
+ You MUST use [1], [2], ..., [n] in line (for example, "The capital of the United States is Washington, D.C.[1][3].") referring to the search results.
249
+ Do NOT list the sources at the end, but you need to cite the search results in your response.
250
+ """
251
+
252
# Prompt for writing one report section from the gathered, cited material.
# Placeholders: report_type, topic, gathered_info, section_title,
# section_description, section_questions, synopsis, section_length.
WRITE_SECTION_INSTRUCTIONS = """
You are a seasoned wikipedia writer tasked with writing a section of a {{report_type}} report on: {{topic}}

You have gathered information on multiple topics each with citations.

{{gathered_info}}

Now you need to write the section on :
Section Title: {{section_title}}
Section Description: {{section_description}}
Readers expect to answer the following questions after reading this section: {{section_questions}}

Here's a synopsis of the full report, of which you are writing the section: {{section_title}}:
{{synopsis}}

Keep this theme in mind when writing the summary.

Guidelines for Section Writing:

1. Keep the citations and reference numbers as-is. Do NOT change the reference numbers.
2. In case you want to merge information, merge the citations as well (eg: sentenceA[2], sentenceB[3] --> the citation should be [2][3])
3. Keep the language simple and easy to understand for a layman.
4. Create subsections as required.
5. The whole section should not be more than {{section_length}} words, including the subsections.
6. Maintain markdown format with headers, bold, italics, newline and bullet points - visual styling is important for the reader.

Write the content for the section : {{section_title}}



"""
research_manager.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ui
2
+ from typing import List
3
+ from pydantic import BaseModel, Field
4
+ import time
5
+ import gradio as gr
6
+ from llm_config import call_llm, get_llm_usage
7
+ import prompts
8
+ from colorama import Fore, Style
9
+ # Add these imports at the top
10
+ from search import fetch_search_results
11
+ from improve_content import ImproveContent
12
+ import re
13
+
14
+ # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
15
+ # logger = logging.getLogger(__name__)
16
+
17
# Schema for a single report section: outline metadata plus the section text.
# Used both as an element of the generated outline (see Sections) and as the
# object section_writer fills in with the final content.
class Section(BaseModel):
    # Heading of the section; ResearchManager also uses it as the task_status key.
    name: str = Field(
        description="Name for this section of the report.",
    )
    # Short summary of what the section should cover.
    description: str = Field(
        description="Brief overview of the main topics and concepts to be covered in this section.",
    )
    # Questions the section is expected to answer.
    questions: List[str] = Field(
        description="Key Questions to answer in this section."
    )
    # Body text; section_writer overwrites this with the cited markdown it produces.
    content: str = Field(
        description="The content of the section."
    )
30
+
31
+
32
# Ordered collection of report sections, i.e. the report outline.
# (Documented with comments rather than a class docstring so the model's
# generated schema description is left untouched.)
class Sections(BaseModel):
    sections: List[Section] = Field(
        description="Sections of the report.",
    )

    @property
    def as_str(self) -> str:
        """Render the outline as markdown-like text.

        Each section becomes a ``## name`` heading followed by its description,
        its questions (separated by blank lines) and its content; sections are
        separated by blank lines. An empty or missing section list yields ``""``.
        """
        # The separator is bound to a name because a backslash inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        question_separator = "\n\n"
        rendered = []
        for section in self.sections or []:
            questions = question_separator.join(section.questions)
            rendered.append(
                f"## {section.name}\n\n-{section.description}\n\n"
                f"- Questions: {questions}\n\n- Content: {section.content}\n"
            )
        return "\n\n".join(rendered)

    def print_sections(self) -> str:
        """Return the content of every section joined by blank lines.

        Despite the name, nothing is printed; the joined text is returned.
        """
        return '\n\n'.join(s.content for s in self.sections)
47
+
48
# Schema pairing one research area with the search query used to investigate it.
class ResearchArea(BaseModel):
    # Name of the area to research (required).
    area : str = Field(..., title="Research Area")
    # Search query for this area (required).
    search_terms : str = Field(..., title = "Search Term", description = "Search query that will help you find information")
51
+
52
# Schema wrapping a list of ResearchArea entries.
class ResearchFocus(BaseModel):
    # The research areas (required).
    areas : List[ResearchArea] = Field(..., title="Research Areas")
54
+
55
+
56
# Schema for a selection of search results together with the reasons for choosing them.
class RelevantSearchResults(BaseModel):
    # Positions of the selected results within the search-results list.
    # NOTE(review): whether positions are 0- or 1-based is not visible here — confirm with the prompt that uses this model.
    relevant_search_results : List[int] = Field(..., title="Relevant Search Results", description="The position of the search result in the search results list")
    # Reasoning for the selection; presumably one entry per selected result — confirm.
    reasoning : List[str] = Field(..., title="Reasoning", description="Reasoning for selecting the search results")
59
+
60
# Schema for a single search query.
class SearchTerm(BaseModel):
    # The query text (required).
    query : str = Field(..., title="Search Query")
    # A time-range field was sketched here but is currently disabled:
    #time_range : str = Field(..., title="Time Range", description="d/w/m/y/none")
63
+
64
# Schema for a list of search queries; used as the response model when
# ResearchManager asks the LLM for search terms.
class SearchTermsList(BaseModel):
    # The queries as plain strings (required).
    queries : List[str] = Field(..., title="Search Terms as a list")
66
+
67
+
68
# Schema describing one editor persona (name, affiliation, role and focus).
class Editor(BaseModel):
    # Display name of the editor.
    name: str = Field(
        description="Name of the editor.",
    )
    # Organisation the editor is primarily associated with.
    affiliation: str = Field(
        description="Primary affiliation of the editor.",
    )
    # What part the editor plays with respect to the topic.
    role: str = Field(
        description="Role of the editor in the context of the topic.",
    )
    # Free-text description of what the editor concentrates on.
    focus: str = Field(
        description="Description of the editor's focus area, concerns and how they will help.",
    )

    @property
    def persona(self) -> str:
        """Return a multi-line text summary of the editor.

        The summary lists role, affiliation and focus (labelled "Description");
        the editor's name is not included.
        """
        return f"\nRole: {self.role}\nAffiliation: {self.affiliation}\nDescription: {self.focus}\n"
85
+
86
+
87
# Schema wrapping a list of Editor personas; used as the response model for
# both persona-generation LLM calls in ResearchManager.
class Perspectives(BaseModel):
    # The generated editor personas.
    editors: List[Editor] = Field(
        description="Comprehensive list of editors with their roles and affiliations.",
    )
91
+
92
# Schema for the report synopsis returned by create_report_synopsis.
class ReportSynopsis(BaseModel):
    # The synopsis text (required).
    synopsis: str= Field(..., title="Report Synopsis", description="A synopsis talking about what the reader can expect")
94
+
95
+
96
# Schema for the written text of one section; the response model used by section_writer.
class SectionContent(BaseModel):
    # The section text (required).
    content: str = Field(..., title="Section Content", description="The content of the section")
98
+
99
+
100
+ class ResearchManager:
101
+ """Manages the research process including analysis, search, and documentation"""
102
+ def __init__(self, research_task):
103
+ self.use_existing_outline = True
104
+ self.research_task = research_task
105
+ self.report_synopsis = ''
106
+ self.personas = ''
107
+ self.gradio_report_outline = ''
108
+ self.task_status = {
109
+ 'synopsis_draft' : {"name": "Creating synopsis of the report...", "status": "pending"},
110
+ 'gathering_info' : {"name": "Gathering Info on the topic...", "status": "pending"},
111
+ 'running_searches' : {"name": "Run search...", "status": "pending"},
112
+ 'mock_discussion' : {"name": "Conducting mock discussions...", "status": "pending"},
113
+ 'generating_outline': {"name": "Generating a draft outline...", "status": "pending"},
114
+ }
115
+
116
+
117
+
118
def extract_citation_info(self, text):
    """Build a lookup of citation number -> URL and raw reference text.

    Args:
        text: Iterable of reference strings. Each is expected to contain a
            citation marker such as ``[3]`` and, optionally, a line of the
            form ``URL: http(s)://...``.

    Returns:
        Dict keyed by the citation number as a string (e.g. ``"3"``). Each
        value is ``{'url': <str or None>, 'reference_text': <original string>}``.
        Strings without any ``[n]`` marker are skipped; if a number occurs
        more than once, the last occurrence wins.
    """
    citation_pattern = re.compile(r'\[(\d+)\]')
    url_pattern = re.compile(r'URL: (https?://\S+)')

    references = {}
    for ref in text:
        citation_match = citation_pattern.search(ref)
        if citation_match is None:
            # Nothing to key this entry on. Previously such entries (e.g. the
            # empty strings produced by splitting on blank lines) were all
            # stored under the key None, overwriting one another.
            continue

        url_match = url_pattern.search(ref)
        references[citation_match.group(1)] = {
            'url': url_match.group(1) if url_match else None,
            'reference_text': ref,
        }
    return references
136
+
137
def section_writer(self, section: Section):
    """Research and write one section, yielding Gradio updates along the way.

    This is a generator. It runs an ImproveContent interview for the section,
    asks the LLM to write the section from the gathered material, rewrites the
    citation markers in the result, stores the text on ``section.content`` and
    marks the section's task as done.

    Args:
        section: Outline entry to write. ``self.task_status[section.name]``
            must already exist (start_research creates it).

    Yields:
        Whatever ``update_gradio`` and the interview generator yield.

    Returns:
        The same ``section`` object with ``content`` filled in (delivered via
        ``yield from`` / StopIteration).
    """

    improve_content = ImproveContent(section.name,
        section.description,
        section.questions,
        self.personas.editors
    )
    # The interview is itself a generator; its return value is not used below.
    improved_content = yield from improve_content.create_and_run_interview(self.task_status, self.update_gradio)
    content, references = improve_content.generate_final_section(self.report_synopsis)

    self.task_status[section.name]["name"] = "Writing Section: " + section.name
    yield from self.update_gradio()

    ui.system_update(f"Writing Section: {section.name}")
    section_content = call_llm(
        instructions=prompts.WRITE_SECTION_INSTRUCTIONS,
        model_type='slow',
        context={"section_description": section.description,
            "gathered_info" : '\n\n'.join(content),
            "topic": self.research_task['topic'],
            "section_title" : section.name,
            "synopsis" : self.report_synopsis,
            "section_questions" : '\n'.join(section.questions),
            'report_type': self.research_task['report_type'],
            'section_length': self.research_task['section_length']},
        response_model=SectionContent,
        logging_fn='write_section_instructions'
    )

    #references = '\n\n'.join(references)
    # `references` is treated as one string with entries separated by blank lines.
    references_dict = self.extract_citation_info(references.split('\n\n'))

    #Replacing citations with [2,3,4] format with [2][3][4]
    # The pattern also matches single markers like [2]; for those the replacement is a no-op.
    cited_references_raw = re.findall(r'\[(\d+(?:,\s*\d+)*)\]', section_content.content)
    for group in cited_references_raw:
        nums_list = group.split(',')
        new_string = ''.join(f'[{n.strip()}]' for n in nums_list)
        old_string = f'[{group}]'
        section_content.content = section_content.content.replace(old_string, new_string)

    # Flatten the matched groups into individual citation numbers (as strings).
    parsed_cited_references = []
    for ref_group in cited_references_raw:
        for ref_no in ref_group.split(','):
            parsed_cited_references.append(ref_no.strip())

    used_references = {}
    # Citation numbers that appear in the text but have no matching reference entry.
    uncited_sources= []
    for reference_no in parsed_cited_references:
        reference = references_dict.get(reference_no)
        if reference:
            used_references[reference_no] = reference
        else:
            print(f"Reference {reference_no} not found")
            uncited_sources.append(reference_no)
            # Unknown citations are replaced by a visible "[!]" marker.
            section_content.content = section_content.content.replace(f"[{reference_no}]", "[!]")

    # Turn each known citation into a markdown link, e.g. [2] -> [[2]](url).
    # References without a URL keep their plain [n] marker.
    for ref_no, data in used_references.items():
        if data["url"]:
            section_content.content = section_content.content.replace(f"[{ref_no}]", f"[[{ref_no}]]({data['url']})")

    section.content = section_content.content
    print(section_content.content)
    self.task_status[section.name]["status"] = "done"
    yield from self.update_gradio(report_outline_str=self.report_outline.print_sections(), button_disable=False)
    ui.system_update("Waiting for 5 seconds before next section")
    # Blocking pause between sections; presumably to stay under API rate limits — confirm.
    time.sleep(5)
    return section
206
+
207
+
208
def _generate_report_outline(self):
    """Gather context for the topic and generate the report outline.

    This is a generator. In order it: asks the LLM for search queries, runs
    the searches, generates five round-table personas, runs a warm-start
    discussion among them and finally asks the LLM for a three-section
    outline. ``self.task_status`` is updated before each stage and a Gradio
    update is yielded so the UI can refresh.

    Side effects:
        Sets ``self.context`` to the formatted search results.

    Yields:
        Whatever ``update_gradio`` and ``fetch_search_results`` yield.

    Returns:
        The generated ``Sections`` outline (delivered via ``yield from``).
    """
    topic = self.research_task['topic']
    report_type = self.research_task['report_type']

    ui.system_update("\nGathering Context..")
    self.task_status['gathering_info']["status"] = "running"
    yield from self.update_gradio()

    queries = call_llm(
        instructions=prompts.FIND_SEARCH_TERMS_INSTRUCTIONS,
        model_type='fast',
        context={
            "report_type": report_type,
            "original_query": topic,
            "report_synopsis": self.report_synopsis,
        },
        response_model=SearchTermsList,
        logging_fn='find_search_terms_instructions'
    )

    self.task_status['running_searches']["status"] = "running"
    yield from self.update_gradio()

    # Only the formatted text is needed here; the raw result list is discarded.
    formatted_results, _ = yield from fetch_search_results(
        query=queries.queries,
        task_status=self.task_status,
        task_name='running_searches',
        fn=self.update_gradio,
    )
    self.context = formatted_results

    self.task_status['running_searches']["status"] = "done"
    self.task_status['gathering_info']["status"] = "done"
    self.task_status['mock_discussion']["status"] = "running"
    yield from self.update_gradio()

    personas = call_llm(
        instructions=prompts.GENERATE_ROUNDTABLE_PERSONAS_INSTRUCTIONS,
        model_type='slow',
        context={
            "context": self.context,
            "topic": topic,
            "report_synopsis": self.report_synopsis,
            'type_of_report': report_type,
            'num_personas': 5,
        },
        response_model=Perspectives,
        logging_fn='generate_roundtable_personas_instructions'
    )

    self.task_status['mock_discussion']["name"] = "Started discussions..."
    print(personas)
    yield from self.update_gradio()

    # The description is derived from the actual topic; it used to be
    # hard-coded to an unrelated subject ("glean") for every run.
    improve_content = ImproveContent(
        topic,
        f"This section will focus on a comprehensive overview of {topic}",
        self.research_task['key_questions'],
        personas.editors,
    )
    warm_start_discussion = improve_content.warm_start_discussion()

    self.task_status['mock_discussion']["name"] = "Mock discussions complete"
    self.task_status['mock_discussion']["status"] = "done"
    self.task_status['generating_outline']["status"] = "running"
    yield from self.update_gradio()

    ui.system_update("\nGenerating Report Outline..")

    report_outline = call_llm(
        instructions=prompts.GENERATE_REPORT_OUTLINE_INSTRUCTIONS,
        model_type='slow',
        context={
            "report_type": report_type,
            "topic": topic,
            "context": self.context,
            "discussion": '\n'.join(warm_start_discussion),
            'num_sections': 3
        },
        response_model=Sections,
        logging_fn='generate_report_outline_instructions'
    )

    self.task_status['generating_outline']["status"] = "done"
    yield from self.update_gradio(report_outline_str=report_outline.as_str)
    print(report_outline.as_str)

    return report_outline
298
+
299
+
300
def validate_outline_with_human(self, report_outline: Sections) -> Sections:
    """Refine the outline from terminal feedback until the user answers 'ok'.

    Each round prompts for multi-line feedback on the console. If the feedback
    is 'ok' (case-insensitive, surrounding whitespace ignored) the current
    outline is returned; otherwise the LLM is asked to revise the outline and
    the revised version is printed for the next round.

    Args:
        report_outline: The outline to review.

    Returns:
        The outline the user approved.
    """
    while True:
        ui.system_update("\nPlease provide feedback on the generated report outline")
        feedback = ui.get_multiline_input()
        if feedback.strip().lower() == 'ok':
            return report_outline

        ui.system_update("\nImproving the report outline based on your feedback")
        # Goes through call_llm like every other LLM request in this class; the
        # earlier pipe-chain relied on `self.llm`, which is never defined here.
        # NOTE(review): assumes prompts.IMPROVE_REPORT_OUTLINE_PROMPT is a plain
        # template string like the other prompts — confirm.
        report_outline = call_llm(
            instructions=prompts.IMPROVE_REPORT_OUTLINE_PROMPT,
            model_type='slow',
            context={
                "topic": self.research_task['topic'],
                "feedback": feedback,
                "report_outline": report_outline.as_str,
            },
            response_model=Sections,
            logging_fn='improve_report_outline_prompt'
        )
        ui.system_output(report_outline.as_str)
312
+
313
+
314
+
315
def create_report_synopsis(self):
    """Ask the fast model for a synopsis of the report.

    The prompt context carries the report type, topic and key questions from
    ``self.research_task``.

    Returns:
        The result of ``call_llm`` requested with ``ReportSynopsis`` as the
        response model.
    """
    task = self.research_task
    synopsis_context = {
        field: task[field]
        for field in ("report_type", "topic", "key_questions")
    }
    synopsis = call_llm(
        instructions=prompts.CREATE_SYNOPSIS_INSTRUCTIONS,
        model_type='fast',
        context=synopsis_context,
        response_model=ReportSynopsis,
        logging_fn='create_synopsis_instructions'
    )
    return synopsis
327
+
328
+
329
def update_gradio(self, report_outline_str = '', button_disable = False):
    """Yield one Gradio update: button state, progress HTML, report text.

    Args:
        report_outline_str: New report/outline text to show. The default ``''``
            keeps whatever text was shown last.
        button_disable: Passed straight through as the button's ``interactive``
            flag (so ``False`` leaves the button non-interactive).

    Yields:
        A three-item list ``[button_update, progress_html, report_text]``.
    """
    outline_supplied = report_outline_str != ''
    if outline_supplied:
        # Remember the latest text so later calls without an argument reuse it.
        self.gradio_report_outline = report_outline_str

    button_update = gr.update(interactive=button_disable)
    progress_html = self.update_ui()
    yield [button_update, progress_html, self.gradio_report_outline]
334
+
335
+
336
def start_research(self):
    """Run the full research pipeline as a generator of Gradio updates.

    Stages, in order: create the report synopsis, generate the outline,
    register one task_status entry per outline section, generate two writer
    personas, then write every section via section_writer. Finally each
    section's content is printed to stdout.

    Yields:
        Whatever ``update_gradio``, ``_generate_report_outline`` and
        ``section_writer`` yield.
    """

    self.task_status['synopsis_draft']["status"] = "running"
    yield from self.update_gradio()
    ui.system_update(f"Starting research on: {self.research_task['topic']}")


    # Despite the console message, this step creates the synopsis; the outline follows below.
    ui.system_update("\nGenerating report outline")
    # NOTE(review): create_report_synopsis requests a ReportSynopsis response model, so this
    # presumably stores the model object rather than plain text — confirm that the prompts
    # receiving `report_synopsis` expect that.
    self.report_synopsis = self.create_report_synopsis()
    self.task_status['synopsis_draft']["status"] = "done"
    yield from self.update_gradio()

    self.report_outline = yield from self._generate_report_outline()
    #self.report_outline = self.validate_outline_with_human(self.report_outline)

    # Register a progress entry per section before any writing starts, so the UI lists them all.
    for section in self.report_outline.sections:
        self.task_status[section.name] = {"name": f"Starting Section: {section.name}", "status": "pending"}

    yield from self.update_gradio()

    ui.system_update("\nGenerating personas for writing sections")
    self.personas = call_llm(
        instructions=prompts.GENERATE_PERSONAS_INSTRUCTIONS,
        model_type='slow',
        context={
            "topic": self.research_task['topic'],
            "report_synopsis": self.report_synopsis,
            'type_of_report': self.research_task['report_type'],
            'num_personas': 2},
        response_model=Perspectives,
        logging_fn='generate_personas_instructions'
    )
    ui.system_update("\nWriting Sections....")
    for section in self.report_outline.sections:
        self.task_status[section.name]["status"] = "running"
        yield from self.update_gradio()
        ui.system_sub_update(f"\nWriting Section: {section.name}")
        # section_writer fills in section.content in place and returns the same object.
        section = yield from self.section_writer(section)

    for section in self.report_outline.sections:
        print(section.content)
378
+
379
+
380
+
381
def update_ui(self) -> str:
    """Render the current task_status as an HTML progress panel.

    The panel consists of an inline stylesheet, a progress bar whose width is
    the percentage of tasks with status "done", a row of milestone dots (one
    per task, the first `completed_tasks` of them marked completed) and one
    line per task with an icon reflecting its status.

    Returns:
        The HTML markup as a string.
    """
    completed_tasks = sum(1 for _, task in self.task_status.items() if task["status"] == "done")
    total_tasks = len(self.task_status)
    # Truncated (not rounded) to a whole percent.
    progress_percentage = int((completed_tasks / total_tasks) * 100)

    # Doubled braces are literal CSS braces inside the f-string.
    html_output = f"""
<style>
.progress-bar-container {{
width: 100%;
background-color: #f3f3f3;
border-radius: 5px;
overflow: hidden;
margin-bottom: 20px;
}}
.progress-bar {{
height: 20px;
width: {progress_percentage}%;
background-color: #3498db;
transition: width 0.3s;
display: flex;
align-items: center;
justify-content: center;
color: white;
font-weight: bold;
font-size: 12px;
}}
.progress-task {{
display: flex;
align-items: center;
gap: 10px;
font-family: 'Helvetica Neue', Arial, sans-serif;
margin: 5px 0;
font-size: 14px;
font-weight: 500;
color: #333;
}}
.progress-task .task-name {{
flex-grow: 1;
}}
.progress-task .icon {{
width: 20px;
height: 20px;
}}
.loading-circle {{
width: 15px;
height: 15px;
border: 3px solid #ccc;
border-top: 3px solid #3498db;
border-radius: 50%;
animation: spin 1s linear infinite;
}}
@keyframes spin {{
0% {{ transform: rotate(0deg); }}
100% {{ transform: rotate(360deg); }}
}}
.done-icon {{
color: #2ecc71;
font-size: 16px;
}}
.checkbox {{
width: 15px;
height: 15px;
border: 1px solid #ccc;
display: inline-block;
margin-right: 10px;
}}
.milestone {{
display: inline-block;
width: 10px;
height: 10px;
background-color: #ccc;
border-radius: 50%;
margin: 0 5px;
}}
.milestone.completed {{
background-color: #2ecc71;
}}
</style>
<div class='progress-bar-container'>
<div class='progress-bar'>{progress_percentage}%</div>
</div>
<div style='display: flex; justify-content: center; margin-bottom: 20px;'>
{''.join([f"<div class='milestone {'completed' if i < completed_tasks else ''}'></div>" for i in range(total_tasks)])}
</div>
"""

    # One row per task, in task_status insertion order.
    for _, task in self.task_status.items():
        if task["status"] == "running":
            icon = "<div class='loading-circle'></div>"
        elif task["status"] == "done":
            icon = "<span class='done-icon'>&#10003;</span>"
        else:
            # Any other status (e.g. "pending") is shown as an empty checkbox.
            icon = "<div class='checkbox'></div>"
        html_output += f"<div class='progress-task'><span class='icon'>{icon}</span><span class='task-name'>{task['name']}</span></div>"
    return html_output
search.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tavily import TavilyClient
2
+ import ui
3
+ from typing import List, Dict
4
+ import os
5
+ from dotenv import load_dotenv
6
+ load_dotenv()
7
+
8
+ tavily_search = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
9
+
10
+
11
def format_search_results(results: List[Dict]) -> str:
    """Render search results as numbered text blocks.

    Each result becomes three lines (``[number] Title``, ``Snippet`` and
    ``URL``) followed by a blank line; the blocks are joined with newlines.
    Missing keys fall back to ``''`` (number, body) or ``'N/A'`` (title, href).

    Args:
        results: Result dicts with optional 'number', 'title', 'body' and 'href' keys.

    Returns:
        The formatted text, or ``""`` when ``results`` is empty.
    """
    blocks = [
        f"[{entry.get('number', '')}] Title: {entry.get('title', 'N/A')}\n"
        f" Snippet: {entry.get('body', '')}\n"
        f" URL: {entry.get('href', 'N/A')}\n\n"
        for entry in results
    ]
    return "\n".join(blocks)
20
+
21
def _search_top_hits(q):
    """Run one Tavily search and return its first three hits as title/body/href dicts."""
    hits = tavily_search.search(q)["results"][:3]
    return [{'title': hit['title'], 'body': hit['content'], 'href': hit['url']} for hit in hits]


def fetch_search_results(query: List[str], task_status, task_name, fn):
    """Search for every query and collect the numbered top results.

    This is a generator: before each search it updates
    ``task_status[task_name]["name"]`` and delegates to ``fn()`` so the caller
    can refresh its UI. Use ``yield from`` to obtain the return value.

    Args:
        query: Search queries to run, in order.
        task_status: Mapping of task name -> status dict; the entry for
            ``task_name`` gets its "name" field updated per query.
        task_name: Key in ``task_status`` to update.
        fn: Zero-argument callable returning an iterable of UI updates.

    Returns:
        ``(formatted_results, results)``: the text from
        ``format_search_results`` and the list of result dicts, each with
        'number' (1-based, across all queries), 'title', 'body' and 'href'.

    Raises:
        Whatever the search raises if a query fails twice in a row.
    """
    results = []
    for q in query:
        task_status[task_name]["name"] = "Searching for: " + q
        yield from fn()
        ui.system_sub_update(f"Searching for: {q}")
        try:
            hits = _search_top_hits(q)
        except Exception:
            # Retry once on any ordinary failure; a second failure propagates.
            # (Was a bare `except:` with a copy-pasted body, which also caught
            # KeyboardInterrupt/SystemExit.)
            hits = _search_top_hits(q)
        results.extend(hits)

    results = [{'number': position, **result} for position, result in enumerate(results, start=1)]
    formatted_results = format_search_results(results)

    return formatted_results, results
42
+
43
+
44
if __name__ == "__main__":
    # Manual smoke test. fetch_search_results is a generator that requires a
    # status mapping, a task name and an update callback, so stand-ins are
    # supplied and the generator is driven to completion.
    _demo_status = {"demo_search": {"name": "", "status": "running"}}
    _demo_run = fetch_search_results(
        ['gitlab', 'github'],
        _demo_status,
        "demo_search",
        lambda: (),  # no UI to refresh: yields nothing
    )
    try:
        while True:
            next(_demo_run)
    except StopIteration as finished:
        _formatted, _ = finished.value
        print(_formatted)
ui.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from colorama import Style, Fore
2
+
3
+ import sys
4
+ import gradio as gr
5
+ import tty
6
+ import termios
7
+
8
+
9
def generate_progress_html(percent):
    """Return HTML for a horizontal progress bar filled to ``percent`` percent.

    ``percent`` is interpolated as-is, both as the inner bar's CSS width and
    as the centred label text.
    """
    markup = f"""
<div class="progress-bar" style="width: 100%; background-color: #e0e0e0; border-radius: 10px; overflow: hidden; position: relative;">
<div class="progress-bar-inner" style="height: 20px; background: linear-gradient(90deg, #76c7c0, #4ca1af); width: {percent}%; border-radius: 10px; transition: width 0.3s ease;">
<div class="progress-text" style="position: absolute; width: 100%; top: 0; left: 0; height: 100%; display: flex; align-items: center; justify-content: center; color: #fff; font-weight: bold;">{percent}%</div>
</div>
</div>
"""
    return markup
17
+
18
def get_char():
    """Read exactly one character from stdin in raw mode.

    The terminal's previous settings are captured first and restored
    afterwards, even if the read raises.
    """
    fd = sys.stdin.fileno()
    saved_settings = termios.tcgetattr(fd)
    try:
        tty.setraw(sys.stdin.fileno())
        return sys.stdin.read(1)
    finally:
        # Always put the terminal back the way it was found.
        termios.tcsetattr(fd, termios.TCSADRAIN, saved_settings)
28
+
29
+
30
+
31
def get_multiline_input():
    """Collect multi-line text from the terminal until Ctrl+B is pressed.

    Characters are read one at a time with ``get_char`` and echoed manually.
    Enter starts a new numbered line and Backspace removes the last character
    of the current line. Input ends on Ctrl+B or when stdin is exhausted.

    Returns:
        The entered lines joined with newlines. Text typed on the last line is
        included even if Enter was not pressed before Ctrl+B.
    """
    print(Fore.BLUE + 'Enter your multiline text and press Ctrl+B when you are done:')
    user_input = []
    current_line = []
    line_num = 1

    print(f"{line_num}> ", end='', flush=True)

    while True:
        char = get_char()

        # An empty read means EOF; ord('') would raise, so stop instead.
        if not char or ord(char) == 2:  # EOF or Ctrl+B
            break
        elif char == '\r' or char == '\n':  # Enter key
            user_input.append(''.join(current_line))
            current_line = []
            line_num += 1
            print(f"\n{line_num}> ", end='', flush=True)
        elif char in ('\x7f', '\x08'):  # Backspace (DEL or BS, depending on the terminal)
            if current_line:
                current_line.pop()
                # Move cursor back, overwrite with space, move back again
                print('\b \b', end='', flush=True)
        else:
            current_line.append(char)
            print(char, end='', flush=True)

    # Keep a final line that was typed but not terminated with Enter; it used
    # to be silently discarded (so typing "ok" then Ctrl+B returned '').
    if current_line:
        user_input.append(''.join(current_line))

    print('\n\n')
    return '\n'.join(user_input)
61
+
62
+
63
+
64
def system_update(update_message):
    """Print a status message in yellow, resetting any earlier styling first."""
    colour_prefix = Style.RESET_ALL + Fore.YELLOW
    print(colour_prefix + f"{update_message}")
66
+
67
def system_sub_update(update_message):
    """Print a secondary status message in cyan, resetting any earlier styling first."""
    colour_prefix = Style.RESET_ALL + Fore.CYAN
    print(colour_prefix + f"{update_message}")
69
+
70
def system_output(update_message):
    """Print an output message in green, resetting any earlier styling first."""
    # A stray second `+` used to apply unary plus to the message string,
    # which raises TypeError on every call.
    print(Style.RESET_ALL + Fore.GREEN + f"{update_message}")
72
+
73
def system_error(update_message):
    """Print an error message in red, resetting any earlier styling first."""
    colour_prefix = Style.RESET_ALL + Fore.RED
    print(colour_prefix + f"{update_message}")