from config import OPENAI_MODELS, COHERE_MODELS, GROQ_MODELS, MAX_TOKENS_BASE, MAX_TOKENS_ADVANCED
import os
import gradio as gr
import requests
import logging
from openai import AzureOpenAI, OpenAI
from cohere import ClientV2
from crewai import Agent, Task, Crew, Process, LLM
from crewai_tools import tool
from duckduckgo_search import DDGS
from newspaper import Article
import fitz # PyMuPDF
from io import BytesIO, StringIO
import sys
import threading
import queue
import time
# Basic logger configuration
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def setup_logging():
"""Set up logging for better error tracking."""
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Remove any existing handlers
if logger.hasHandlers():
logger.handlers.clear()
# Create a handler that writes to stdout
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter('%(asctime)s | %(levelname)-8s | %(message)s', datefmt='%H:%M:%S')
handler.setFormatter(formatter)
logger.addHandler(handler)
return logger
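# Note: setup_logging() is not invoked at import time; the basicConfig call
# above is the active default. A hedged usage sketch, should the stdout
# handler be preferred instead:
#   logger = setup_logging()
#   logger.info("pipeline ready")  # -> "12:00:00 | INFO     | pipeline ready"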
# Global variables
TOKENS_SUMMARIZATION = 0
MODEL_CHOICE = "openai"
def export_to_markdown(result):
"""Utility to export the final result to an output.md file."""
    try:
        # Ensure the output directory exists before writing the report
        os.makedirs("outputs", exist_ok=True)
        with open("outputs/output.md", "w", encoding="utf-8") as file:
            file.write(result)
return "outputs/output.md"
except Exception as e:
logger.error("Error exporting to markdown: %s", str(e))
return f"Error exporting: {e}"
def fetch_content(url):
"""
Fetch the content from a URL, handling either PDFs or normal web articles.
- url: The URL to fetch the content from.
"""
try:
# HEAD request to check content type
response = requests.head(url, allow_redirects=True, timeout=10)
content_type = response.headers.get('Content-Type', '').lower()
if 'application/pdf' in content_type:
# The URL points to a PDF; download and extract text
pdf_response = requests.get(url, stream=True, timeout=10)
pdf_response.raise_for_status()
pdf_file = BytesIO(pdf_response.content)
with fitz.open(stream=pdf_file, filetype="pdf") as doc:
text = ""
for page_num, page in enumerate(doc, start=1):
page_text = page.get_text()
if page_text:
text += page_text
else:
logger.warning(f"Unable to extract text from page {page_num} of the PDF.")
return text.strip()
else:
            # Not a PDF; use the newspaper library's Article to extract the text
article = Article(url)
article.download()
article.parse()
return article.text
except requests.exceptions.RequestException as req_err:
logger.error("Error in the HTTP request: %s", str(req_err))
return f"Error in the HTTP request: {req_err}"
except Exception as e:
logger.error("Error getting the content: %s", str(e))
return f"Error getting the content: {e}"
# Tools
@tool('DuckDuckGoSearchResults')
def search_results(search_query: str) -> list:
"""
Performs a web search to gather and return a collection of search results with this structure:
- title: The title of the search result.
- snippet: A short snippet of the search result.
- link: The link to the search result.
"""
try:
results = DDGS().text(search_query, max_results=5, timelimit='m')
results_list = [{"title": result['title'],
"snippet": result['body'],
"link": result['href']} for result in results]
return results_list
except Exception as e:
logger.error("Error performing search: %s", str(e))
return []
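# Hedged usage sketch for the DuckDuckGoSearchResults tool (assumes network
# access and crewai_tools' Tool.run interface; the query is illustrative):
#   results = search_results.run("quantum error correction")
#   # -> [{"title": "...", "snippet": "...", "link": "https://..."}, ...]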
@tool('WebScrapper')
def web_scrapper(url: str, topic: str) -> str:
"""
Extract and read the content of a specified link and generate a summary on a specific topic.
- url: The URL to extract the content from.
    - topic: String with the topic to generate a summary on.
"""
global TOKENS_SUMMARIZATION
try:
content = fetch_content(url)
prompt = f"""
# OBJECTIVE
Generate an in-depth summary of the following CONTENT on the topic "{topic}"
# INSTRUCTIONS
- Provide in-depth insights based on the following CONTENT.
- If the following CONTENT is not directly related to the topic "{topic}", you MUST respond with INVALID CONTENT.
        - Include insights about why the content is important for the topic, as well as possible challenges and advances.
- The format will be markdown.
- Avoid making up anything. Every insight MUST be based on the content.
# CONTENT:
"{content}"
"""
context_messages = [
{
"role": "system",
"content": "You are an expert summarizing content for use as context. Focus on the main points."
},
{
"role": "user",
"content": str(prompt)
}
]
        # Route to AzureOpenAI, OpenAI, or Cohere based on MODEL_CHOICE
if MODEL_CHOICE == "azure":
client = AzureOpenAI(
azure_endpoint=os.getenv('AZURE_API_BASE'),
azure_deployment=os.getenv('AZURE_DEPLOYMENT_ID'),
api_key=os.getenv('AZURE_OPENAI_KEY'),
api_version=os.getenv('AZURE_API_VERSION')
)
response = client.chat.completions.create(
model=os.getenv('AZURE_DEPLOYMENT_ID'),
messages=context_messages,
temperature=0.7,
max_tokens=5000
)
elif MODEL_CHOICE == "openai":
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
response = client.chat.completions.create(
model=OPENAI_MODELS['base'],
messages=context_messages,
temperature=0.7,
max_tokens=5000
)
elif MODEL_CHOICE == "groq-cohere":
client = ClientV2(api_key=os.getenv('COHERE_API_KEY'))
response = client.chat(
model=COHERE_MODELS['advanced'],
messages=context_messages,
max_tokens=1000
)
        else:
            return "Error: Invalid model choice. Please select 'azure', 'openai', or 'groq-cohere'."
        if MODEL_CHOICE in ("azure", "openai"):
            summary = response.choices[0].message.content
            TOKENS_SUMMARIZATION += response.usage.total_tokens
        else:
            # Cohere's ChatResponse exposes the text and billed units differently
            summary = response.message.content[0].text
            TOKENS_SUMMARIZATION += (response.usage.billed_units.input_tokens
                                     + response.usage.billed_units.output_tokens)
summary_response = f"""
# SUMMARY:
{summary}
# URL: {url}
"""
        # 10-second delay to avoid rate limiting with the groq-cohere option
        if MODEL_CHOICE == "groq-cohere":
            time.sleep(10)
return summary_response
except Exception as e:
logger.error("Error generating summary: %s", str(e))
return f"""
# SUMMARY:
Error generating summary.
IGNORE THIS OUTPUT.
# URL: {url}
"""
def capture_verbose_output(
agent_input,
model_choice,
azure_openai_key,
azure_deployment_id,
azure_api_base,
azure_api_version,
openai_api_key,
cohere_api_key,
groq_api_key
):
"""
    This generator captures stdout produced by the multi-agent process in real
    time, streaming logs to the Gradio interface and yielding the final result
    once the run completes.
"""
old_stdout = sys.stdout
mystdout = StringIO()
sys.stdout = mystdout
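    # Reassigning sys.stdout is process-wide, so output printed by the kickoff
    # thread below is captured in mystdout; it is restored once the run ends.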
result_container = [None]
def run_kickoff():
result_container[0] = kickoff_crew(
topic=agent_input,
model_choice=model_choice,
azure_openai_key=azure_openai_key,
azure_deployment_id=azure_deployment_id,
azure_api_base=azure_api_base,
azure_api_version=azure_api_version,
openai_api_key=openai_api_key,
cohere_api_key=cohere_api_key,
groq_api_key=groq_api_key
)
kickoff_thread = threading.Thread(target=run_kickoff)
kickoff_thread.start()
verbose_output = ""
result_output = ""
# Initialize outputs
yield gr.update(value=result_output), gr.update(value=verbose_output)
while kickoff_thread.is_alive():
# Read new output from mystdout
new_output = mystdout.getvalue()
if new_output != verbose_output:
verbose_output = new_output
yield gr.update(value=result_output), gr.update(value=verbose_output)
time.sleep(0.1)
# Once done, get final result
kickoff_thread.join()
sys.stdout = old_stdout
result_output = result_container[0]
verbose_output = mystdout.getvalue()
yield gr.update(value=result_output), gr.update(value=verbose_output)
def kickoff_crew(
topic: str,
model_choice: str,
azure_openai_key: str,
azure_deployment_id: str,
azure_api_base: str,
azure_api_version: str,
openai_api_key: str,
cohere_api_key: str,
groq_api_key: str
) -> str:
"""
Kick off the multi-agent pipeline.
"""
try:
global TOKENS_SUMMARIZATION, MODEL_CHOICE
TOKENS_SUMMARIZATION = 0
MODEL_CHOICE = model_choice
# Basic checks
if not topic.strip():
return "Error: The topic cannot be empty. Please provide a valid topic."
# ---- Define LLMs based on the user-provided inputs ----
        # Initialize the model variables to None
azure_llm_base = None
azure_llm_advanced = None
openai_llm_base = None
openai_llm_advanced = None
groq_llm_base = None
groq_llm_advanced = None
if model_choice == "azure":
if not azure_openai_key or not azure_deployment_id or not azure_api_base or not azure_api_version:
return "Error: Please provide all the required Azure OpenAI API details."
else:
                os.environ['AZURE_API_BASE'] = azure_api_base
                os.environ['AZURE_API_VERSION'] = azure_api_version
                os.environ['AZURE_DEPLOYMENT_ID'] = azure_deployment_id
                os.environ['AZURE_OPENAI_KEY'] = azure_openai_key
# Azure
azure_llm_base = LLM(
temperature=0.3,
model=f"azure/{azure_deployment_id}",
api_key=azure_openai_key,
base_url=azure_api_base,
api_version=azure_api_version,
max_tokens=4000
)
azure_llm_advanced = LLM(
temperature=0.6,
model=f"azure/{azure_deployment_id}",
api_key=azure_openai_key,
base_url=azure_api_base,
api_version=azure_api_version,
max_tokens=10000
)
elif model_choice == "openai":
if not openai_api_key:
return "Error: Please provide the OpenAI API key."
else:
                os.environ['OPENAI_API_KEY'] = openai_api_key
# OpenAI
openai_llm_base = LLM(
model=OPENAI_MODELS['base'],
api_key=openai_api_key,
max_completion_tokens=4000
)
openai_llm_advanced = LLM(
model=OPENAI_MODELS['advanced'],
api_key=openai_api_key,
temperature=0.4,
max_completion_tokens=10000
)
elif model_choice == "groq-cohere":
if not cohere_api_key or not groq_api_key:
return "Error: Please provide both the Cohere and GROQ API keys."
else:
                os.environ['COHERE_API_KEY'] = cohere_api_key
                os.environ['GROQ_API_KEY'] = groq_api_key
                # GROQ LLMs (Cohere is used separately for summarization in web_scrapper)
groq_llm_base = LLM(
model=GROQ_MODELS['base'],
api_key=groq_api_key,
temperature=0.3,
max_tokens=1000
)
groq_llm_advanced = LLM(
model=GROQ_MODELS['advanced'],
api_key=groq_api_key,
temperature=0.6,
max_tokens=4000
)
        # Dictionary grouping the LLMs by provider
llms = {
"azure": {
"base": azure_llm_base,
"advanced": azure_llm_advanced
},
"openai": {
"base": openai_llm_base,
"advanced": openai_llm_advanced
},
"groq-cohere": {
"base": groq_llm_base,
"advanced": groq_llm_advanced
}
}
# Obtain the selected LLM set
if model_choice not in llms:
return f"Error: Invalid model choice. Please select from {list(llms.keys())}."
selected_llm = llms[model_choice]
# Define Agents
researcher = Agent(
role='Researcher',
goal=f'Search and collect detailed information on topic ## {topic} ##',
tools=[search_results, web_scrapper],
llm=selected_llm["base"],
backstory=(
"You are a meticulous researcher, skilled at navigating vast amounts of information to extract "
"essential insights on any given topic. Your dedication to detail ensures the reliability and "
"thoroughness of your findings."
),
allow_delegation=False,
max_iter=15,
max_rpm=5 if model_choice == "groq-cohere" else 120,
verbose=True
)
editor = Agent(
role='Editor',
goal=f'Compile and refine the information into a comprehensive report on topic ## {topic} ##',
llm=selected_llm["advanced"],
backstory=(
"As an expert editor, you specialize in transforming raw data into clear, engaging reports. "
"Your strong command of language and attention to detail ensure that each report not only conveys "
"essential insights but is also easily understandable to diverse audiences."
),
allow_delegation=False,
max_iter=5,
max_rpm=10 if model_choice == "groq-cohere" else 120,
verbose=True
)
# Define Tasks
research_task = Task(
description=(
"Be sure to translate the topic into English first. "
"Use the DuckDuckGoSearchResults tool to collect initial search snippets on ## {topic} ##. "
"If more detailed searches are required, generate and execute new searches related to ## {topic} ##. "
"Subsequently, employ the WebScrapper tool to extract information from significant URLs, "
"extracting further insights. Compile these findings into a preliminary draft, documenting all "
"relevant sources, titles, and links associated with the topic. "
"Ensure high accuracy throughout the process and avoid any fabrication of information."
),
expected_output=(
"A structured draft report about the topic, featuring an introduction, a detailed main body, "
"and a conclusion. Properly cite sources. Provide a thorough overview of the info gathered."
),
agent=researcher
)
edit_task = Task(
description=(
"Review and refine the initial draft report from the research task. Organize the content logically. "
"Elaborate on each section to provide in-depth information and insights. "
"Verify the accuracy of all data, correct discrepancies, update info to ensure currency, "
"and maintain a consistent tone. Include a section listing all sources used, formatted as bullet points."
),
expected_output=(
"A polished, comprehensive report on topic ## {topic} ##, with a clear, professional narrative. "
"Include an introduction, an extensive discussion, a concise conclusion, and a source list with references."
),
agent=editor,
context=[research_task]
)
# Form the Crew
crew = Crew(
agents=[researcher, editor],
tasks=[research_task, edit_task],
process=Process.sequential
)
# Kick off
result = crew.kickoff(inputs={'topic': topic})
# Compute token usage (CrewAI aggregator usage)
tokens = result.token_usage.total_tokens / 1_000
tokens_summ = TOKENS_SUMMARIZATION / 1_000
if not isinstance(result, str):
result = str(result)
result += f"\n\n**Estimated tokens (Agents):** {tokens:.5f} k"
result += f"\n\n**Estimated tokens (Summarization):** {tokens_summ:.5f} k"
return result
except Exception as e:
logger.error("Error in kickoff_crew: %s", str(e))
return f"Error in kickoff_crew: {str(e)}"
def main():
"""Set up the Gradio interface for the CrewAI Research Tool."""
description_demo = """# Automatic Insights Generation with Multi-Agents (CrewAI)
- **Multi-agent framework**: CrewAI
- **Multi-agents**: Two agents, Researcher and Editor, working together to extract information from the internet and compile a report on the topic of choice.
- **Search tool**: Duck-Duck-Go-Search
- **Web Retrieval**: Newspaper4k for articles and PyMuPDF for PDFs
*Note: Groq is currently disabled due to rate limiting issues. Please use Azure or OpenAI for now.*
"""
with gr.Blocks() as demo:
gr.Markdown(description_demo)
with gr.Row():
with gr.Column(scale=1):
                # Radio: choose among azure / openai / groq-cohere
model_choice = gr.Radio(
choices=["azure", "openai", "groq-cohere"],
label="Choose Model",
value="openai",
interactive=True
)
# ------------
# LLM config inputs
# ------------
# Azure
azure_api_base_input = gr.Textbox(label="Azure API Base (url)", type="password", visible=False, interactive=True)
azure_deployment_id_input = gr.Textbox(label="Azure Deployment ID (model)", type="password", visible=False, interactive=True)
azure_openai_key_input = gr.Textbox(label="Azure API Key", type="password", visible=False, interactive=True)
azure_api_version_input = gr.Textbox(label="Azure API Version", type="text", visible=False, interactive=True)
# OpenAI
openai_api_key_input = gr.Textbox(label="OpenAI API Key", type="password", visible=True, interactive=True)
# GROQ
groq_api_key_input = gr.Textbox(label="GROQ API Key", type="password", visible=False, interactive=False)
# Cohere
cohere_api_key_input = gr.Textbox(label="Cohere API Key", type="password", visible=False, interactive=False)
export_button = gr.Button("Export to Markdown", interactive=True)
file_output = gr.File(label="Download Markdown File")
credits = gr.Markdown(
label="Credits",
show_label=True,
value="This tool is powered by [CrewAI](https://crewai.com), "
"[OpenAI](https://openai.com), "
"[Azure OpenAI Services](https://azure.microsoft.com/en-us/products/ai-services/openai-service), "
"[Cohere](https://dashboard.cohere.com), and [GROQ](https://console.groq.com/playground).",
)
with gr.Column(scale=2):
topic_input = gr.Textbox(
label="Enter Topic",
placeholder="Type here the topic of interest...",
interactive=True
)
submit_button = gr.Button("Start Research", interactive=True)
output = gr.Markdown(
label="Result",
show_copy_button=True,
value="The generated insighsts will appear here...",
latex_delimiters=[
{"left": "\\[", "right": "\\]", "display": True},
{"left": "\\(", "right": "\\)", "display": False},
]
)
verbose_output = gr.Textbox(
label="Verbose Output",
placeholder="Verbose logs will appear here...",
lines=10,
interactive=False,
show_copy_button=True
)
# ---------------
# Dynamic toggling of LLM config boxes
# ---------------
def update_model_choice(model):
"""Update visibility of config inputs based on the selected LLM."""
azure_visibility = False
openai_visibility = False
cohere_visibility = False
groq_visibility = False
if model == "azure":
azure_visibility = True
elif model == "openai":
openai_visibility = True
elif model == "groq-cohere":
cohere_visibility = True
groq_visibility = True
return {
azure_openai_key_input: gr.update(visible=azure_visibility),
azure_deployment_id_input: gr.update(visible=azure_visibility),
azure_api_base_input: gr.update(visible=azure_visibility),
azure_api_version_input: gr.update(visible=azure_visibility),
openai_api_key_input: gr.update(visible=openai_visibility),
cohere_api_key_input: gr.update(visible=cohere_visibility),
groq_api_key_input: gr.update(visible=groq_visibility),
}
model_choice.change(
fn=update_model_choice,
inputs=[model_choice],
outputs=[
azure_openai_key_input,
azure_deployment_id_input,
azure_api_base_input,
azure_api_version_input,
openai_api_key_input,
cohere_api_key_input,
groq_api_key_input
]
)
submit_button.click(
fn=capture_verbose_output,
inputs=[
topic_input,
model_choice,
azure_openai_key_input,
azure_deployment_id_input,
azure_api_base_input,
azure_api_version_input,
openai_api_key_input,
cohere_api_key_input,
groq_api_key_input
],
outputs=[output, verbose_output]
)
export_button.click(
fn=export_to_markdown,
inputs=output,
outputs=file_output
)
demo.queue(api_open=False, max_size=3).launch()
if __name__ == "__main__":
main()