from langgraph.graph import END, StateGraph, START
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
import re
import asyncio
from typing import TypedDict, List, Optional, Dict
from src.tools.deep_crawler import DeepWebCrawler, ResourceCollectionAgent
class ResearchGraphState(TypedDict):
company: str
industry: str
research_results: Optional[dict]
use_cases: Optional[str]
search_queries: Optional[Dict[str, List[str]]]
resources: Optional[List[dict]]
final_report: Optional[str]
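# A minimal initial state needs only the two input keys; the graph nodes fill
# in the rest as they run, e.g. (hypothetical values):
#   {"company": "Acme Corp", "industry": "Logistics"}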
def clean_text(text):
    """
    Cleans the given text by:
    1. Removing all hyperlinks.
    2. Removing all parenthesis and square-bracket characters.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text with hyperlinks and bracket characters removed.
    """
    # Remove hyperlinks (http(s):// and www. URLs).
    url_pattern = r'https?://\S+|www\.\S+'
    text_without_links = re.sub(url_pattern, '', text)
    # Remove parenthesis and square-bracket characters.
    brackets_pattern = r'[\[\]\(\)]'
    cleaned_text = re.sub(brackets_pattern, '', text_without_links)
    return cleaned_text.strip()
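# For instance:
#   clean_text("See [docs](https://example.com) (v2)")  ->  "See docs v2"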
def create_industry_research_workflow(llm):
async def industry_research(state: ResearchGraphState):
"""Research industry and company using DeepWebCrawler."""
company = state['company']
industry = state['industry']
queries = [
f"{company} company profile services",
]
crawler = DeepWebCrawler(
max_search_results=3,
max_external_links=1,
word_count_threshold=100,
content_filter_type='bm25',
filter_threshold=0.48
)
all_results = []
for query in queries:
results = await crawler.search_and_crawl(query)
all_results.extend(results)
print(all_results)
combined_content = "\n\n".join([
f"Title: {clean_text(r['title'])} \n{clean_text(r['content'])}"
for r in all_results if r['success']
])
print("Combined Content: ", combined_content)
prompt = PromptTemplate.from_template(
"""Analyze this research about {company} in the {industry} industry:
{content}
Provide a comprehensive overview including:
1. Company Overview
2. Market Segments
3. Products and Services
4. Strategic Focus Areas
5. Industry Trends
6. Competitive Position
Format the analysis in clear sections with headers."""
)
chain = prompt | llm | StrOutputParser()
        # Use the async invocation, since this node runs on the event loop.
        analysis = await chain.ainvoke({
"company": company,
"industry": industry,
"content": combined_content
})
print("Analysis: ", analysis)
return {
"research_results": {
"analysis": analysis,
"raw_content": combined_content
}
}
def generate_use_cases_and_queries(state: ResearchGraphState):
"""Generate AI/ML use cases and extract relevant search queries."""
research_data = state['research_results']
company = state['company']
industry = state['industry']
# First generate use cases
use_case_prompt = PromptTemplate.from_template(
"""Based on this research:
Analysis: {analysis}
Raw Research: {raw_content}
Generate innovative use cases where {company} in the {industry} industry can leverage
Generative AI and Large Language Models for:
1. Internal Process Improvements
2. Customer Experience Enhancement
3. Product/Service Innovation
4. Data Analytics and Decision Making
For each use case, provide:
- Clear description
- Expected benefits
- Implementation considerations"""
)
chain = use_case_prompt | llm | StrOutputParser()
use_cases = chain.invoke({
"company": company,
"industry": industry,
"analysis": research_data['analysis'],
"raw_content": research_data['raw_content']
})
# Then extract relevant search queries
query_extraction_prompt = PromptTemplate.from_template(
"""Based on these AI/ML use cases for {company}:
{use_cases}
            Extract two specific search queries for finding relevant datasets and implementations.
Provide your response in this exact format:
DATASET QUERIES:
- query1
- query2
IMPLEMENTATION QUERIES:
- query1
- query2
Make queries specific and technical. Include ML model types, data types, and specific AI techniques."""
)
chain = query_extraction_prompt | llm | StrOutputParser()
queries_text = chain.invoke({
"company": company,
"use_cases": use_cases
})
# Parse the text response into structured format
def parse_queries(text):
dataset_queries = []
implementation_queries = []
current_section = None
for line in text.split('\n'):
line = line.strip()
if line == "DATASET QUERIES:":
current_section = "dataset"
elif line == "IMPLEMENTATION QUERIES:":
current_section = "implementation"
elif line.startswith("- "):
query = line[2:].strip()
if current_section == "dataset":
dataset_queries.append(query)
elif current_section == "implementation":
implementation_queries.append(query)
return {
"dataset_queries": dataset_queries or ["machine learning datasets business", "AI training data industry"],
"implementation_queries": implementation_queries or ["AI tools business automation", "machine learning implementation"]
}
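        # Illustrative parse (hypothetical model output):
        #   "DATASET QUERIES:\n- demand forecasting time series dataset\n..."
        # yields {"dataset_queries": [...], "implementation_queries": [...]};
        # the generic defaults above cover the case where a section is missing.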
search_queries = parse_queries(queries_text)
print("Search_queries: ", search_queries)
return {
"use_cases": use_cases,
"search_queries": search_queries
}
async def collect_targeted_resources(state: ResearchGraphState):
"""Find relevant datasets and resources using extracted queries."""
search_queries = state['search_queries']
resource_agent = ResourceCollectionAgent(max_results_per_query=5)
# Collect resources using targeted queries
all_resources = {
"datasets": [],
"implementations": []
}
        # Search for datasets. Note: collect_resources() takes no arguments in
        # this implementation, so the intended platform scoping (e.g.
        # "site:kaggle.com/datasets {query}") is never applied; each call
        # returns the agent's default resource pool, tagged with the query.
        for query in search_queries['dataset_queries']:
            resources = await resource_agent.collect_resources()
# Process and categorize results
if resources.get("kaggle_datasets"):
all_resources["datasets"].extend([{
"title": item["title"],
"url": item["url"],
"description": item["snippet"],
"platform": "Kaggle",
"query": query
} for item in resources["kaggle_datasets"]])
if resources.get("huggingface_datasets"):
all_resources["datasets"].extend([{
"title": item["title"],
"url": item["url"],
"description": item["snippet"],
"platform": "HuggingFace",
"query": query
} for item in resources["huggingface_datasets"]])
        # Search for implementations (same caveat: the "site:github.com"
        # scoping is not passed through to collect_resources()).
        for query in search_queries['implementation_queries']:
            resources = await resource_agent.collect_resources()
if resources.get("github_repositories"):
all_resources["implementations"].extend([{
"title": item["title"],
"url": item["url"],
"description": item["snippet"],
"platform": "GitHub",
"query": query
} for item in resources["github_repositories"]])
print("Resources: ", all_resources)
return {"resources": all_resources}
def generate_pdf_report(state: ResearchGraphState):
"""Generate final PDF report with all collected information."""
research_data = state['research_results']
use_cases = state['use_cases']
resources = state['resources']
company = state['company']
industry = state['industry']
        # Pre-format resource links; appended to the report body after generation.
datasets_section = "\n## Available Datasets\n"
if resources.get('datasets'):
for dataset in resources['datasets']:
datasets_section += f" - {dataset['platform']}: {dataset['url']}\n"
implementations_section = "\n## Implementation Resources\n"
if resources.get('implementations'):
for impl in resources['implementations']:
implementations_section += f" - {impl['platform']}: {impl['url']}\n"
prompt = PromptTemplate.from_template(
"""
# GenAI & ML Implementation Proposal for {company}
## Executive Summary
- **Current Position in the {industry} Industry**:
- **Key Opportunities for AI/ML Implementation**:
- **Expected Business Impact and ROI**:
- **Implementation Timeline Overview**:
## Industry and Company Analysis
{analysis}
## Strategic AI/ML Implementation Opportunities
Based on the analysis, here are the key opportunities for AI/ML implementation:
{use_cases}
Format the report in Markdown for clear sections, headings, and bullet points. Ensure professional formatting with structured subsections.
"""
)
chain = prompt | llm | StrOutputParser()
markdown_content = chain.invoke({
"company": company,
"industry": industry,
"analysis": research_data['analysis'],
"use_cases": use_cases,
})
        # Strip a wrapping ```markdown fence if the model added one.
        if markdown_content.startswith("```markdown") and markdown_content.endswith("```"):
            markdown_content = markdown_content[len("```markdown"):].rstrip("`").strip()
markdown_content += "\n\n" + datasets_section + "\n\n" + implementations_section
# Convert markdown to PDF
import tempfile
import os
import markdown2
from xhtml2pdf import pisa
# Create temporary directory and full path for PDF
temp_dir = tempfile.mkdtemp()
pdf_filename = f"{company.replace(' ', '_')}_research_report.pdf"
pdf_path = os.path.join(temp_dir, pdf_filename)
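        # markdown2 extras: 'tables' enables pipe-style tables and
        # 'break-on-newline' renders single newlines as <br>.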
html_content = markdown2.markdown(markdown_content, extras=['tables', 'break-on-newline'])
        # HTML template with page, heading, and footer styles for xhtml2pdf
html_template = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
@page {{
size: A4;
margin: 2.5cm;
@frame footer {{
-pdf-frame-content: footerContent;
bottom: 1cm;
margin-left: 1cm;
margin-right: 1cm;
height: 1cm;
}}
}}
body {{
font-family: Helvetica, Arial, sans-serif;
font-size: 11pt;
line-height: 1.6;
color: #2c3e50;
}}
h1 {{
font-size: 24pt;
color: #1a237e;
text-align: center;
margin-bottom: 2cm;
font-weight: bold;
}}
h2 {{
font-size: 18pt;
color: #283593;
margin-top: 1.5cm;
border-bottom: 2px solid #3949ab;
padding-bottom: 0.3cm;
}}
h3 {{
font-size: 14pt;
color: #3949ab;
margin-top: 1cm;
}}
h4 {{
font-size: 12pt;
color: #5c6bc0;
margin-top: 0.8cm;
}}
p {{
text-align: justify;
margin-bottom: 0.5cm;
}}
ul {{
margin-left: 0;
padding-left: 1cm;
margin-bottom: 0.5cm;
}}
li {{
margin-bottom: 0.3cm;
}}
a {{
color: #3f51b5;
text-decoration: none;
}}
strong {{
color: #283593;
}}
.use-case {{
background-color: #f5f7fa;
padding: 1cm;
margin: 0.5cm 0;
border-left: 4px solid #3949ab;
}}
.benefit {{
margin-left: 1cm;
color: #34495e;
}}
</style>
</head>
<body>
{html_content}
<div id="footerContent" style="text-align: center; font-size: 8pt; color: #7f8c8d;">
Page <pdf:pagenumber> of <pdf:pagecount>
</div>
</body>
</html>
"""
# Convert HTML to PDF with proper error handling
try:
with open(pdf_path, "w+b") as pdf_file:
result = pisa.CreatePDF(
html_template,
dest=pdf_file
)
if result.err:
print(f"Error generating PDF: {result.err}")
return {"final_report": None}
# Verify the file exists
if os.path.exists(pdf_path):
print(f"PDF successfully generated at: {pdf_path}")
return {"final_report": pdf_path}
else:
print("PDF file was not created successfully")
return {"final_report": None}
except Exception as e:
print(f"Exception during PDF generation: {str(e)}")
return {"final_report": None}
# Create workflow
workflow = StateGraph(ResearchGraphState)
# Add nodes
workflow.add_node("industry_research", industry_research)
workflow.add_node("use_cases_gen", generate_use_cases_and_queries)
workflow.add_node("resources_gen", collect_targeted_resources)
workflow.add_node("report", generate_pdf_report)
# Define edges
workflow.add_edge(START, "industry_research")
workflow.add_edge("industry_research", "use_cases_gen")
workflow.add_edge("use_cases_gen", "resources_gen")
workflow.add_edge("resources_gen", "report")
workflow.add_edge("report", END)
return workflow.compile()
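# The compiled graph is a straight pipeline:
# START -> industry_research -> use_cases_gen -> resources_gen -> report -> END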
async def run_industry_research(company: str, industry: str, llm):
    """Run the industry research workflow and return the generated report path."""
    workflow = create_industry_research_workflow(llm)
    # recursion_limit caps LangGraph supersteps; 5 is enough for this
    # four-node linear pipeline.
    output = await workflow.ainvoke(
        input={
            "company": company,
            "industry": industry
        },
        config={"recursion_limit": 5}
    )
    return output['final_report']
# Example usage
if __name__ == "__main__":
async def main():
# Initialize LLM
        llm = ChatOpenAI(
            model="gpt-3.5-turbo-0125",
            temperature=0.3,
            timeout=None,
            max_retries=2,
        )
# Run the workflow
report_path = await run_industry_research(
company="Adani Defence & Aerospace",
industry="Defense Engineering and Construction",
llm=llm
)
print(f"Report generated at: {report_path}")
asyncio.run(main())