"""
Perplexity AI Integration Examples

Demonstrates how to use AI-powered source discovery effectively for dataset creation.
"""
|
|
|
import json
import os
import time
from collections import Counter
from datetime import datetime
from typing import List, Dict
|
|
|
|
|
# Optional dependency guard: the examples below check PERPLEXITY_AVAILABLE
# before touching any name imported here.
try:
    from perplexity_client import PerplexityClient, SearchType, SourceResult
    PERPLEXITY_AVAILABLE = True
except ImportError:
    # NOTE(review): the leading glyph is a mis-decoded emoji; restore it from
    # the original UTF-8 file.
    print("β οΈ Perplexity client not available. Make sure perplexity_client.py is in the same directory.")
    PERPLEXITY_AVAILABLE = False
|
|
|
def example_sentiment_analysis_sources():
    """Discover candidate sources for a sentiment-analysis dataset.

    Issues one ``discover_sources`` query per sample project (e-commerce
    reviews, entertainment reviews, social media, news opinion), prints the
    first three hits of each query, and finishes with a summary of the
    source-type distribution and the five most frequent domains.

    Returns:
        list: All sources returned by the queries that succeeded, in query
        order. Empty when the client module could not be imported or
        ``client._validate_api_key()`` returns a falsy value.
    """
    # NOTE(review): the leading glyphs in the printed messages are
    # mis-decoded emoji; restore them from the original UTF-8 file.
    print("π Example: Sentiment Analysis Source Discovery")
    print("=" * 60)

    if not PERPLEXITY_AVAILABLE:
        print("β Perplexity client not available")
        return []

    client = PerplexityClient()

    # NOTE(review): this reaches into a private client method; prefer a
    # public key check if the client offers one.
    if not client._validate_api_key():
        print("β Please set PERPLEXITY_API_KEY environment variable")
        return []

    projects = [
        {
            "description": "Product reviews from e-commerce sites for sentiment classification of customer feedback",
            "search_type": SearchType.GENERAL,
            "focus": "E-commerce reviews",
        },
        {
            "description": "Movie and entertainment reviews for sentiment analysis training with detailed ratings",
            "search_type": SearchType.GENERAL,
            "focus": "Entertainment reviews",
        },
        {
            "description": "Social media posts and comments about brands for real-time sentiment monitoring",
            "search_type": SearchType.SOCIAL,
            "focus": "Social media sentiment",
        },
        {
            "description": "News articles with opinion content for political sentiment analysis research",
            "search_type": SearchType.NEWS,
            "focus": "News opinion analysis",
        },
    ]

    all_results = []

    for i, project in enumerate(projects, 1):
        print(f"\nπ Project {i}: {project['focus']}")
        print("-" * 40)

        try:
            results = client.discover_sources(
                project_description=project["description"],
                search_type=project["search_type"],
                max_sources=8,
                include_academic=False,
                include_news=True,
            )

            print(f"✅ Found {len(results.sources)} sources in {results.search_time:.1f}s")

            # Show only the first three hits to keep the demo output short.
            for j, source in enumerate(results.sources[:3], 1):
                print(f" {j}. {source.title}")
                print(f" URL: {source.url}")
                print(f" Type: {source.source_type} | Score: {source.relevance_score:.1f}/10")
                print(f" Description: {source.description[:100]}...")
                print()

            all_results.extend(results.sources)

            if results.suggestions:
                print(f"π‘ Suggestions: {', '.join(results.suggestions[:3])}")

        except Exception as e:
            # Best-effort demo: report the failure and continue with the
            # next project instead of aborting the whole run.
            print(f"β Error: {e}")

        # Pause between requests (presumably to respect API rate limits).
        time.sleep(1)

    print("\nπ SUMMARY")
    print("-" * 40)
    print(f"Total sources discovered: {len(all_results)}")

    source_types = Counter(source.source_type for source in all_results)

    print("Source type distribution:")
    for stype, count in sorted(source_types.items()):
        print(f" {stype}: {count} sources")

    domains = Counter(source.domain for source in all_results)

    print("\nTop domains:")
    # most_common() orders by count, ties in first-seen order.
    for domain, count in domains.most_common(5):
        print(f" {domain}: {count} sources")

    return all_results
|
|
|
def example_text_classification_sources():
    """Discover categorized sources for a multi-class text classifier.

    Runs a single news-oriented ``discover_sources`` query, groups the
    results into coarse categories by keywords found in each source's
    domain, prints up to two sources per category, and writes the client's
    JSON export to ``text_classification_sources_<unix time>.json`` in the
    current working directory.

    Returns:
        list: ``results.sources`` from the query, or an empty list when the
        client module is unavailable or any step raises.
    """
    # NOTE(review): the leading glyphs in the printed messages are
    # mis-decoded emoji; restore them from the original UTF-8 file.
    print("\nπ Example: Text Classification Source Discovery")
    print("=" * 60)

    if not PERPLEXITY_AVAILABLE:
        print("β Perplexity client not available")
        return []

    client = PerplexityClient()

    project_description = """
    Find diverse news articles and content with clear topical categories for training
    a multi-class text classifier. Need sources covering politics, technology, sports,
    business, entertainment, health, and science topics with consistent categorization.
    """

    # Evaluated in order; the first category with a keyword contained in the
    # domain wins, so "news" takes precedence over the later categories.
    category_keywords = (
        ("news", ("reuters", "bbc", "cnn", "news")),
        ("academic", ("arxiv", "pubmed", "scholar", "edu")),
        ("business", ("bloomberg", "forbes", "business", "financial")),
        ("technology", ("techcrunch", "wired", "tech", "digital")),
    )

    try:
        results = client.discover_sources(
            project_description=project_description,
            search_type=SearchType.NEWS,
            max_sources=15,
            include_academic=True,
            include_news=True,
        )

        print(f"✅ Found {len(results.sources)} sources")

        categorized = {name: [] for name, _ in category_keywords}
        categorized["other"] = []

        for source in results.sources:
            domain = source.domain.lower()
            bucket = next(
                (
                    name
                    for name, keywords in category_keywords
                    if any(keyword in domain for keyword in keywords)
                ),
                "other",
            )
            categorized[bucket].append(source)

        print("\nπ Sources by Category:")
        for category, sources in categorized.items():
            if sources:
                print(f"\n{category.upper()} ({len(sources)} sources):")
                for source in sources[:2]:
                    print(f" β’ {source.title}")
                    print(f" {source.url}")
                    print(f" Score: {source.relevance_score:.1f}/10")

        export_data = client.export_sources(results, "json")

        # Timestamped name so repeated runs do not overwrite earlier exports.
        filename = f"text_classification_sources_{int(time.time())}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(export_data)

        print(f"\nπ Sources exported to: {filename}")

        return results.sources

    except Exception as e:
        # Best-effort demo: report the failure and hand back an empty result.
        print(f"β Error: {e}")
        return []
|
|
|
def example_academic_research_sources():
    """Discover high-scoring academic sources for research datasets.

    Runs one academic ``discover_sources`` query per sample research topic,
    keeps the sources whose ``relevance_score`` is at least 7.0, prints up
    to three of them per topic, and finishes with a domain tally and score
    statistics over everything that was kept.

    Returns:
        list: The retained sources from all topics that succeeded, in query
        order. Empty when the client module is unavailable.
    """
    # NOTE(review): the leading glyphs in the printed messages are
    # mis-decoded emoji; restore them from the original UTF-8 file.
    print("\nπ Example: Academic Research Source Discovery")
    print("=" * 60)

    if not PERPLEXITY_AVAILABLE:
        print("β Perplexity client not available")
        return []

    client = PerplexityClient()

    research_topics = [
        {
            "description": "Recent machine learning research papers on transformer architectures and attention mechanisms for NLP survey dataset",
            "domain_focus": "AI/ML research",
        },
        {
            "description": "Climate change research papers and reports for environmental science text summarization training",
            "domain_focus": "Climate science",
        },
        {
            "description": "Medical research papers on drug discovery and pharmaceutical research for biomedical NER training",
            "domain_focus": "Medical research",
        },
    ]

    all_academic_sources = []

    for topic in research_topics:
        print(f"\n㪠Research Topic: {topic['domain_focus']}")
        print("-" * 40)

        try:
            results = client.discover_sources(
                project_description=topic["description"],
                search_type=SearchType.ACADEMIC,
                max_sources=10,
                include_academic=True,
                include_news=False,
            )

            print(f"✅ Found {len(results.sources)} academic sources")

            high_quality = [s for s in results.sources if s.relevance_score >= 7.0]

            print(f"π High-quality sources (score β₯ 7.0): {len(high_quality)}")

            for source in high_quality[:3]:
                print(f"\n π {source.title}")
                print(f" URL: {source.url}")
                print(f" Domain: {source.domain}")
                print(f" Score: {source.relevance_score:.1f}/10")
                print(f" Type: {source.source_type}")

            all_academic_sources.extend(high_quality)

        except Exception as e:
            # Best-effort demo: report the failure and continue with the
            # next topic.
            print(f"β Error: {e}")

        # Pause between requests (presumably to respect API rate limits).
        time.sleep(1)

    print("\nπ ACADEMIC SOURCES ANALYSIS")
    print("-" * 40)
    print(f"Total high-quality academic sources: {len(all_academic_sources)}")

    academic_domains = Counter(source.domain for source in all_academic_sources)

    print("\nTop academic domains:")
    # most_common() orders by count, ties in first-seen order.
    for domain, count in academic_domains.most_common(5):
        print(f" {domain}: {count} papers")

    scores = [s.relevance_score for s in all_academic_sources]
    if scores:  # Skip the statistics when nothing was kept (avoids 0-division).
        avg_score = sum(scores) / len(scores)
        print(f"\nAverage quality score: {avg_score:.1f}/10")
        print(f"Score range: {min(scores):.1f} - {max(scores):.1f}")

    return all_academic_sources
|
|
|
def example_custom_search_strategies():
    """Demonstrate three targeted search strategies.

    1. Domain-restricted search via ``client.get_domain_sources``.
    2. Keyword-driven search via ``client.search_with_keywords``.
    3. A general ``discover_sources`` query whose results are grouped by
       content format (FAQ, tutorial, documentation, interview, other)
       using keywords found in each source's title and, for some formats,
       its URL.

    Each strategy runs in its own ``try`` block, so a failure in one is
    reported and does not prevent the others from running. Results are only
    printed; the function returns ``None``.
    """
    # NOTE(review): the leading glyphs in the printed messages are
    # mis-decoded emoji; restore them from the original UTF-8 file.
    print("\nπ― Example: Custom Search Strategies")
    print("=" * 60)

    if not PERPLEXITY_AVAILABLE:
        print("β Perplexity client not available")
        return

    client = PerplexityClient()

    print("\nπ Strategy 1: Domain-specific Financial Content")
    print("-" * 50)

    try:
        financial_results = client.get_domain_sources(
            domain="bloomberg.com",
            topic="quarterly earnings reports and financial analysis",
            max_sources=5,
        )

        print(f"✅ Found {len(financial_results.sources)} financial sources")
        for source in financial_results.sources[:2]:
            print(f" β’ {source.title}")
            print(f" Score: {source.relevance_score:.1f}/10")

    except Exception as e:
        print(f"β Error: {e}")

    print("\nπ Strategy 2: Keyword-based Technical Content")
    print("-" * 50)

    try:
        tech_keywords = ["API documentation", "software tutorials", "programming guides", "technical specifications"]
        tech_results = client.search_with_keywords(
            keywords=tech_keywords,
            search_type=SearchType.TECHNICAL,
        )

        print(f"✅ Found {len(tech_results.sources)} technical sources")
        for source in tech_results.sources[:2]:
            print(f" β’ {source.title}")
            print(f" Type: {source.source_type}")

    except Exception as e:
        print(f"β Error: {e}")

    print("\nπ Strategy 3: Multi-format Content Discovery")
    print("-" * 50)

    multiformat_description = """
    Find diverse content formats including FAQ pages, interview transcripts,
    tutorial content, and documentation for question-answering dataset creation.
    Need sources with clear question-answer patterns and structured information.
    """

    # (format name, keywords, also match against the URL?) -- evaluated in
    # order; the first matching rule decides the bucket.
    format_rules = (
        ("faq", ("faq", "questions", "help"), True),
        ("tutorial", ("tutorial", "guide", "how to"), False),
        ("documentation", ("docs", "documentation", "manual"), True),
        ("interview", ("interview", "q&a", "conversation"), False),
    )

    try:
        qa_results = client.discover_sources(
            project_description=multiformat_description,
            search_type=SearchType.GENERAL,
            max_sources=12,
        )

        print(f"✅ Found {len(qa_results.sources)} Q&A sources")

        formats = {name: [] for name, _, _ in format_rules}
        formats["other"] = []

        for source in qa_results.sources:
            title_lower = source.title.lower()
            url_lower = source.url.lower()

            bucket = "other"
            for name, keywords, match_url in format_rules:
                if any(
                    keyword in title_lower or (match_url and keyword in url_lower)
                    for keyword in keywords
                ):
                    bucket = name
                    break
            formats[bucket].append(source)

        for format_type, sources in formats.items():
            if sources:
                print(f"\n {format_type.upper()}: {len(sources)} sources")
                best = max(sources, key=lambda x: x.relevance_score)
                print(f" Best: {best.title} (Score: {best.relevance_score:.1f})")

    except Exception as e:
        print(f"β Error: {e}")
|
|
|
def example_quality_assessment():
    """Score, filter and validate discovered sources.

    Runs one general ``discover_sources`` query, prints how the results are
    distributed over four relevance-score tiers, passes the sources scoring
    at least 7.0 to ``client.validate_sources``, prints up to three of the
    validated ones, and, when any were validated, writes them to
    ``validated_sources_<unix time>.json`` in the current working directory.

    Returns:
        list: The value returned by ``client.validate_sources``, or an
        empty list when the client module is unavailable or any step
        raises.
    """
    # NOTE(review): the leading glyphs in the printed messages are
    # mis-decoded emoji; restore them from the original UTF-8 file.
    print("\n✅ Example: Source Quality Assessment")
    print("=" * 60)

    if not PERPLEXITY_AVAILABLE:
        print("β Perplexity client not available")
        return []

    client = PerplexityClient()

    description = "Content for machine learning training including text classification and sentiment analysis"

    # Single source of truth for both the filter and the exported metadata.
    quality_threshold = 7.0

    try:
        results = client.discover_sources(
            project_description=description,
            search_type=SearchType.GENERAL,
            max_sources=20,
        )

        print(f"✅ Found {len(results.sources)} total sources")

        print("\nπ QUALITY DISTRIBUTION")
        print("-" * 40)

        quality_tiers = {
            "excellent": [s for s in results.sources if s.relevance_score >= 8.0],
            "good": [s for s in results.sources if 6.0 <= s.relevance_score < 8.0],
            "acceptable": [s for s in results.sources if 4.0 <= s.relevance_score < 6.0],
            "poor": [s for s in results.sources if s.relevance_score < 4.0],
        }

        for tier, sources in quality_tiers.items():
            print(f"{tier.upper()}: {len(sources)} sources")
            if sources:
                avg_score = sum(s.relevance_score for s in sources) / len(sources)
                print(f" Average score: {avg_score:.1f}")
                print(f" Example: {sources[0].title[:50]}...")

        print("\nπ VALIDATING TOP SOURCES")
        print("-" * 40)

        top_sources = [s for s in results.sources if s.relevance_score >= quality_threshold]
        validated_sources = client.validate_sources(top_sources)

        print(f"Sources passed validation: {len(validated_sources)}/{len(top_sources)}")

        for source in validated_sources[:3]:
            print(f"\n✅ VALIDATED: {source.title}")
            print(f" URL: {source.url}")
            print(f" Domain: {source.domain}")
            print(f" Type: {source.source_type}")
            print(f" Score: {source.relevance_score:.1f}/10")
            print(f" Description: {source.description[:100]}...")

        if validated_sources:
            export_data = {
                "search_query": description,
                "total_found": len(results.sources),
                "validated_count": len(validated_sources),
                "quality_threshold": quality_threshold,
                "sources": [
                    {
                        "url": s.url,
                        "title": s.title,
                        "domain": s.domain,
                        "type": s.source_type,
                        "score": s.relevance_score,
                        "description": s.description,
                    }
                    for s in validated_sources
                ],
            }

            # Timestamped name so repeated runs do not overwrite earlier exports.
            filename = f"validated_sources_{int(time.time())}.json"
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(export_data, f, indent=2)

            print(f"\nπ Validated sources exported to: {filename}")

        return validated_sources

    except Exception as e:
        # Best-effort demo: report the failure and hand back an empty result.
        print(f"β Error: {e}")
        return []
|
|
|
def example_batch_processing():
    """Run several discovery queries as one batch and export the results.

    Executes each configured search in turn, tags every returned source
    with ``batch_name`` and ``batch_index`` attributes, prints timing and
    per-batch score statistics, and writes everything to
    ``batch_results_<unix time>.json`` in the current working directory.

    Returns:
        list: Every source returned by the searches that succeeded, in
        batch order. Empty when the client module is unavailable.
    """
    # NOTE(review): the leading glyphs in the printed messages are
    # mis-decoded emoji; restore them from the original UTF-8 file.
    print("\nβ‘ Example: Batch Processing for Large Projects")
    print("=" * 60)

    if not PERPLEXITY_AVAILABLE:
        print("β Perplexity client not available")
        return []

    client = PerplexityClient()

    batch_searches = [
        {
            "name": "E-commerce Reviews",
            "description": "Product reviews from online stores for sentiment analysis",
            "search_type": SearchType.GENERAL,
            "max_sources": 8,
        },
        {
            "name": "Social Media Content",
            "description": "Social media posts and comments for sentiment classification",
            "search_type": SearchType.SOCIAL,
            "max_sources": 8,
        },
        {
            "name": "News Opinion",
            "description": "News articles with editorial content for opinion mining",
            "search_type": SearchType.NEWS,
            "max_sources": 8,
        },
        {
            "name": "Forum Discussions",
            "description": "Forum posts and community discussions for sentiment analysis",
            "search_type": SearchType.GENERAL,
            "max_sources": 6,
        },
    ]

    all_batch_results = []
    total_start_time = time.time()

    print(f"π Processing {len(batch_searches)} batch searches...")

    for i, search in enumerate(batch_searches, 1):
        print(f"\nπ Batch {i}/{len(batch_searches)}: {search['name']}")
        print("-" * 40)

        search_start = time.time()

        try:
            results = client.discover_sources(
                project_description=search["description"],
                search_type=search["search_type"],
                max_sources=search["max_sources"],
            )

            search_time = time.time() - search_start

            print(f"✅ Found {len(results.sources)} sources in {search_time:.1f}s")

            # Tag each source with its batch so the statistics and the export
            # below can group by it.
            for source in results.sources:
                source.batch_name = search["name"]
                source.batch_index = i

            all_batch_results.extend(results.sources)

            if results.sources:
                best = max(results.sources, key=lambda x: x.relevance_score)
                print(f" Top result: {best.title} (Score: {best.relevance_score:.1f})")

        except Exception as e:
            # Best-effort demo: report the failure and continue with the
            # next batch.
            print(f"β Batch {i} failed: {e}")

        # Pause between requests (presumably to respect API rate limits).
        time.sleep(1.5)

    total_time = time.time() - total_start_time

    print("\nπ BATCH PROCESSING RESULTS")
    print("-" * 40)
    print(f"Total sources discovered: {len(all_batch_results)}")
    print(f"Total processing time: {total_time:.1f} seconds")
    print(f"Average per batch: {total_time/len(batch_searches):.1f} seconds")

    batch_stats = {}
    for source in all_batch_results:
        stats = batch_stats.setdefault(
            getattr(source, 'batch_name', 'unknown'),
            {'count': 0, 'avg_score': 0, 'scores': []},
        )
        stats['count'] += 1
        stats['scores'].append(source.relevance_score)

    # An entry only exists once a score was appended, so the division is safe.
    for stats in batch_stats.values():
        stats['avg_score'] = sum(stats['scores']) / len(stats['scores'])

    print("\nBatch quality comparison:")
    for batch_name, stats in sorted(batch_stats.items(), key=lambda x: x[1]['avg_score'], reverse=True):
        print(f" {batch_name}: {stats['count']} sources, avg score {stats['avg_score']:.1f}")

    batch_export = {
        "project_name": "Large Scale Sentiment Analysis Dataset",
        "batch_processing_date": datetime.now().isoformat(),
        "total_sources": len(all_batch_results),
        "processing_time_seconds": total_time,
        "batches": len(batch_searches),
        "batch_statistics": batch_stats,
        "sources": [
            {
                "url": s.url,
                "title": s.title,
                "domain": s.domain,
                "type": s.source_type,
                "score": s.relevance_score,
                "batch": getattr(s, 'batch_name', 'unknown'),
                "description": s.description,
            }
            for s in all_batch_results
        ],
    }

    # Timestamped name so repeated runs do not overwrite earlier exports.
    filename = f"batch_results_{int(time.time())}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(batch_export, f, indent=2)

    print(f"\nπ Batch results exported to: {filename}")
    print(f"π‘ Use these {len(all_batch_results)} sources to create a comprehensive sentiment analysis dataset!")

    return all_batch_results
|
|
|
def main():
    """Run every example in sequence and print an overall summary.

    Exits early with instructions when the client module could not be
    imported or the ``PERPLEXITY_API_KEY`` environment variable is unset.
    Otherwise runs the six examples with a two-second pause between them
    and prints per-example source counts plus the number of distinct source
    URLs across the sentiment, classification and academic examples.

    Any exception escaping an example is reported together with its
    traceback instead of propagating.
    """
    # NOTE(review): the leading glyphs in the printed messages are
    # mis-decoded emoji; restore them from the original UTF-8 file.
    print("π Perplexity AI Integration - Complete Examples")
    print("=" * 70)
    print("These examples show how to use AI-powered source discovery")
    print("to create high-quality datasets efficiently.\n")

    if not PERPLEXITY_AVAILABLE:
        print("β Cannot run examples - perplexity_client.py not found")
        print("Please ensure the perplexity_client.py file is in the same directory.")
        return

    if not os.getenv('PERPLEXITY_API_KEY'):
        print("β Cannot run examples - PERPLEXITY_API_KEY not set")
        print("Please set your Perplexity API key as an environment variable:")
        print("export PERPLEXITY_API_KEY='your_api_key_here'")
        return

    print("✅ Perplexity AI client available and configured")
    print("π― Running comprehensive examples...\n")

    try:
        sentiment_sources = example_sentiment_analysis_sources()
        time.sleep(2)

        classification_sources = example_text_classification_sources()
        time.sleep(2)

        academic_sources = example_academic_research_sources()
        time.sleep(2)

        example_custom_search_strategies()
        time.sleep(2)

        validated_sources = example_quality_assessment()
        time.sleep(2)

        batch_sources = example_batch_processing()

        print("\nπ EXAMPLES COMPLETE!")
        print("=" * 70)
        print("Summary of discovered sources:")

        # The same source can be returned by more than one example, so the
        # "unique" total is counted by URL rather than by summing lengths.
        unique_urls = set()
        for label, sources in (
            ("π Sentiment Analysis", sentiment_sources),
            ("π Text Classification", classification_sources),
            ("π Academic Research", academic_sources),
        ):
            if sources:
                unique_urls.update(s.url for s in sources)
                print(f" {label}: {len(sources)} sources")

        if validated_sources:
            print(f" ✅ Validated High-Quality: {len(validated_sources)} sources")

        if batch_sources:
            print(f" β‘ Batch Processing: {len(batch_sources)} sources")

        print(f"\nπ― Total unique sources discovered: {len(unique_urls)}")
        print("π Check the generated JSON files for detailed source information")
        print("\nπ‘ Next steps:")
        print(" 1. Review the exported source files")
        print(" 2. Select the best sources for your specific use case")
        print(" 3. Use these sources in your AI Dataset Studio")
        print(" 4. Create amazing datasets with AI-powered discovery!")

    except Exception as e:
        # Top-level boundary of the script: report and show the traceback.
        print(f"β Error running examples: {e}")
        import traceback
        traceback.print_exc()
|
|
|
# Run the examples only when executed as a script, not when imported.
if __name__ == "__main__":
    main()