Spaces:

MagicMeWizard
/

AI_Powered_Web_Scraper

Running

App Files Files Community

MagicMeWizard committed on Jul 1

Commit

c3b493b

verified ·

1 Parent(s): dbedabb

Create examples.py

Browse files

Files changed (1) hide show

examples.py +731 -0

examples.py ADDED Viewed

	@@ -0,0 +1,731 @@

+"""
+📚 Perplexity AI Integration Examples
+Demonstrate how to effectively use AI-powered source discovery for dataset creation
+"""
+import os
+import json
+import time
+from typing import List, Dict
+from datetime import datetime
+# Import our Perplexity client
+try:
+    from perplexity_client import PerplexityClient, SearchType, SourceResult
+    PERPLEXITY_AVAILABLE = True
+except ImportError:
+    print("⚠️ Perplexity client not available. Make sure perplexity_client.py is in the same directory.")
+    PERPLEXITY_AVAILABLE = False
def example_sentiment_analysis_sources() -> list:
    """
    📊 Example: Find sources for sentiment analysis dataset

    Issues one ``discover_sources`` query per sample project (e-commerce
    reviews, entertainment reviews, social media posts, news opinion),
    prints the first three hits of each query, and finally prints a summary
    of the source-type distribution and the five most frequent domains.
    A query that raises is reported and skipped; the remaining queries
    still run.

    Returns:
        list: All sources from the queries that succeeded, in query order.
        An empty list is returned when the Perplexity client is unavailable
        or the API key check fails, so callers always receive a list.
    """
    print("📊 Example: Sentiment Analysis Source Discovery")
    print("=" * 60)
    if not PERPLEXITY_AVAILABLE:
        print("❌ Perplexity client not available")
        # Every exit path returns a list so callers can len()/iterate safely.
        return []
    client = PerplexityClient()
    # NOTE(review): this calls a private client method; presumably it checks
    # that PERPLEXITY_API_KEY is configured — confirm against perplexity_client.
    if not client._validate_api_key():
        print("❌ Please set PERPLEXITY_API_KEY environment variable")
        return []
    # Different types of sentiment analysis projects
    projects = [
        {
            "description": "Product reviews from e-commerce sites for sentiment classification of customer feedback",
            "search_type": SearchType.GENERAL,
            "focus": "E-commerce reviews"
        },
        {
            "description": "Movie and entertainment reviews for sentiment analysis training with detailed ratings",
            "search_type": SearchType.GENERAL,
            "focus": "Entertainment reviews"
        },
        {
            "description": "Social media posts and comments about brands for real-time sentiment monitoring",
            "search_type": SearchType.SOCIAL,
            "focus": "Social media sentiment"
        },
        {
            "description": "News articles with opinion content for political sentiment analysis research",
            "search_type": SearchType.NEWS,
            "focus": "News opinion analysis"
        }
    ]
    all_results = []
    for i, project in enumerate(projects, 1):
        print(f"\n🔍 Project {i}: {project['focus']}")
        print("-" * 40)
        try:
            results = client.discover_sources(
                project_description=project["description"],
                search_type=project["search_type"],
                max_sources=8,
                include_academic=False,  # Focus on practical sources
                include_news=True
            )
            print(f"✅ Found {len(results.sources)} sources in {results.search_time:.1f}s")
            # Show top 3 sources
            for j, source in enumerate(results.sources[:3], 1):
                print(f"  {j}. {source.title}")
                print(f"     URL: {source.url}")
                print(f"     Type: {source.source_type} | Score: {source.relevance_score:.1f}/10")
                print(f"     Description: {source.description[:100]}...")
                print()
            all_results.extend(results.sources)
            if results.suggestions:
                print(f"💡 Suggestions: {', '.join(results.suggestions[:3])}")
        except Exception as e:
            # Best-effort demo: report the failure and move on to the next project.
            print(f"❌ Error: {e}")
        # Respectful delay between requests
        time.sleep(1)
    # Summary
    print("\n📊 SUMMARY")
    print("-" * 40)
    print(f"Total sources discovered: {len(all_results)}")
    # Analyze source types
    source_types = {}
    for source in all_results:
        source_types[source.source_type] = source_types.get(source.source_type, 0) + 1
    print("Source type distribution:")
    for stype, count in sorted(source_types.items()):
        print(f"  {stype}: {count} sources")
    # Top domains (five most frequent; ties keep first-seen order)
    domains = {}
    for source in all_results:
        domains[source.domain] = domains.get(source.domain, 0) + 1
    print("\nTop domains:")
    for domain, count in sorted(domains.items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"  {domain}: {count} sources")
    return all_results
def example_text_classification_sources():
    """
    📂 Example: Find sources for text classification dataset

    Runs a single news-oriented ``discover_sources`` query, groups the hits
    into news / academic / business / technology / other by substring
    matching on each source's domain, prints up to two hits per non-empty
    group, and writes the client's JSON export to
    ``text_classification_sources_<unix-time>.json`` in the current directory.

    Returns:
        The ``sources`` collection of the discovery result on success, or an
        empty list when the client is unavailable or any step raises.
    """
    print("\n📂 Example: Text Classification Source Discovery")
    print("=" * 60)
    if not PERPLEXITY_AVAILABLE:
        print("❌ Perplexity client not available")
        # Same "no results" value as the error path below.
        return []
    client = PerplexityClient()
    # Multi-domain classification project
    project_description = """
    Find diverse news articles and content with clear topical categories for training
    a multi-class text classifier. Need sources covering politics, technology, sports,
    business, entertainment, health, and science topics with consistent categorization.
    """
    try:
        results = client.discover_sources(
            project_description=project_description,
            search_type=SearchType.NEWS,
            max_sources=15,
            include_academic=True,  # Include academic sources for science topics
            include_news=True
        )
        print(f"✅ Found {len(results.sources)} sources")
        # Categorize sources by likely content type
        categorized = {
            "news": [],
            "academic": [],
            "business": [],
            "technology": [],
            "other": []
        }
        # First matching group wins; the checks are plain substring tests on
        # the lower-cased domain, so they are a heuristic, not exact matching.
        for source in results.sources:
            domain = source.domain.lower()
            if any(news in domain for news in ['reuters', 'bbc', 'cnn', 'news']):
                categorized["news"].append(source)
            elif any(academic in domain for academic in ['arxiv', 'pubmed', 'scholar', 'edu']):
                categorized["academic"].append(source)
            elif any(biz in domain for biz in ['bloomberg', 'forbes', 'business', 'financial']):
                categorized["business"].append(source)
            elif any(tech in domain for tech in ['techcrunch', 'wired', 'tech', 'digital']):
                categorized["technology"].append(source)
            else:
                categorized["other"].append(source)
        print("\n📋 Sources by Category:")
        for category, sources in categorized.items():
            if sources:
                print(f"\n{category.upper()} ({len(sources)} sources):")
                for source in sources[:2]:  # Show top 2 per category
                    print(f"  • {source.title}")
                    print(f"    {source.url}")
                    print(f"    Score: {source.relevance_score:.1f}/10")
        # Export for use
        export_data = client.export_sources(results, "json")
        # Save to file; the timestamp keeps repeated runs from overwriting each other
        filename = f"text_classification_sources_{int(time.time())}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(export_data)
        print(f"\n📄 Sources exported to: {filename}")
        return results.sources
    except Exception as e:
        # Best-effort demo: report the failure and hand back an empty result.
        print(f"❌ Error: {e}")
        return []
def example_academic_research_sources() -> list:
    """
    🎓 Example: Find academic sources for research dataset

    Runs one academic ``discover_sources`` query per research topic, keeps
    only hits whose ``relevance_score`` is at least 7.0, prints up to three
    of them per topic, and finally prints per-domain counts plus the
    average and range of the kept scores. A topic whose query raises is
    reported and skipped.

    Returns:
        list: The kept (score ≥ 7.0) sources across all topics, in query
        order; an empty list when the Perplexity client is unavailable.
    """
    print("\n🎓 Example: Academic Research Source Discovery")
    print("=" * 60)
    if not PERPLEXITY_AVAILABLE:
        print("❌ Perplexity client not available")
        # Every exit path returns a list so callers can len()/iterate safely.
        return []
    client = PerplexityClient()
    # Research-focused projects
    research_topics = [
        {
            "description": "Recent machine learning research papers on transformer architectures and attention mechanisms for NLP survey dataset",
            "domain_focus": "AI/ML research"
        },
        {
            "description": "Climate change research papers and reports for environmental science text summarization training",
            "domain_focus": "Climate science"
        },
        {
            "description": "Medical research papers on drug discovery and pharmaceutical research for biomedical NER training",
            "domain_focus": "Medical research"
        }
    ]
    all_academic_sources = []
    for topic in research_topics:
        print(f"\n🔬 Research Topic: {topic['domain_focus']}")
        print("-" * 40)
        try:
            results = client.discover_sources(
                project_description=topic["description"],
                search_type=SearchType.ACADEMIC,
                max_sources=10,
                include_academic=True,
                include_news=False  # Focus on academic sources only
            )
            print(f"✅ Found {len(results.sources)} academic sources")
            # Filter for high-quality academic sources
            high_quality = [s for s in results.sources if s.relevance_score >= 7.0]
            print(f"📚 High-quality sources (score ≥ 7.0): {len(high_quality)}")
            for source in high_quality[:3]:
                print(f"\n  📄 {source.title}")
                print(f"      URL: {source.url}")
                print(f"      Domain: {source.domain}")
                print(f"      Score: {source.relevance_score:.1f}/10")
                print(f"      Type: {source.source_type}")
            all_academic_sources.extend(high_quality)
        except Exception as e:
            # Best-effort demo: report the failure and continue with the next topic.
            print(f"❌ Error: {e}")
        time.sleep(1)  # Respectful delay
    # Analysis
    print("\n📊 ACADEMIC SOURCES ANALYSIS")
    print("-" * 40)
    print(f"Total high-quality academic sources: {len(all_academic_sources)}")
    # Domain analysis
    academic_domains = {}
    for source in all_academic_sources:
        domain = source.domain
        academic_domains[domain] = academic_domains.get(domain, 0) + 1
    print("\nTop academic domains:")
    for domain, count in sorted(academic_domains.items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"  {domain}: {count} papers")
    # Quality distribution (skipped when nothing was kept, avoiding division by zero)
    scores = [s.relevance_score for s in all_academic_sources]
    if scores:
        avg_score = sum(scores) / len(scores)
        print(f"\nAverage quality score: {avg_score:.1f}/10")
        print(f"Score range: {min(scores):.1f} - {max(scores):.1f}")
    return all_academic_sources
def example_custom_search_strategies():
    """
    🎯 Example: Custom search strategies for specific needs

    Demonstrates three independent strategies, each wrapped in its own
    ``try`` so a failure in one does not prevent the others from running:

    1. ``get_domain_sources`` restricted to bloomberg.com.
    2. ``search_with_keywords`` with a list of technical keywords.
    3. ``discover_sources`` for Q&A-style content, whose hits are then
       bucketed by format (faq / tutorial / documentation / interview /
       other) using keyword checks on the title and, for some buckets,
       the URL.

    Only prints results; returns ``None``.
    """
    print("\n🎯 Example: Custom Search Strategies")
    print("=" * 60)
    if not PERPLEXITY_AVAILABLE:
        print("❌ Perplexity client not available")
        return
    client = PerplexityClient()
    # Strategy 1: Domain-specific search
    print("\n🔍 Strategy 1: Domain-specific Financial Content")
    print("-" * 50)
    try:
        financial_results = client.get_domain_sources(
            domain="bloomberg.com",
            topic="quarterly earnings reports and financial analysis",
            max_sources=5
        )
        print(f"✅ Found {len(financial_results.sources)} financial sources")
        for source in financial_results.sources[:2]:
            print(f"  • {source.title}")
            print(f"    Score: {source.relevance_score:.1f}/10")
    except Exception as e:
        print(f"❌ Error: {e}")
    # Strategy 2: Keyword-based search
    print("\n🔍 Strategy 2: Keyword-based Technical Content")
    print("-" * 50)
    try:
        tech_keywords = ["API documentation", "software tutorials", "programming guides", "technical specifications"]
        tech_results = client.search_with_keywords(
            keywords=tech_keywords,
            search_type=SearchType.TECHNICAL
        )
        print(f"✅ Found {len(tech_results.sources)} technical sources")
        for source in tech_results.sources[:2]:
            print(f"  • {source.title}")
            print(f"    Type: {source.source_type}")
    except Exception as e:
        print(f"❌ Error: {e}")
    # Strategy 3: Multi-format search
    print("\n🔍 Strategy 3: Multi-format Content Discovery")
    print("-" * 50)
    multiformat_description = """
    Find diverse content formats including FAQ pages, interview transcripts,
    tutorial content, and documentation for question-answering dataset creation.
    Need sources with clear question-answer patterns and structured information.
    """
    try:
        qa_results = client.discover_sources(
            project_description=multiformat_description,
            search_type=SearchType.GENERAL,
            max_sources=12
        )
        print(f"✅ Found {len(qa_results.sources)} Q&A sources")
        # Categorize by content format
        formats = {
            "faq": [],
            "tutorial": [],
            "documentation": [],
            "interview": [],
            "other": []
        }
        # First matching bucket wins. FAQ and documentation look at both the
        # title and the URL; tutorial and interview look at the title only.
        for source in qa_results.sources:
            title_lower = source.title.lower()
            url_lower = source.url.lower()
            if any(kw in title_lower or kw in url_lower for kw in ['faq', 'questions', 'help']):
                formats["faq"].append(source)
            elif any(kw in title_lower for kw in ['tutorial', 'guide', 'how to']):
                formats["tutorial"].append(source)
            elif any(kw in title_lower or kw in url_lower for kw in ['docs', 'documentation', 'manual']):
                formats["documentation"].append(source)
            elif any(kw in title_lower for kw in ['interview', 'q&a', 'conversation']):
                formats["interview"].append(source)
            else:
                formats["other"].append(source)
        for format_type, sources in formats.items():
            if sources:
                print(f"\n  {format_type.upper()}: {len(sources)} sources")
                # Highest-scoring source in this bucket (first one on ties).
                best = max(sources, key=lambda x: x.relevance_score)
                print(f"    Best: {best.title} (Score: {best.relevance_score:.1f})")
    except Exception as e:
        print(f"❌ Error: {e}")
def example_quality_assessment():
    """
    ✅ Example: Quality assessment and source validation

    Runs one broad ``discover_sources`` query, prints how many hits fall in
    each relevance-score tier (excellent ≥ 8, good 6–8, acceptable 4–6,
    poor < 4), passes the hits scoring at least 7.0 to
    ``client.validate_sources``, prints up to three validated hits, and —
    when any were validated — writes them to
    ``validated_sources_<unix-time>.json`` in the current directory.

    Returns:
        Whatever ``client.validate_sources`` returned on success, or an
        empty list when the client is unavailable or any step raises.
    """
    print("\n✅ Example: Source Quality Assessment")
    print("=" * 60)
    if not PERPLEXITY_AVAILABLE:
        print("❌ Perplexity client not available")
        # Same "no results" value as the error path below.
        return []
    client = PerplexityClient()
    # Broad search to get diverse quality levels
    description = "Content for machine learning training including text classification and sentiment analysis"
    try:
        results = client.discover_sources(
            project_description=description,
            search_type=SearchType.GENERAL,
            max_sources=20
        )
        print(f"✅ Found {len(results.sources)} total sources")
        # Quality analysis
        print("\n📊 QUALITY DISTRIBUTION")
        print("-" * 40)
        # Tiers are half-open score ranges, so each source lands in exactly one.
        quality_tiers = {
            "excellent": [s for s in results.sources if s.relevance_score >= 8.0],
            "good": [s for s in results.sources if 6.0 <= s.relevance_score < 8.0],
            "acceptable": [s for s in results.sources if 4.0 <= s.relevance_score < 6.0],
            "poor": [s for s in results.sources if s.relevance_score < 4.0]
        }
        for tier, sources in quality_tiers.items():
            print(f"{tier.upper()}: {len(sources)} sources")
            if sources:
                avg_score = sum(s.relevance_score for s in sources) / len(sources)
                print(f"  Average score: {avg_score:.1f}")
                print(f"  Example: {sources[0].title[:50]}...")
        # Validate top sources
        print("\n🔍 VALIDATING TOP SOURCES")
        print("-" * 40)
        top_sources = [s for s in results.sources if s.relevance_score >= 7.0]
        validated_sources = client.validate_sources(top_sources)
        print(f"Sources passed validation: {len(validated_sources)}/{len(top_sources)}")
        # Show validation results
        for source in validated_sources[:3]:
            print(f"\n✅ VALIDATED: {source.title}")
            print(f"   URL: {source.url}")
            print(f"   Domain: {source.domain}")
            print(f"   Type: {source.source_type}")
            print(f"   Score: {source.relevance_score:.1f}/10")
            print(f"   Description: {source.description[:100]}...")
        # Export validated sources (no file is written when none passed)
        if validated_sources:
            export_data = {
                "search_query": description,
                "total_found": len(results.sources),
                "validated_count": len(validated_sources),
                "quality_threshold": 7.0,
                "sources": [
                    {
                        "url": s.url,
                        "title": s.title,
                        "domain": s.domain,
                        "type": s.source_type,
                        "score": s.relevance_score,
                        "description": s.description
                    }
                    for s in validated_sources
                ]
            }
            filename = f"validated_sources_{int(time.time())}.json"
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(export_data, f, indent=2)
            print(f"\n📄 Validated sources exported to: {filename}")
        return validated_sources
    except Exception as e:
        # Best-effort demo: report the failure and hand back an empty result.
        print(f"❌ Error: {e}")
        return []
def example_batch_processing() -> list:
    """
    ⚡ Example: Batch processing for large dataset projects

    Runs four related ``discover_sources`` queries in sequence, tags every
    returned source with ``batch_name`` / ``batch_index`` attributes,
    prints per-batch and overall timing, compares the average relevance
    score of each batch, and writes everything to
    ``batch_results_<unix-time>.json`` in the current directory. A batch
    whose query (or tagging) raises is reported and skipped.

    Returns:
        list: All sources from the batches that succeeded, in batch order;
        an empty list when the Perplexity client is unavailable.
    """
    print("\n⚡ Example: Batch Processing for Large Projects")
    print("=" * 60)
    if not PERPLEXITY_AVAILABLE:
        print("❌ Perplexity client not available")
        # Every exit path returns a list so callers can len()/iterate safely.
        return []
    client = PerplexityClient()
    # Define multiple related searches for comprehensive coverage
    batch_searches = [
        {
            "name": "E-commerce Reviews",
            "description": "Product reviews from online stores for sentiment analysis",
            "search_type": SearchType.GENERAL,
            "max_sources": 8
        },
        {
            "name": "Social Media Content",
            "description": "Social media posts and comments for sentiment classification",
            "search_type": SearchType.SOCIAL,
            "max_sources": 8
        },
        {
            "name": "News Opinion",
            "description": "News articles with editorial content for opinion mining",
            "search_type": SearchType.NEWS,
            "max_sources": 8
        },
        {
            "name": "Forum Discussions",
            "description": "Forum posts and community discussions for sentiment analysis",
            "search_type": SearchType.GENERAL,
            "max_sources": 6
        }
    ]
    all_batch_results = []
    total_start_time = time.time()
    print(f"🚀 Processing {len(batch_searches)} batch searches...")
    for i, search in enumerate(batch_searches, 1):
        print(f"\n📍 Batch {i}/{len(batch_searches)}: {search['name']}")
        print("-" * 40)
        search_start = time.time()
        try:
            results = client.discover_sources(
                project_description=search["description"],
                search_type=search["search_type"],
                max_sources=search["max_sources"]
            )
            search_time = time.time() - search_start
            print(f"✅ Found {len(results.sources)} sources in {search_time:.1f}s")
            # Add batch metadata
            # NOTE(review): assumes source objects accept new attributes;
            # if they do not, the error is caught below and the batch is dropped.
            for source in results.sources:
                source.batch_name = search["name"]
                source.batch_index = i
            all_batch_results.extend(results.sources)
            # Show top result
            if results.sources:
                best = max(results.sources, key=lambda x: x.relevance_score)
                print(f"   Top result: {best.title} (Score: {best.relevance_score:.1f})")
        except Exception as e:
            print(f"❌ Batch {i} failed: {e}")
        # Rate limiting between batches
        time.sleep(1.5)
    # Includes the rate-limiting sleeps, not just the query time.
    total_time = time.time() - total_start_time
    # Batch results analysis
    print("\n📊 BATCH PROCESSING RESULTS")
    print("-" * 40)
    print(f"Total sources discovered: {len(all_batch_results)}")
    print(f"Total processing time: {total_time:.1f} seconds")
    print(f"Average per batch: {total_time/len(batch_searches):.1f} seconds")
    # Quality distribution across batches
    batch_stats = {}
    for source in all_batch_results:
        batch_name = getattr(source, 'batch_name', 'unknown')
        if batch_name not in batch_stats:
            batch_stats[batch_name] = {
                'count': 0,
                'avg_score': 0,
                'scores': []
            }
        batch_stats[batch_name]['count'] += 1
        batch_stats[batch_name]['scores'].append(source.relevance_score)
    # Calculate averages
    for batch_name, stats in batch_stats.items():
        if stats['scores']:
            stats['avg_score'] = sum(stats['scores']) / len(stats['scores'])
    print("\nBatch quality comparison:")
    for batch_name, stats in sorted(batch_stats.items(), key=lambda x: x[1]['avg_score'], reverse=True):
        print(f"  {batch_name}: {stats['count']} sources, avg score {stats['avg_score']:.1f}")
    # Export comprehensive results (written even when no sources were found)
    batch_export = {
        "project_name": "Large Scale Sentiment Analysis Dataset",
        "batch_processing_date": datetime.now().isoformat(),
        "total_sources": len(all_batch_results),
        "processing_time_seconds": total_time,
        "batches": len(batch_searches),
        "batch_statistics": batch_stats,
        "sources": [
            {
                "url": s.url,
                "title": s.title,
                "domain": s.domain,
                "type": s.source_type,
                "score": s.relevance_score,
                "batch": getattr(s, 'batch_name', 'unknown'),
                "description": s.description
            }
            for s in all_batch_results
        ]
    }
    filename = f"batch_results_{int(time.time())}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(batch_export, f, indent=2)
    print(f"\n📄 Batch results exported to: {filename}")
    print(f"💡 Use these {len(all_batch_results)} sources to create a comprehensive sentiment analysis dataset!")
    return all_batch_results
def main():
    """
    🚀 Run all Perplexity AI examples

    Checks that the Perplexity client was importable and that the
    ``PERPLEXITY_API_KEY`` environment variable is set, then runs the six
    example functions in order with a two-second pause between them and
    prints a per-example source count followed by the number of distinct
    source URLs across all reported result sets.

    Any exception raised while running the examples is printed together
    with its traceback; nothing is re-raised. Returns ``None``.
    """
    print("🚀 Perplexity AI Integration - Complete Examples")
    print("=" * 70)
    print("These examples show how to use AI-powered source discovery")
    print("to create high-quality datasets efficiently.\n")
    if not PERPLEXITY_AVAILABLE:
        print("❌ Cannot run examples - perplexity_client.py not found")
        print("Please ensure the perplexity_client.py file is in the same directory.")
        return
    if not os.getenv('PERPLEXITY_API_KEY'):
        print("❌ Cannot run examples - PERPLEXITY_API_KEY not set")
        print("Please set your Perplexity API key as an environment variable:")
        print("export PERPLEXITY_API_KEY='your_api_key_here'")
        return
    print("✅ Perplexity AI client available and configured")
    print("🎯 Running comprehensive examples...\n")
    try:
        # Run all examples
        sentiment_sources = example_sentiment_analysis_sources()
        time.sleep(2)  # Respectful delay
        classification_sources = example_text_classification_sources()
        time.sleep(2)
        academic_sources = example_academic_research_sources()
        time.sleep(2)
        example_custom_search_strategies()
        time.sleep(2)
        validated_sources = example_quality_assessment()
        time.sleep(2)
        batch_sources = example_batch_processing()
        # Final summary
        print("\n🎉 EXAMPLES COMPLETE!")
        print("=" * 70)
        print("Summary of discovered sources:")
        summary_rows = [
            ("📊 Sentiment Analysis", sentiment_sources),
            ("📂 Text Classification", classification_sources),
            ("🎓 Academic Research", academic_sources),
            ("✅ Validated High-Quality", validated_sources),
            ("⚡ Batch Processing", batch_sources),
        ]
        # The same URL can be returned by several examples, so the total is
        # the number of distinct URLs rather than the sum of the list lengths.
        unique_urls = set()
        for label, sources in summary_rows:
            if sources:  # skips both None and empty results
                print(f"  {label}: {len(sources)} sources")
                unique_urls.update(source.url for source in sources)
        print(f"\n🎯 Total unique sources discovered: {len(unique_urls)}")
        print("📄 Check the generated JSON files for detailed source information")
        print("\n💡 Next steps:")
        print("  1. Review the exported source files")
        print("  2. Select the best sources for your specific use case")
        print("  3. Use these sources in your AI Dataset Studio")
        print("  4. Create amazing datasets with AI-powered discovery!")
    except Exception as e:
        # Top-level boundary of the demo script: show the full traceback
        # instead of crashing without context.
        print(f"❌ Error running examples: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()