MagicMeWizard committed on
Commit c3b493b · verified · 1 Parent(s): dbedabb

Create examples.py

Files changed (1)
  1. examples.py +731 -0
examples.py ADDED
@@ -0,0 +1,731 @@
+"""
+📚 Perplexity AI Integration Examples
+Demonstrate how to effectively use AI-powered source discovery for dataset creation
+"""
+
+import os
+import json
+import time
+from typing import List, Dict
+from datetime import datetime
+
+# Import our Perplexity client
+try:
+    from perplexity_client import PerplexityClient, SearchType, SourceResult
+    PERPLEXITY_AVAILABLE = True
+except ImportError:
+    print("⚠️ Perplexity client not available. Make sure perplexity_client.py is in the same directory.")
+    PERPLEXITY_AVAILABLE = False
+
+def example_sentiment_analysis_sources():
+    """
+    📊 Example: Find sources for sentiment analysis dataset
+
+    This example shows how to discover diverse sources for sentiment analysis,
+    including product reviews, social media, and news content.
+    """
+    print("📊 Example: Sentiment Analysis Source Discovery")
+    print("=" * 60)
+
+    if not PERPLEXITY_AVAILABLE:
+        print("❌ Perplexity client not available")
+        return
+
+    client = PerplexityClient()
+
+    if not client._validate_api_key():
+        print("❌ Please set PERPLEXITY_API_KEY environment variable")
+        return
+
+    # Different types of sentiment analysis projects
+    projects = [
+        {
+            "description": "Product reviews from e-commerce sites for sentiment classification of customer feedback",
+            "search_type": SearchType.GENERAL,
+            "focus": "E-commerce reviews"
+        },
+        {
+            "description": "Movie and entertainment reviews for sentiment analysis training with detailed ratings",
+            "search_type": SearchType.GENERAL,
+            "focus": "Entertainment reviews"
+        },
+        {
+            "description": "Social media posts and comments about brands for real-time sentiment monitoring",
+            "search_type": SearchType.SOCIAL,
+            "focus": "Social media sentiment"
+        },
+        {
+            "description": "News articles with opinion content for political sentiment analysis research",
+            "search_type": SearchType.NEWS,
+            "focus": "News opinion analysis"
+        }
+    ]
+
+    all_results = []
+
+    for i, project in enumerate(projects, 1):
+        print(f"\n🔍 Project {i}: {project['focus']}")
+        print("-" * 40)
+
+        try:
+            results = client.discover_sources(
+                project_description=project["description"],
+                search_type=project["search_type"],
+                max_sources=8,
+                include_academic=False,  # Focus on practical sources
+                include_news=True
+            )
+
+            print(f"✅ Found {len(results.sources)} sources in {results.search_time:.1f}s")
+
+            # Show top 3 sources
+            for j, source in enumerate(results.sources[:3], 1):
+                print(f" {j}. {source.title}")
+                print(f" URL: {source.url}")
+                print(f" Type: {source.source_type} | Score: {source.relevance_score:.1f}/10")
+                print(f" Description: {source.description[:100]}...")
+                print()
+
+            all_results.extend(results.sources)
+
+            if results.suggestions:
+                print(f"💡 Suggestions: {', '.join(results.suggestions[:3])}")
+
+        except Exception as e:
+            print(f"❌ Error: {e}")
+
+        # Respectful delay between requests
+        time.sleep(1)
+
+    # Summary
+    print(f"\n📊 SUMMARY")
+    print("-" * 40)
+    print(f"Total sources discovered: {len(all_results)}")
+
+    # Analyze source types
+    source_types = {}
+    for source in all_results:
+        source_types[source.source_type] = source_types.get(source.source_type, 0) + 1
+
+    print("Source type distribution:")
+    for stype, count in sorted(source_types.items()):
+        print(f" {stype}: {count} sources")
+
+    # Top domains
+    domains = {}
+    for source in all_results:
+        domains[source.domain] = domains.get(source.domain, 0) + 1
+
+    print("\nTop domains:")
+    for domain, count in sorted(domains.items(), key=lambda x: x[1], reverse=True)[:5]:
+        print(f" {domain}: {count} sources")
+
+    return all_results
+
+def example_text_classification_sources():
+    """
+    📂 Example: Find sources for text classification dataset
+
+    This example demonstrates finding well-categorized content for
+    multi-class text classification training.
+    """
+    print("\n📂 Example: Text Classification Source Discovery")
+    print("=" * 60)
+
+    if not PERPLEXITY_AVAILABLE:
+        print("❌ Perplexity client not available")
+        return
+
+    client = PerplexityClient()
+
+    # Multi-domain classification project
+    project_description = """
+    Find diverse news articles and content with clear topical categories for training
+    a multi-class text classifier. Need sources covering politics, technology, sports,
+    business, entertainment, health, and science topics with consistent categorization.
+    """
+
+    try:
+        results = client.discover_sources(
+            project_description=project_description,
+            search_type=SearchType.NEWS,
+            max_sources=15,
+            include_academic=True,  # Include academic sources for science topics
+            include_news=True
+        )
+
+        print(f"✅ Found {len(results.sources)} sources")
+
+        # Categorize sources by likely content type
+        categorized = {
+            "news": [],
+            "academic": [],
+            "business": [],
+            "technology": [],
+            "other": []
+        }
+
+        for source in results.sources:
+            domain = source.domain.lower()
+            if any(news in domain for news in ['reuters', 'bbc', 'cnn', 'news']):
+                categorized["news"].append(source)
+            elif any(academic in domain for academic in ['arxiv', 'pubmed', 'scholar', 'edu']):
+                categorized["academic"].append(source)
+            elif any(biz in domain for biz in ['bloomberg', 'forbes', 'business', 'financial']):
+                categorized["business"].append(source)
+            elif any(tech in domain for tech in ['techcrunch', 'wired', 'tech', 'digital']):
+                categorized["technology"].append(source)
+            else:
+                categorized["other"].append(source)
+
+        print("\n📋 Sources by Category:")
+        for category, sources in categorized.items():
+            if sources:
+                print(f"\n{category.upper()} ({len(sources)} sources):")
+                for source in sources[:2]:  # Show top 2 per category
+                    print(f" • {source.title}")
+                    print(f" {source.url}")
+                    print(f" Score: {source.relevance_score:.1f}/10")
+
+        # Export for use
+        export_data = client.export_sources(results, "json")
+
+        # Save to file
+        filename = f"text_classification_sources_{int(time.time())}.json"
+        with open(filename, 'w', encoding='utf-8') as f:
+            f.write(export_data)
+
+        print(f"\n📄 Sources exported to: {filename}")
+
+        return results.sources
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        return []
+
+def example_academic_research_sources():
+    """
+    🎓 Example: Find academic sources for research dataset
+
+    This example shows how to discover high-quality academic sources
+    for research-focused datasets.
+    """
+    print("\n🎓 Example: Academic Research Source Discovery")
+    print("=" * 60)
+
+    if not PERPLEXITY_AVAILABLE:
+        print("❌ Perplexity client not available")
+        return
+
+    client = PerplexityClient()
+
+    # Research-focused projects
+    research_topics = [
+        {
+            "description": "Recent machine learning research papers on transformer architectures and attention mechanisms for NLP survey dataset",
+            "domain_focus": "AI/ML research"
+        },
+        {
+            "description": "Climate change research papers and reports for environmental science text summarization training",
+            "domain_focus": "Climate science"
+        },
+        {
+            "description": "Medical research papers on drug discovery and pharmaceutical research for biomedical NER training",
+            "domain_focus": "Medical research"
+        }
+    ]
+
+    all_academic_sources = []
+
+    for topic in research_topics:
+        print(f"\n🔬 Research Topic: {topic['domain_focus']}")
+        print("-" * 40)
+
+        try:
+            results = client.discover_sources(
+                project_description=topic["description"],
+                search_type=SearchType.ACADEMIC,
+                max_sources=10,
+                include_academic=True,
+                include_news=False  # Focus on academic sources only
+            )
+
+            print(f"✅ Found {len(results.sources)} academic sources")
+
+            # Filter for high-quality academic sources
+            high_quality = [s for s in results.sources if s.relevance_score >= 7.0]
+
+            print(f"📚 High-quality sources (score ≥ 7.0): {len(high_quality)}")
+
+            for source in high_quality[:3]:
+                print(f"\n 📄 {source.title}")
+                print(f" URL: {source.url}")
+                print(f" Domain: {source.domain}")
+                print(f" Score: {source.relevance_score:.1f}/10")
+                print(f" Type: {source.source_type}")
+
+            all_academic_sources.extend(high_quality)
+
+        except Exception as e:
+            print(f"❌ Error: {e}")
+
+        time.sleep(1)  # Respectful delay
+
+    # Analysis
+    print(f"\n📊 ACADEMIC SOURCES ANALYSIS")
+    print("-" * 40)
+    print(f"Total high-quality academic sources: {len(all_academic_sources)}")
+
+    # Domain analysis
+    academic_domains = {}
+    for source in all_academic_sources:
+        domain = source.domain
+        academic_domains[domain] = academic_domains.get(domain, 0) + 1
+
+    print("\nTop academic domains:")
+    for domain, count in sorted(academic_domains.items(), key=lambda x: x[1], reverse=True)[:5]:
+        print(f" {domain}: {count} papers")
+
+    # Quality distribution
+    scores = [s.relevance_score for s in all_academic_sources]
+    if scores:
+        avg_score = sum(scores) / len(scores)
+        print(f"\nAverage quality score: {avg_score:.1f}/10")
+        print(f"Score range: {min(scores):.1f} - {max(scores):.1f}")
+
+    return all_academic_sources
+
+def example_custom_search_strategies():
+    """
+    🎯 Example: Custom search strategies for specific needs
+
+    This example demonstrates advanced techniques for finding
+    very specific types of content.
+    """
+    print("\n🎯 Example: Custom Search Strategies")
+    print("=" * 60)
+
+    if not PERPLEXITY_AVAILABLE:
+        print("❌ Perplexity client not available")
+        return
+
+    client = PerplexityClient()
+
+    # Strategy 1: Domain-specific search
+    print("\n🔍 Strategy 1: Domain-specific Financial Content")
+    print("-" * 50)
+
+    try:
+        financial_results = client.get_domain_sources(
+            domain="bloomberg.com",
+            topic="quarterly earnings reports and financial analysis",
+            max_sources=5
+        )
+
+        print(f"✅ Found {len(financial_results.sources)} financial sources")
+        for source in financial_results.sources[:2]:
+            print(f" • {source.title}")
+            print(f" Score: {source.relevance_score:.1f}/10")
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+
+    # Strategy 2: Keyword-based search
+    print("\n🔍 Strategy 2: Keyword-based Technical Content")
+    print("-" * 50)
+
+    try:
+        tech_keywords = ["API documentation", "software tutorials", "programming guides", "technical specifications"]
+        tech_results = client.search_with_keywords(
+            keywords=tech_keywords,
+            search_type=SearchType.TECHNICAL
+        )
+
+        print(f"✅ Found {len(tech_results.sources)} technical sources")
+        for source in tech_results.sources[:2]:
+            print(f" • {source.title}")
+            print(f" Type: {source.source_type}")
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+
+    # Strategy 3: Multi-format search
+    print("\n🔍 Strategy 3: Multi-format Content Discovery")
+    print("-" * 50)
+
+    multiformat_description = """
+    Find diverse content formats including FAQ pages, interview transcripts,
+    tutorial content, and documentation for question-answering dataset creation.
+    Need sources with clear question-answer patterns and structured information.
+    """
+
+    try:
+        qa_results = client.discover_sources(
+            project_description=multiformat_description,
+            search_type=SearchType.GENERAL,
+            max_sources=12
+        )
+
+        print(f"✅ Found {len(qa_results.sources)} Q&A sources")
+
+        # Categorize by content format
+        formats = {
+            "faq": [],
+            "tutorial": [],
+            "documentation": [],
+            "interview": [],
+            "other": []
+        }
+
+        for source in qa_results.sources:
+            title_lower = source.title.lower()
+            url_lower = source.url.lower()
+
+            if any(faq in title_lower or faq in url_lower for faq in ['faq', 'questions', 'help']):
+                formats["faq"].append(source)
+            elif any(tut in title_lower for tut in ['tutorial', 'guide', 'how to']):
+                formats["tutorial"].append(source)
+            elif any(doc in title_lower or doc in url_lower for doc in ['docs', 'documentation', 'manual']):
+                formats["documentation"].append(source)
+            elif any(kw in title_lower for kw in ['interview', 'q&a', 'conversation']):
+                formats["interview"].append(source)
+            else:
+                formats["other"].append(source)
+
+        for format_type, sources in formats.items():
+            if sources:
+                print(f"\n {format_type.upper()}: {len(sources)} sources")
+                best = max(sources, key=lambda x: x.relevance_score)
+                print(f" Best: {best.title} (Score: {best.relevance_score:.1f})")
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+
+def example_quality_assessment():
+    """
+    ✅ Example: Quality assessment and source validation
+
+    This example shows how to evaluate and filter sources
+    for maximum dataset quality.
+    """
+    print("\n✅ Example: Source Quality Assessment")
+    print("=" * 60)
+
+    if not PERPLEXITY_AVAILABLE:
+        print("❌ Perplexity client not available")
+        return
+
+    client = PerplexityClient()
+
+    # Broad search to get diverse quality levels
+    description = "Content for machine learning training including text classification and sentiment analysis"
+
+    try:
+        results = client.discover_sources(
+            project_description=description,
+            search_type=SearchType.GENERAL,
+            max_sources=20
+        )
+
+        print(f"✅ Found {len(results.sources)} total sources")
+
+        # Quality analysis
+        print(f"\n📊 QUALITY DISTRIBUTION")
+        print("-" * 40)
+
+        quality_tiers = {
+            "excellent": [s for s in results.sources if s.relevance_score >= 8.0],
+            "good": [s for s in results.sources if 6.0 <= s.relevance_score < 8.0],
+            "acceptable": [s for s in results.sources if 4.0 <= s.relevance_score < 6.0],
+            "poor": [s for s in results.sources if s.relevance_score < 4.0]
+        }
+
+        for tier, sources in quality_tiers.items():
+            print(f"{tier.upper()}: {len(sources)} sources")
+            if sources:
+                avg_score = sum(s.relevance_score for s in sources) / len(sources)
+                print(f" Average score: {avg_score:.1f}")
+                print(f" Example: {sources[0].title[:50]}...")
+
+        # Validate top sources
+        print(f"\n🔍 VALIDATING TOP SOURCES")
+        print("-" * 40)
+
+        top_sources = [s for s in results.sources if s.relevance_score >= 7.0]
+        validated_sources = client.validate_sources(top_sources)
+
+        print(f"Sources passed validation: {len(validated_sources)}/{len(top_sources)}")
+
+        # Show validation results
+        for source in validated_sources[:3]:
+            print(f"\n✅ VALIDATED: {source.title}")
+            print(f" URL: {source.url}")
+            print(f" Domain: {source.domain}")
+            print(f" Type: {source.source_type}")
+            print(f" Score: {source.relevance_score:.1f}/10")
+            print(f" Description: {source.description[:100]}...")
+
+        # Export validated sources
+        if validated_sources:
+            export_data = {
+                "search_query": description,
+                "total_found": len(results.sources),
+                "validated_count": len(validated_sources),
+                "quality_threshold": 7.0,
+                "sources": [
+                    {
+                        "url": s.url,
+                        "title": s.title,
+                        "domain": s.domain,
+                        "type": s.source_type,
+                        "score": s.relevance_score,
+                        "description": s.description
+                    }
+                    for s in validated_sources
+                ]
+            }
+
+            filename = f"validated_sources_{int(time.time())}.json"
+            with open(filename, 'w', encoding='utf-8') as f:
+                json.dump(export_data, f, indent=2)
+
+            print(f"\n📄 Validated sources exported to: {filename}")
+
+        return validated_sources
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        return []
+
+def example_batch_processing():
+    """
+    ⚡ Example: Batch processing for large dataset projects
+
+    This example demonstrates efficient batch discovery for
+    large-scale dataset creation projects.
+    """
+    print("\n⚡ Example: Batch Processing for Large Projects")
+    print("=" * 60)
+
+    if not PERPLEXITY_AVAILABLE:
+        print("❌ Perplexity client not available")
+        return
+
+    client = PerplexityClient()
+
+    # Define multiple related searches for comprehensive coverage
+    batch_searches = [
+        {
+            "name": "E-commerce Reviews",
+            "description": "Product reviews from online stores for sentiment analysis",
+            "search_type": SearchType.GENERAL,
+            "max_sources": 8
+        },
+        {
+            "name": "Social Media Content",
+            "description": "Social media posts and comments for sentiment classification",
+            "search_type": SearchType.SOCIAL,
+            "max_sources": 8
+        },
+        {
+            "name": "News Opinion",
+            "description": "News articles with editorial content for opinion mining",
+            "search_type": SearchType.NEWS,
+            "max_sources": 8
+        },
+        {
+            "name": "Forum Discussions",
+            "description": "Forum posts and community discussions for sentiment analysis",
+            "search_type": SearchType.GENERAL,
+            "max_sources": 6
+        }
+    ]
+
+    all_batch_results = []
+    total_start_time = time.time()
+
+    print(f"🚀 Processing {len(batch_searches)} batch searches...")
+
+    for i, search in enumerate(batch_searches, 1):
+        print(f"\n📝 Batch {i}/{len(batch_searches)}: {search['name']}")
+        print("-" * 40)
+
+        search_start = time.time()
+
+        try:
+            results = client.discover_sources(
+                project_description=search["description"],
+                search_type=search["search_type"],
+                max_sources=search["max_sources"]
+            )
+
+            search_time = time.time() - search_start
+
+            print(f"✅ Found {len(results.sources)} sources in {search_time:.1f}s")
+
+            # Add batch metadata
+            for source in results.sources:
+                source.batch_name = search["name"]
+                source.batch_index = i
+
+            all_batch_results.extend(results.sources)
+
+            # Show top result
+            if results.sources:
+                best = max(results.sources, key=lambda x: x.relevance_score)
+                print(f" Top result: {best.title} (Score: {best.relevance_score:.1f})")
+
+        except Exception as e:
+            print(f"❌ Batch {i} failed: {e}")
+
+        # Rate limiting between batches
+        time.sleep(1.5)
+
+    total_time = time.time() - total_start_time
+
+    # Batch results analysis
+    print(f"\n📊 BATCH PROCESSING RESULTS")
+    print("-" * 40)
+    print(f"Total sources discovered: {len(all_batch_results)}")
+    print(f"Total processing time: {total_time:.1f} seconds")
+    print(f"Average per batch: {total_time/len(batch_searches):.1f} seconds")
+
+    # Quality distribution across batches
+    batch_stats = {}
+    for source in all_batch_results:
+        batch_name = getattr(source, 'batch_name', 'unknown')
+        if batch_name not in batch_stats:
+            batch_stats[batch_name] = {
+                'count': 0,
+                'avg_score': 0,
+                'scores': []
+            }
+
+        batch_stats[batch_name]['count'] += 1
+        batch_stats[batch_name]['scores'].append(source.relevance_score)
+
+    # Calculate averages
+    for batch_name, stats in batch_stats.items():
+        if stats['scores']:
+            stats['avg_score'] = sum(stats['scores']) / len(stats['scores'])
+
+    print(f"\nBatch quality comparison:")
+    for batch_name, stats in sorted(batch_stats.items(), key=lambda x: x[1]['avg_score'], reverse=True):
+        print(f" {batch_name}: {stats['count']} sources, avg score {stats['avg_score']:.1f}")
+
+    # Export comprehensive results
+    batch_export = {
+        "project_name": "Large Scale Sentiment Analysis Dataset",
+        "batch_processing_date": datetime.now().isoformat(),
+        "total_sources": len(all_batch_results),
+        "processing_time_seconds": total_time,
+        "batches": len(batch_searches),
+        "batch_statistics": batch_stats,
+        "sources": [
+            {
+                "url": s.url,
+                "title": s.title,
+                "domain": s.domain,
+                "type": s.source_type,
+                "score": s.relevance_score,
+                "batch": getattr(s, 'batch_name', 'unknown'),
+                "description": s.description
+            }
+            for s in all_batch_results
+        ]
+    }
+
+    filename = f"batch_results_{int(time.time())}.json"
+    with open(filename, 'w', encoding='utf-8') as f:
+        json.dump(batch_export, f, indent=2)
+
+    print(f"\n📄 Batch results exported to: {filename}")
+    print(f"💡 Use these {len(all_batch_results)} sources to create a comprehensive sentiment analysis dataset!")
+
+    return all_batch_results
+
+def main():
+    """
+    🚀 Run all Perplexity AI examples
+
+    This function demonstrates the full range of capabilities
+    for AI-powered source discovery.
+    """
+    print("🚀 Perplexity AI Integration - Complete Examples")
+    print("=" * 70)
+    print("These examples show how to use AI-powered source discovery")
+    print("to create high-quality datasets efficiently.\n")
+
+    if not PERPLEXITY_AVAILABLE:
+        print("❌ Cannot run examples - perplexity_client.py not found")
+        print("Please ensure the perplexity_client.py file is in the same directory.")
+        return
+
+    if not os.getenv('PERPLEXITY_API_KEY'):
+        print("❌ Cannot run examples - PERPLEXITY_API_KEY not set")
+        print("Please set your Perplexity API key as an environment variable:")
+        print("export PERPLEXITY_API_KEY='your_api_key_here'")
+        return
+
+    print("✅ Perplexity AI client available and configured")
+    print("🎯 Running comprehensive examples...\n")
+
+    try:
+        # Run all examples
+        sentiment_sources = example_sentiment_analysis_sources()
+        time.sleep(2)  # Respectful delay
+
+        classification_sources = example_text_classification_sources()
+        time.sleep(2)
+
+        academic_sources = example_academic_research_sources()
+        time.sleep(2)
+
+        example_custom_search_strategies()
+        time.sleep(2)
+
+        validated_sources = example_quality_assessment()
+        time.sleep(2)
+
+        batch_sources = example_batch_processing()
+
+        # Final summary
+        print(f"\n🎉 EXAMPLES COMPLETE!")
+        print("=" * 70)
+        print("Summary of discovered sources:")
+
+        total_sources = 0
+        if sentiment_sources:
+            total_sources += len(sentiment_sources)
+            print(f" 📊 Sentiment Analysis: {len(sentiment_sources)} sources")
+
+        if classification_sources:
+            total_sources += len(classification_sources)
+            print(f" 📂 Text Classification: {len(classification_sources)} sources")
+
+        if academic_sources:
+            total_sources += len(academic_sources)
+            print(f" 🎓 Academic Research: {len(academic_sources)} sources")
+
+        if validated_sources:
+            print(f" ✅ Validated High-Quality: {len(validated_sources)} sources")
+
+        if batch_sources:
+            print(f" ⚡ Batch Processing: {len(batch_sources)} sources")
+
+        print(f"\n🎯 Total unique sources discovered: {total_sources}")
+        print("📄 Check the generated JSON files for detailed source information")
+        print("\n💡 Next steps:")
+        print(" 1. Review the exported source files")
+        print(" 2. Select the best sources for your specific use case")
+        print(" 3. Use these sources in your AI Dataset Studio")
+        print(" 4. Create amazing datasets with AI-powered discovery!")
+
+    except Exception as e:
+        print(f"❌ Error running examples: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    main()