MagicMeWizard committed on
Commit
6b9c591
·
verified ·
1 Parent(s): 1d9e7b0

Create perplexity_client.py

Files changed (1)
  1. perplexity_client.py +724 -0
perplexity_client.py ADDED
@@ -0,0 +1,724 @@
"""
🧠 Perplexity AI Integration for AI Dataset Studio
Automatically discovers relevant sources based on project descriptions
"""

import os
import requests
import json
import logging
import time
import re
from typing import List, Dict, Optional, Tuple
from urllib.parse import urlparse, urljoin
from dataclasses import dataclass
from enum import Enum

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SearchType(Enum):
    """Types of searches supported by Perplexity AI"""
    GENERAL = "general"
    ACADEMIC = "academic"
    NEWS = "news"
    SOCIAL = "social"
    TECHNICAL = "technical"

@dataclass
class SourceResult:
    """Structure for individual source results"""
    url: str
    title: str
    description: str
    relevance_score: float
    source_type: str
    domain: str
    publication_date: Optional[str] = None
    author: Optional[str] = None

@dataclass
class SearchResults:
    """Container for search results"""
    query: str
    sources: List[SourceResult]
    total_found: int
    search_time: float
    perplexity_response: str
    suggestions: List[str]

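# For orientation, a populated SourceResult might look like this (all values
# below are invented for illustration, not taken from a real search):
#
#     SourceResult(
#         url="https://example.org/annotated-reviews",
#         title="Annotated Review Corpus",
#         description="A page describing a labeled review corpus.",
#         relevance_score=7.5,
#         source_type="blog",
#         domain="example.org",
#     )
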
class PerplexityClient:
    """
    🧠 Perplexity AI Client for Smart Source Discovery

    Features:
    - Intelligent source discovery based on project descriptions
    - Multiple search strategies (academic, news, technical, etc.)
    - Quality filtering and relevance scoring
    - Rate limiting and error handling
    - Domain validation and safety checks
    """

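    # Typical usage, as a sketch (it assumes PERPLEXITY_API_KEY is set in the
    # environment and that the model requested in discover_sources() is
    # available on the account):
    #
    #     client = PerplexityClient()
    #     results = client.discover_sources(
    #         project_description="Collect product reviews for sentiment analysis",
    #         max_sources=10,
    #     )
    #     for source in client.validate_sources(results.sources):
    #         print(source.relevance_score, source.url)
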
    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize Perplexity AI client

        Args:
            api_key: Perplexity API key (if not provided, will try env var)
        """
        self.api_key = api_key or os.getenv('PERPLEXITY_API_KEY')
        self.base_url = "https://api.perplexity.ai"
        self.session = requests.Session()

        # Set up headers
        if self.api_key:
            self.session.headers.update({
                'Authorization': f'Bearer {self.api_key}',
                'Content-Type': 'application/json',
                'User-Agent': 'AI-Dataset-Studio/1.0'
            })

        # Rate limiting
        self.last_request_time = 0
        self.min_request_interval = 1.0  # Seconds between requests

        # Configuration
        self.max_retries = 3
        self.timeout = 30

        logger.info("🧠 Perplexity AI client initialized")

    def _validate_api_key(self) -> bool:
        """Validate that API key is available and working"""
        if not self.api_key:
            logger.error("❌ No Perplexity API key found. Set PERPLEXITY_API_KEY environment variable.")
            return False
        return True

    def _rate_limit(self):
        """Implement rate limiting to respect API limits"""
        current_time = time.time()
        time_since_last = current_time - self.last_request_time

        if time_since_last < self.min_request_interval:
            sleep_time = self.min_request_interval - time_since_last
            logger.debug(f"⏱️ Rate limiting: sleeping {sleep_time:.2f}s")
            time.sleep(sleep_time)

        self.last_request_time = time.time()

    def _make_request(self, payload: Dict) -> Optional[Dict]:
        """
        Make API request to Perplexity with error handling

        Args:
            payload: Request payload

        Returns:
            API response or None if failed
        """
        if not self._validate_api_key():
            return None

        self._rate_limit()

        for attempt in range(self.max_retries):
            try:
                logger.debug(f"📑 Making Perplexity API request (attempt {attempt + 1})")

                response = self.session.post(
                    f"{self.base_url}/chat/completions",
                    json=payload,
                    timeout=self.timeout
                )

                if response.status_code == 200:
                    logger.debug("✅ Perplexity API request successful")
                    return response.json()
                elif response.status_code == 429:
                    logger.warning("🚦 Rate limit hit, waiting longer...")
                    time.sleep(2 ** attempt)  # Exponential backoff
                    continue
                else:
                    logger.error(f"❌ API request failed: {response.status_code} - {response.text}")

            except requests.exceptions.Timeout:
                logger.warning(f"⏰ Request timeout (attempt {attempt + 1})")
            except requests.exceptions.RequestException as e:
                logger.error(f"🔌 Request error: {str(e)}")

            if attempt < self.max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff

        logger.error("❌ All retry attempts failed")
        return None

    def discover_sources(
        self,
        project_description: str,
        search_type: SearchType = SearchType.GENERAL,
        max_sources: int = 20,
        include_academic: bool = True,
        include_news: bool = True,
        domain_filter: Optional[List[str]] = None
    ) -> SearchResults:
        """
        🔍 Discover relevant sources based on project description

        Args:
            project_description: User's project description
            search_type: Type of search to perform
            max_sources: Maximum number of sources to return
            include_academic: Include academic sources
            include_news: Include news sources
            domain_filter: Optional list of domains to focus on

        Returns:
            SearchResults object with discovered sources
        """
        start_time = time.time()

        logger.info(f"🔍 Discovering sources for: {project_description[:100]}...")

        # Build search prompt
        search_prompt = self._build_search_prompt(
            project_description,
            search_type,
            max_sources,
            include_academic,
            include_news,
            domain_filter
        )

        # Prepare API payload
        payload = {
            "model": "llama-3.1-sonar-large-128k-online",
            "messages": [
                {
                    "role": "system",
                    "content": "You are an expert research assistant specializing in finding high-quality, relevant sources for AI/ML dataset creation. Always provide specific URLs, titles, and descriptions."
                },
                {
                    "role": "user",
                    "content": search_prompt
                }
            ],
            "max_tokens": 4000,
            "temperature": 0.3,
            "top_p": 0.9
        }

        # Make API request
        response = self._make_request(payload)

        if not response:
            logger.error("❌ Failed to get response from Perplexity API")
            return self._create_empty_results(project_description, time.time() - start_time)

        # Parse response and extract sources
        try:
            content = response['choices'][0]['message']['content']
            sources = self._parse_sources_from_response(content)
            suggestions = self._extract_suggestions(content)

            search_time = time.time() - start_time

            logger.info(f"✅ Found {len(sources)} sources in {search_time:.2f}s")

            return SearchResults(
                query=project_description,
                sources=sources[:max_sources],
                total_found=len(sources),
                search_time=search_time,
                perplexity_response=content,
                suggestions=suggestions
            )

        except Exception as e:
            logger.error(f"❌ Error parsing Perplexity response: {str(e)}")
            return self._create_empty_results(project_description, time.time() - start_time)

    def _build_search_prompt(
        self,
        project_description: str,
        search_type: SearchType,
        max_sources: int,
        include_academic: bool,
        include_news: bool,
        domain_filter: Optional[List[str]]
    ) -> str:
        """Build optimized search prompt for Perplexity AI"""

        prompt = f"""
Find {max_sources} high-quality, diverse sources for an AI/ML dataset creation project:

PROJECT DESCRIPTION: {project_description}

SEARCH REQUIREMENTS:
- Find sources with extractable text content suitable for ML training
- Prioritize sources with structured, high-quality content
- Include diverse perspectives and data types
- Focus on sources that are legally scrapable (respect robots.txt)

SEARCH TYPE: {search_type.value}
"""

        if include_academic:
            prompt += "\n- Include academic papers, research articles, and scholarly sources"

        if include_news:
            prompt += "\n- Include news articles, press releases, and journalistic content"

        if domain_filter:
            prompt += f"\n- Focus on these domains: {', '.join(domain_filter)}"

        prompt += f"""

OUTPUT FORMAT:
For each source, provide:
1. **URL**: Direct link to the content
2. **Title**: Clear, descriptive title
3. **Description**: 2-3 sentence summary of content and why it's relevant
4. **Type**: [academic/news/blog/government/technical/forum/social]
5. **Quality Score**: 1-10 rating for dataset suitability

ADDITIONAL REQUIREMENTS:
- Verify URLs are accessible and contain substantial text
- Avoid paywalled or login-required content when possible
- Prioritize sources with consistent formatting
- Include publication dates when available
- Suggest related search terms for expanding the dataset

Please provide concrete, actionable sources that can be immediately scraped for dataset creation.
"""

        return prompt

    def _parse_sources_from_response(self, content: str) -> List[SourceResult]:
        """Parse source information from Perplexity AI response"""
        sources = []

        # Try to extract structured information
        # Look for URL patterns
        url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+[^\s<>"{}|\\^`\[\].,!?;:]'

        # Split content into sections
        sections = re.split(r'\n\s*\n', content)

        for section in sections:
            # Look for URLs in this section
            urls = re.findall(url_pattern, section, re.IGNORECASE)

            if urls:
                for url in urls:
                    try:
                        # Clean URL
                        url = url.strip()

                        # Extract title (look for text before the URL or after)
                        title = self._extract_title_from_section(section, url)

                        # Extract description
                        description = self._extract_description_from_section(section, url)

                        # Determine source type
                        source_type = self._determine_source_type(url, section)

                        # Calculate relevance score (basic heuristic)
                        relevance_score = self._calculate_relevance_score(section, url)

                        # Get domain
                        domain = self._extract_domain(url)

                        # Validate URL
                        if self._is_valid_url(url):
                            source = SourceResult(
                                url=url,
                                title=title,
                                description=description,
                                relevance_score=relevance_score,
                                source_type=source_type,
                                domain=domain
                            )
                            sources.append(source)

                    except Exception as e:
                        logger.debug(f"⚠️ Error parsing source: {str(e)}")
                        continue

        # Remove duplicates based on URL
        seen_urls = set()
        unique_sources = []

        for source in sources:
            if source.url not in seen_urls:
                seen_urls.add(source.url)
                unique_sources.append(source)

        # Sort by relevance score
        unique_sources.sort(key=lambda x: x.relevance_score, reverse=True)

        return unique_sources

    def _extract_title_from_section(self, section: str, url: str) -> str:
        """Extract title from section text"""
        lines = section.split('\n')

        for line in lines:
            if url in line:
                # Look for title patterns
                title_patterns = [
                    r'\*\*([^*]+)\*\*',  # **Title**
                    r'#{1,6}\s*([^\n]+)',  # # Title
                    r'Title:\s*([^\n]+)',  # Title: Something
                    r'([^:\n]+):?\s*' + re.escape(url),  # Title: URL
                ]

                for pattern in title_patterns:
                    match = re.search(pattern, line, re.IGNORECASE)
                    if match:
                        return match.group(1).strip()

        # Fallback: use domain name
        return self._extract_domain(url)

    def _extract_description_from_section(self, section: str, url: str) -> str:
        """Extract description from section text"""
        # Remove the URL line and look for descriptive text
        lines = section.split('\n')
        description_lines = []

        for line in lines:
            if url not in line and line.strip():
                # Skip markdown headers and bullets
                clean_line = re.sub(r'^[#*\-\d\.]+\s*', '', line.strip())
                if len(clean_line) > 20:  # Meaningful content
                    description_lines.append(clean_line)

        description = ' '.join(description_lines)

        # Truncate if too long
        if len(description) > 200:
            description = description[:200] + "..."

        return description or "High-quality source for dataset creation"

    def _determine_source_type(self, url: str, section: str) -> str:
        """Determine the type of source based on URL and context"""
        url_lower = url.lower()
        section_lower = section.lower()

        # Academic sources
        if any(domain in url_lower for domain in [
            'arxiv.org', 'scholar.google', 'pubmed', 'ieee.org',
            'acm.org', 'springer.com', 'elsevier.com', 'nature.com',
            'sciencedirect.com', 'jstor.org'
        ]):
            return 'academic'

        # News sources
        if any(domain in url_lower for domain in [
            'cnn.com', 'bbc.com', 'reuters.com', 'ap.org', 'nytimes.com',
            'washingtonpost.com', 'theguardian.com', 'bloomberg.com',
            'techcrunch.com', 'wired.com'
        ]):
            return 'news'

        # Government sources
        if '.gov' in url_lower or 'government' in section_lower:
            return 'government'

        # Technical/Documentation
        if any(domain in url_lower for domain in [
            'docs.', 'documentation', 'github.com', 'stackoverflow.com',
            'medium.com', 'dev.to'
        ]):
            return 'technical'

        # Social media
        if any(domain in url_lower for domain in [
            'twitter.com', 'reddit.com', 'linkedin.com', 'facebook.com'
        ]):
            return 'social'

        # Default to blog
        return 'blog'

    def _calculate_relevance_score(self, section: str, url: str) -> float:
        """Calculate relevance score for a source (0-10)"""
        score = 5.0  # Base score

        section_lower = section.lower()
        url_lower = url.lower()

        # Boost for quality indicators
        quality_indicators = [
            'research', 'study', 'analysis', 'comprehensive', 'detailed',
            'expert', 'professional', 'authoritative', 'peer-reviewed',
            'dataset', 'data', 'machine learning', 'ai', 'artificial intelligence'
        ]

        for indicator in quality_indicators:
            if indicator in section_lower:
                score += 0.5

        # Boost for academic sources
        if any(domain in url_lower for domain in ['arxiv.org', 'scholar.google', 'pubmed']):
            score += 2.0

        # Boost for government sources
        if '.gov' in url_lower:
            score += 1.5

        # Penalize for social media
        if any(domain in url_lower for domain in ['twitter.com', 'facebook.com']):
            score -= 1.0

        # Cap at 10
        return min(score, 10.0)

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        try:
            parsed = urlparse(url)
            return parsed.netloc
        except Exception:
            # Catch Exception rather than using a bare `except:`, which would
            # also swallow KeyboardInterrupt and SystemExit.
            return "unknown"

    def _is_valid_url(self, url: str) -> bool:
        """Validate URL format and basic accessibility"""
        try:
            parsed = urlparse(url)
            return all([parsed.scheme, parsed.netloc])
        except Exception:
            return False

    def _extract_suggestions(self, content: str) -> List[str]:
        """Extract search suggestions from Perplexity response"""
        suggestions = []

        # Look for suggestion patterns
        suggestion_patterns = [
            r'related search terms?:?\s*([^\n]+)',
            r'you might also search for:?\s*([^\n]+)',
            r'additional keywords?:?\s*([^\n]+)',
            r'suggestions?:?\s*([^\n]+)'
        ]

        for pattern in suggestion_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            for match in matches:
                # Split by common delimiters
                terms = re.split(r'[,;|]', match)
                suggestions.extend([term.strip().strip('"\'') for term in terms if term.strip()])

        return suggestions[:10]  # Limit to 10 suggestions

    def _create_empty_results(self, query: str, search_time: float) -> SearchResults:
        """Create empty results object for failed searches"""
        return SearchResults(
            query=query,
            sources=[],
            total_found=0,
            search_time=search_time,
            perplexity_response="",
            suggestions=[]
        )

    def search_with_keywords(self, keywords: List[str], search_type: SearchType = SearchType.GENERAL) -> SearchResults:
        """
        🔎 Search using specific keywords

        Args:
            keywords: List of search keywords
            search_type: Type of search to perform

        Returns:
            SearchResults object
        """
        query = " ".join(keywords)
        return self.discover_sources(
            project_description=f"Find sources related to: {query}",
            search_type=search_type
        )

    def get_domain_sources(self, domain: str, topic: str, max_sources: int = 10) -> SearchResults:
        """
        🌐 Find sources from a specific domain

        Args:
            domain: Target domain (e.g., "nature.com")
            topic: Topic to search for
            max_sources: Maximum sources to return

        Returns:
            SearchResults object
        """
        return self.discover_sources(
            project_description=f"Find articles about {topic} from {domain}",
            domain_filter=[domain],
            max_sources=max_sources
        )

    def validate_sources(self, sources: List[SourceResult]) -> List[SourceResult]:
        """
        ✅ Validate and filter sources for quality and accessibility

        Args:
            sources: List of source results to validate

        Returns:
            Filtered list of valid sources
        """
        valid_sources = []

        for source in sources:
            try:
                # Basic URL validation
                if not self._is_valid_url(source.url):
                    logger.debug(f"⚠️ Invalid URL: {source.url}")
                    continue

                # Check if domain is accessible (basic check)
                domain = self._extract_domain(source.url)
                if not domain or domain == "unknown":
                    logger.debug(f"⚠️ Unknown domain: {source.url}")
                    continue

                # Quality score threshold
                if source.relevance_score < 3.0:
                    logger.debug(f"⚠️ Low quality score: {source.url}")
                    continue

                valid_sources.append(source)

            except Exception as e:
                logger.debug(f"⚠️ Error validating source {source.url}: {str(e)}")
                continue

        logger.info(f"✅ Validated {len(valid_sources)} out of {len(sources)} sources")
        return valid_sources

    def export_sources(self, results: SearchResults, format: str = "json") -> str:
        """
        📄 Export search results to various formats

        Args:
            results: SearchResults object to export
            format: Export format ("json", "csv", "markdown")

        Returns:
            Exported data as string
        """
        if format.lower() == "json":
            return self._export_json(results)
        elif format.lower() == "csv":
            return self._export_csv(results)
        elif format.lower() == "markdown":
            return self._export_markdown(results)
        else:
            raise ValueError(f"Unsupported export format: {format}")

    def _export_json(self, results: SearchResults) -> str:
        """Export results as JSON"""
        data = {
            "query": results.query,
            "total_found": results.total_found,
            "search_time": results.search_time,
            "sources": [
                {
                    "url": source.url,
                    "title": source.title,
                    "description": source.description,
                    "relevance_score": source.relevance_score,
                    "source_type": source.source_type,
                    "domain": source.domain,
                    "publication_date": source.publication_date,
                    "author": source.author
                }
                for source in results.sources
            ],
            "suggestions": results.suggestions
        }
        return json.dumps(data, indent=2)

    def _export_csv(self, results: SearchResults) -> str:
        """Export results as CSV"""
        import csv
        from io import StringIO

        output = StringIO()
        writer = csv.writer(output)

        # Write header
        writer.writerow([
            "URL", "Title", "Description", "Relevance Score",
            "Source Type", "Domain", "Publication Date", "Author"
        ])

        # Write data
        for source in results.sources:
            writer.writerow([
                source.url,
                source.title,
                source.description,
                source.relevance_score,
                source.source_type,
                source.domain,
                source.publication_date or "",
                source.author or ""
            ])

        return output.getvalue()

    def _export_markdown(self, results: SearchResults) -> str:
        """Export results as Markdown"""
        md = f"# Search Results for: {results.query}\n\n"
        md += f"**Total Sources Found:** {results.total_found}\n"
        md += f"**Search Time:** {results.search_time:.2f} seconds\n\n"

        md += "## Sources\n\n"

        for i, source in enumerate(results.sources, 1):
            md += f"### {i}. {source.title}\n\n"
            md += f"**URL:** {source.url}\n"
            md += f"**Type:** {source.source_type}\n"
            md += f"**Domain:** {source.domain}\n"
            md += f"**Relevance Score:** {source.relevance_score}/10\n"
            md += f"**Description:** {source.description}\n\n"

        if results.suggestions:
            md += "## Related Search Suggestions\n\n"
            for suggestion in results.suggestions:
                md += f"- {suggestion}\n"

        return md

# Example usage and testing functions
def test_perplexity_client():
    """Test function for Perplexity client"""
    client = PerplexityClient()

    if not client._validate_api_key():
        print("❌ No API key found. Set PERPLEXITY_API_KEY environment variable.")
        return

    # Test search
    results = client.discover_sources(
        project_description="Create a dataset for sentiment analysis of product reviews",
        search_type=SearchType.GENERAL,
        max_sources=10
    )

    print(f"🔍 Found {len(results.sources)} sources")
    for source in results.sources[:3]:
        print(f" - {source.title}: {source.url}")

    # Test export
    json_export = client.export_sources(results, "json")
    print(f"📄 JSON export: {len(json_export)} characters")

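# A small offline sketch (no API key needed): the sample text below is a
# hypothetical, hand-written stand-in for a Perplexity answer, shaped the way
# _parse_sources_from_response() expects it (titles, URLs, and prose separated
# by blank lines). It illustrates only the parsing path, not the live API, and
# both URLs are made up.
def demo_offline_parsing():
    """Exercise the response parser against a canned, invented answer."""
    sample = (
        "1. **Sentiment Analysis Survey**\n"
        "https://arxiv.org/abs/2401.00001\n"
        "A comprehensive research survey of sentiment analysis methods and datasets.\n"
        "\n"
        "2. **Product Review Mining Post**\n"
        "https://example.com/review-mining\n"
        "Detailed blog analysis of review corpora suitable for ML training data.\n"
    )
    client = PerplexityClient(api_key="offline-demo")  # key is never used here
    for source in client._parse_sources_from_response(sample):
        print(f"{source.relevance_score:>4.1f}  {source.source_type:<10}  {source.url}")
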
if __name__ == "__main__":
    # Test the client
    test_perplexity_client()