shukdevdatta123 committed on
Commit
81a5137
·
verified ·
1 Parent(s): 533b217

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +270 -48
app.py CHANGED
@@ -17,6 +17,8 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
17
  class WebScrapingTool:
18
  def __init__(self):
19
  self.client = None
 
 
20
  self.system_prompt = """You are a specialized web data extraction assistant. Your core purpose is to browse and analyze the content of web pages based on user instructions, and return structured or unstructured information from the provided URL. Your capabilities include:
21
  1. Navigating and reading web page content from a given URL.
22
  2. Extracting textual content including headings, paragraphs, lists, and metadata.
@@ -39,6 +41,24 @@ You must not hallucinate or infer data not present on the page. If content is mi
39
  Always respond based on the actual content from the provided link. If the page fails to load or cannot be accessed, inform the user immediately.
40
  Your role is to act as an intelligent browser and data interpreter β€” able to read and reshape any web content to meet user needs."""
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def setup_client(self, api_key):
43
  """Initialize OpenAI client with OpenRouter"""
44
  try:
@@ -270,6 +290,9 @@ Your role is to act as an intelligent browser and data interpreter β€” able to r
270
  if not scraped_data['success']:
271
  return f"Error scraping webpage: {scraped_data['error']}"
272
 
 
 
 
273
  # Prepare content for AI analysis
274
  content_text = f"""
275
  WEBPAGE ANALYSIS REQUEST
@@ -315,11 +338,84 @@ MAIN CONTENT:
315
  max_tokens=4000
316
  )
317
 
318
- return completion.choices[0].message.content
 
 
 
319
 
320
  except Exception as e:
321
  return f"Error analyzing content with AI: {str(e)}"
322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  def create_interface():
324
  tool = WebScrapingTool()
325
 
@@ -357,13 +453,27 @@ def create_interface():
357
 
358
  yield f"βœ… Analysis Complete!\n{'='*50}\n\n{result}"
359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  # Create Gradio interface
361
- with gr.Blocks(title="AI Web Scraping Tool", theme=gr.themes.Soft()) as app:
362
  gr.Markdown("""
363
- # πŸ€– AI Web Scraping Tool
364
- ### Powered by DeepSeek V3 & OpenRouter
365
 
366
- Extract and analyze web content using advanced AI. The tool handles timeouts, SSL issues, and provides robust scraping capabilities.
367
  """)
368
 
369
  with gr.Row():
@@ -390,6 +500,9 @@ def create_interface():
390
 
391
  with gr.Row():
392
  analyze_btn = gr.Button("πŸš€ Analyze Website", variant="primary", size="lg")
 
 
 
393
  clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
394
 
395
  with gr.Column(scale=3):
@@ -401,32 +514,140 @@ def create_interface():
401
  interactive=False,
402
  placeholder="Results will appear here after analysis..."
403
  )
 
 
 
 
 
 
 
 
 
404
 
405
  # Tips and Examples
406
- with gr.Accordion("πŸ’‘ Usage Tips & Examples", open=False):
407
  gr.Markdown("""
408
- ### 🎯 Example Analysis Queries:
409
- - **Data Extraction**: *"Extract all numerical data and organize it in a table format"*
410
- - **Content Summary**: *"Summarize the main points in bullet format with key statistics"*
411
- - **Table Processing**: *"Find all tables and convert them to a single consolidated format"*
412
- - **Specific Information**: *"Extract contact information, prices, or product details"*
413
- - **Comparison**: *"Compare different items/options mentioned and create a comparison table"*
414
-
415
- ### πŸ”§ Technical Notes:
416
- - **Multiple Timeouts**: Tool tries 15s, 30s, then 45s timeouts automatically
417
- - **SSL Handling**: Bypasses SSL issues for problematic websites
418
- - **Content Filtering**: Removes ads, popups, and unnecessary elements
419
- - **Table Detection**: Automatically finds and structures tabular data
420
- - **Error Recovery**: Handles connection issues and provides clear error messages
421
-
422
- ### 🌐 Works Well With:
423
- - News websites (BBC, CNN, Reuters)
424
- - Government sites (IMF, WHO, official statistics)
425
- - Wikipedia and educational content
426
- - E-commerce product pages
427
- - Financial data sites (Yahoo Finance, MarketWatch)
428
- - Research papers and academic sites
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
  ## πŸ§ͺ **Test Scenarios**
431
 
432
  ### **1. News & Media Sites**
@@ -555,28 +776,22 @@ def create_interface():
555
  Query: Extract UK weather information and create a regional breakdown of current conditions
556
  ```
557
 
558
- ## πŸ” **Testing Tips:**
559
-
560
- 1. **Start Simple**: Begin with basic sites like Wikipedia or news sites
561
- 2. **Test Error Handling**: Try invalid URLs to see error messages
562
- 3. **Check Timeouts**: Use slow-loading sites to test timeout handling
563
- 4. **Verify Tables**: Test sites with different table structures
564
- 5. **Content Variety**: Try different content types (news, data, products)
565
 
566
- ## 🚨 **Sites That May Have Issues:**
567
- - Social media sites (require login)
568
- - Sites with heavy JavaScript (may have limited content)
569
- - Sites with aggressive bot protection
570
- - Password-protected pages
 
571
 
572
- ## βœ… **Reliable Test Sites:**
573
- - Wikipedia (excellent for tables and structured content)
574
- - BBC News (good for text extraction)
575
- - Government sites (.gov domains)
576
- - W3Schools (great for HTML table testing)
577
- - HttpBin (perfect for testing basic functionality)
578
 
579
- Start with the simpler tests and gradually move to more complex scenarios to fully evaluate your tool's capabilities!
580
  """)
581
 
582
  # Event handlers
@@ -587,9 +802,16 @@ def create_interface():
587
  show_progress=True
588
  )
589
 
 
 
 
 
 
 
 
590
  clear_btn.click(
591
- fn=lambda: ("", "", "", ""),
592
- outputs=[api_key_input, url_input, query_input, output]
593
  )
594
 
595
  return app
 
17
  class WebScrapingTool:
18
  def __init__(self):
19
  self.client = None
20
+ self.scraped_data = None # Store scraped data for fact-checking
21
+ self.analysis_result = None # Store analysis result for fact-checking
22
  self.system_prompt = """You are a specialized web data extraction assistant. Your core purpose is to browse and analyze the content of web pages based on user instructions, and return structured or unstructured information from the provided URL. Your capabilities include:
23
  1. Navigating and reading web page content from a given URL.
24
  2. Extracting textual content including headings, paragraphs, lists, and metadata.
 
41
  Always respond based on the actual content from the provided link. If the page fails to load or cannot be accessed, inform the user immediately.
42
  Your role is to act as an intelligent browser and data interpreter β€” able to read and reshape any web content to meet user needs."""
43
 
44
+ self.factcheck_prompt = """You are an expert fact-checker and critical analysis assistant. Your role is to thoroughly examine AI-generated analysis results against the original source material to verify accuracy, identify potential errors, and assess the reliability of the analysis.
45
+
46
+ Your fact-checking responsibilities include:
47
+ 1. **Accuracy Verification**: Compare each claim, statistic, and piece of information in the analysis against the original source content.
48
+ 2. **Completeness Assessment**: Determine if important information was missed or if the analysis covers all relevant aspects.
49
+ 3. **Error Detection**: Identify factual errors, misinterpretations, or misrepresentations of the source material.
50
+ 4. **Context Verification**: Ensure that information is presented in proper context and not taken out of context.
51
+ 5. **Consistency Check**: Verify that the analysis is internally consistent and doesn't contain contradictions.
52
+
53
+ For your fact-checking analysis, provide:
54
+ - **ACCURACY SCORE**: Rate the overall accuracy on a scale of 1-10 (10 being perfectly accurate)
55
+ - **KEY FINDINGS**: List what was correctly analyzed
56
+ - **ERRORS IDENTIFIED**: Point out any inaccuracies, misrepresentations, or missing information
57
+ - **VERIFICATION STATUS**: For each major claim, indicate whether it's VERIFIED, PARTIALLY VERIFIED, or CANNOT VERIFY
58
+ - **RECOMMENDATIONS**: Suggest improvements or corrections needed
59
+
60
+ Be thorough, objective, and provide specific examples when pointing out discrepancies. If the analysis is accurate, acknowledge its quality. If there are issues, be clear about what needs correction."""
61
+
62
  def setup_client(self, api_key):
63
  """Initialize OpenAI client with OpenRouter"""
64
  try:
 
290
  if not scraped_data['success']:
291
  return f"Error scraping webpage: {scraped_data['error']}"
292
 
293
+ # Store scraped data for fact-checking
294
+ self.scraped_data = scraped_data
295
+
296
  # Prepare content for AI analysis
297
  content_text = f"""
298
  WEBPAGE ANALYSIS REQUEST
 
338
  max_tokens=4000
339
  )
340
 
341
+ result = completion.choices[0].message.content
342
+ # Store analysis result for fact-checking
343
+ self.analysis_result = result
344
+ return result
345
 
346
  except Exception as e:
347
  return f"Error analyzing content with AI: {str(e)}"
348
 
349
def fact_check_analysis(self, api_key, max_table_rows=15):
    """Fact-check the stored analysis against the stored scrape with DeepSeek R1.

    Builds a prompt from ``self.scraped_data`` (URL, title, content length,
    text and any tables) followed by ``self.analysis_result``, and sends it
    to the ``deepseek/deepseek-r1:free`` model with ``self.factcheck_prompt``
    as the system message.

    Args:
        api_key: Key passed to ``self.setup_client`` when no client has been
            initialised yet; unused otherwise.
        max_table_rows: Maximum number of rows rendered per table; any
            remaining rows are summarised as a count. Defaults to 15.

    Returns:
        The model's fact-check report, or a human-readable error string if
        client setup fails, nothing has been analysed yet, or the API call
        raises. Failures of the API call are reported as strings, not raised.
    """
    if not self.client:
        success, message = self.setup_client(api_key)
        if not success:
            return f"Error: {message}"

    if not self.scraped_data or not self.analysis_result:
        return "❌ No analysis results to fact-check. Please run an analysis first."

    source = self.scraped_data

    # Collect prompt fragments and join once instead of repeated `+=`.
    parts = [f"""
FACT-CHECKING TASK
==================

ORIGINAL SOURCE MATERIAL:
-------------------------
URL: {source['url']}
Title: {source['title']}
Content Length: {source['content_length']} characters

SOURCE TEXT:
{source['text']}
"""]

    tables = source['tables']
    if tables:
        parts.append(f"\n\nSOURCE TABLES ({len(tables)} found):\n")
        parts.append("=" * 50 + "\n")

        for table in tables:
            rows = table['data']
            # Coerce headers to str just like the cells below, so a
            # non-string header cannot raise TypeError out of this method.
            header_line = ' | '.join(str(header) for header in table['headers'])

            parts.append(f"\nTABLE {table['id']}:\n")
            parts.append(f"Headers: {header_line}\n")
            parts.append("-" * 50 + "\n")

            for row_number, row in enumerate(rows[:max_table_rows], start=1):
                cells = ' | '.join(str(cell) for cell in row)
                parts.append(f"Row {row_number}: {cells}\n")

            if len(rows) > max_table_rows:
                parts.append(f"... and {len(rows) - max_table_rows} more rows\n")
            parts.append("\n")

    parts.append(f"""

AI ANALYSIS TO VERIFY:
======================
{self.analysis_result}

FACT-CHECKING INSTRUCTIONS:
===========================
Please thoroughly fact-check the AI analysis above against the original source material. Verify every claim, statistic, and piece of information. Provide a comprehensive fact-checking report.""")

    factcheck_content = "".join(parts)

    try:
        completion = self.client.chat.completions.create(
            extra_headers={
                "HTTP-Referer": "https://gradio-web-scraper.com",
                "X-Title": "AI Web Scraping Tool - Fact Checker",
            },
            extra_body={},
            model="deepseek/deepseek-r1:free",
            messages=[
                {"role": "system", "content": self.factcheck_prompt},
                {"role": "user", "content": factcheck_content},
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        # UI boundary: surface any API/client failure as text, as before.
        return f"Error fact-checking with DeepSeek R1: {str(e)}"
418
+
419
  def create_interface():
420
  tool = WebScrapingTool()
421
 
 
453
 
454
  yield f"βœ… Analysis Complete!\n{'='*50}\n\n{result}"
455
 
456
def fact_check_request(api_key):
    """Yield fact-check progress messages followed by the final report.

    This is a generator: each yielded string replaces the previous one in
    the output component it is wired to.

    Args:
        api_key: OpenRouter API key entered by the user.

    Yields:
        Status strings, then the report returned by
        ``tool.fact_check_analysis`` (which may itself be an error message).
    """
    if not api_key or not api_key.strip():
        # A generator's `return <value>` is swallowed into StopIteration and
        # never shown, so the validation message must be yielded instead.
        yield "❌ Please enter your OpenRouter API key"
        return

    yield "🔍 Starting fact-check with DeepSeek R1..."
    time.sleep(0.5)  # brief pause so the first status is visible

    yield "🧠 Analyzing accuracy and verifying claims..."

    # Perform fact-checking
    factcheck_result = tool.fact_check_analysis(api_key)

    yield f"✅ Fact-Check Complete!\n{'='*50}\n\n{factcheck_result}"
469
+
470
  # Create Gradio interface
471
+ with gr.Blocks(title="AI Web Scraping Tool with Fact-Checking", theme=gr.themes.Soft()) as app:
472
  gr.Markdown("""
473
+ # πŸ€– AI Web Scraping Tool with Fact-Checking
474
+ ### Powered by DeepSeek V3 & DeepSeek R1 via OpenRouter
475
 
476
+ Extract and analyze web content using advanced AI, then fact-check the results for accuracy and reliability.
477
  """)
478
 
479
  with gr.Row():
 
500
 
501
  with gr.Row():
502
  analyze_btn = gr.Button("πŸš€ Analyze Website", variant="primary", size="lg")
503
+ factcheck_btn = gr.Button("πŸ” Fact-Check Results", variant="secondary", size="lg")
504
+
505
+ with gr.Row():
506
  clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
507
 
508
  with gr.Column(scale=3):
 
514
  interactive=False,
515
  placeholder="Results will appear here after analysis..."
516
  )
517
+
518
+ factcheck_output = gr.Textbox(
519
+ label="πŸ” Fact-Check Report",
520
+ lines=20,
521
+ max_lines=40,
522
+ show_copy_button=True,
523
+ interactive=False,
524
+ placeholder="Fact-check results will appear here after clicking 'Fact-Check Results'..."
525
+ )
526
 
527
  # Tips and Examples
528
+ with gr.Accordion("πŸ’‘ Usage Tips & Fact-Checking Guide", open=False):
529
  gr.Markdown("""
530
+ ## πŸ”„ **How to Use the Fact-Checking Feature:**
531
+
532
+ 1. **First**: Enter your API key, URL, and analysis query
533
+ 2. **Second**: Click "πŸš€ Analyze Website" to get initial results
534
+ 3. **Third**: Click "πŸ” Fact-Check Results" to verify accuracy with DeepSeek R1
535
+
536
+ ## 🎯 **What the Fact-Checker Does:**
537
+
538
+ ### **Accuracy Verification**
539
+ - Compares every claim in the analysis against the original source
540
+ - Identifies factual errors and misrepresentations
541
+ - Verifies numerical data and statistics
542
+
543
+ ### **Completeness Assessment**
544
+ - Checks if important information was missed
545
+ - Evaluates coverage of all relevant aspects
546
+ - Identifies gaps in the analysis
547
+
548
+ ### **Context Verification**
549
+ - Ensures information isn't taken out of context
550
+ - Verifies proper interpretation of source material
551
+ - Checks for misleading presentations
552
+
553
+ ### **Quality Scoring**
554
+ - Provides accuracy scores (1-10 scale)
555
+ - Lists verified vs. unverified claims
556
+ - Offers specific recommendations for improvement
557
+
558
+ ## πŸ§ͺ **Best Practices for Fact-Checking:**
559
+
560
+ ### **Ideal Test Cases:**
561
+ ```
562
+ URL: https://en.wikipedia.org/wiki/List_of_countries_by_population
563
+ Query: Create a table showing the top 10 most populous countries with their exact population figures
564
+ ```
565
+ *Perfect for fact-checking numerical accuracy*
566
+
567
+ ```
568
+ URL: https://www.who.int/news-room/fact-sheets
569
+ Query: Extract key health statistics and create a summary of global health metrics
570
+ ```
571
+ *Great for verifying official statistics*
572
+
573
+ ```
574
+ URL: https://finance.yahoo.com/quote/AAPL
575
+ Query: Extract Apple's current stock price, market cap, and financial metrics
576
+ ```
577
+ *Excellent for checking real-time financial data accuracy*
578
+
579
+ ## 🎯 **Example Analysis Queries for Fact-Checking:**
580
+
581
+ ### **Data-Heavy Content**
582
+ - *"Extract all numerical data and organize it in a table format"*
583
+ - *"Create a comparison table of different countries' GDP figures"*
584
+ - *"List the top 10 items with their exact values from the source"*
585
 
586
+ ### **Statistical Information**
587
+ - *"Summarize key statistics with specific numbers and percentages"*
588
+ - *"Extract survey results and present the exact figures"*
589
+ - *"Create a timeline with specific dates and events"*
590
+
591
+ ### **Complex Analysis**
592
+ - *"Compare different viewpoints and cite specific quotes"*
593
+ - *"Extract cause-and-effect relationships mentioned in the article"*
594
+ - *"Summarize research findings with methodology details"*
595
+
596
+ ## πŸ” **What Gets Fact-Checked:**
597
+
598
+ βœ… **Verified Items:**
599
+ - Exact quotes and citations
600
+ - Numerical data and statistics
601
+ - Dates, names, and factual claims
602
+ - Table data accuracy
603
+ - Mathematical calculations
604
+
605
+ ⚠️ **Flagged Issues:**
606
+ - Misquoted information
607
+ - Incorrect numbers or percentages
608
+ - Missing context or nuance
609
+ - Overgeneralized statements
610
+ - Unsupported conclusions
611
+
612
+ ## 🚨 **Red Flags the Fact-Checker Catches:**
613
+
614
+ - **Hallucinated Data**: Information not present in the source
615
+ - **Misattributed Quotes**: Quotes assigned to wrong sources
616
+ - **Mathematical Errors**: Incorrect calculations or summaries
617
+ - **Context Loss**: Information presented without proper context
618
+ - **Incomplete Extraction**: Missing important details from tables
619
+
620
+ ## πŸ’‘ **Tips for Better Fact-Checking:**
621
+
622
+ 1. **Use Specific Queries**: More specific requests = better fact-checking
623
+ 2. **Test with Known Data**: Start with sites where you know the content
624
+ 3. **Check Complex Tables**: Tables are great for testing accuracy
625
+ 4. **Verify Names & Dates**: These are common error points
626
+ 5. **Cross-Reference**: Compare with multiple sources when possible
627
+
628
+ ## πŸ”¬ **Advanced Fact-Checking Tests:**
629
+
630
+ ### **Financial Data Test**
631
+ ```
632
+ URL: https://finance.yahoo.com/quote/MSFT
633
+ Query: Create a detailed financial summary table with exact figures for Microsoft stock
634
+ Expected: Fact-checker should verify all numbers match the source exactly
635
+ ```
636
+
637
+ ### **Statistical Data Test**
638
+ ```
639
+ URL: https://www.census.gov/quickfacts/fact/table/US
640
+ Query: Extract US population demographics with specific percentages
641
+ Expected: Fact-checker should confirm all demographic percentages are accurate
642
+ ```
643
+
644
+ ### **Historical Data Test**
645
+ ```
646
+ URL: https://en.wikipedia.org/wiki/List_of_Presidents_of_the_United_States
647
+ Query: Create a table of the last 10 US presidents with their exact terms of office
648
+ Expected: Fact-checker should verify all dates and names are correct
649
+ ```
650
+
651
  ## πŸ§ͺ **Test Scenarios**
652
 
653
  ### **1. News & Media Sites**
 
776
  Query: Extract UK weather information and create a regional breakdown of current conditions
777
  ```
778
 
779
+ ## 🎯 **Interpreting Fact-Check Results:**
 
 
 
 
 
 
780
 
781
+ ### **Accuracy Scores:**
782
+ - **9-10**: Highly accurate, minimal issues
783
+ - **7-8**: Generally accurate with minor corrections needed
784
+ - **5-6**: Moderate accuracy, several issues to address
785
+ - **3-4**: Low accuracy, significant problems found
786
+ - **1-2**: Poor accuracy, major fact-checking failures
787
 
788
+ ### **Verification Status:**
789
+ - **βœ… VERIFIED**: Claim matches source exactly
790
+ - **⚠️ PARTIALLY VERIFIED**: Claim is mostly correct but lacks nuance
791
+ - **❌ CANNOT VERIFY**: Claim not supported by source material
792
+ - **🚨 CONTRADICTED**: Claim directly contradicts source
 
793
 
794
+ Remember: The fact-checker is designed to be thorough and critical. Even high-quality analyses may receive suggestions for improvement!
795
  """)
796
 
797
  # Event handlers
 
802
  show_progress=True
803
  )
804
 
805
+ factcheck_btn.click(
806
+ fn=fact_check_request,
807
+ inputs=[api_key_input],
808
+ outputs=factcheck_output,
809
+ show_progress=True
810
+ )
811
+
812
  clear_btn.click(
813
+ fn=lambda: ("", "", "", "", ""),
814
+ outputs=[api_key_input, url_input, query_input, output, factcheck_output]
815
  )
816
 
817
  return app