File size: 6,920 Bytes
03c0888
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import asyncio
from bs4 import BeautifulSoup
from typing import Dict, Any
import os
import sys
import time
import csv
from tabulate import tabulate
from dataclasses import dataclass
from typing import List, Dict

parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

from crawl4ai.content_scraping_strategy import WebScrapingStrategy
from crawl4ai.content_scraping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent

@dataclass
class TestResult:
    """Metrics captured from one strategy's run of a single test case.

    Populated in ``StrategyTester.run_test`` from the dict returned by
    ``_get_content_of_website_optimized``.
    """
    name: str               # test case label (e.g. "Basic Extraction")
    success: bool           # the scraper result's 'success' flag
    images: int             # len(result['media']['images'])
    internal_links: int     # len(result['links']['internal'])
    external_links: int     # len(result['links']['external'])
    markdown_length: int    # len(result['markdown'])
    execution_time: float   # elapsed seconds for the scrape call

class StrategyTester:
    """Run an identical battery of scraping configurations through two
    ``WebScrapingStrategy`` implementations and report the results side by
    side, both as a CSV file and as a console table.

    NOTE(review): ``WebScrapingStrategyCurrent`` is currently an alias of
    ``WebScrapingStrategy`` (see imports), so the two rows of every test are
    expected to match until the commented-out "current" import is restored.
    """

    def __init__(self):
        # One instance per strategy under comparison.
        self.new_scraper = WebScrapingStrategy()
        self.current_scraper = WebScrapingStrategyCurrent()
        # Load the HTML fixture once; every test reuses the same string.
        sample_path = os.path.join(__location__, 'sample_wikipedia.html')
        with open(sample_path, 'r', encoding='utf-8') as f:
            self.WIKI_HTML = f.read()
        self.results = {'new': [], 'current': []}

    def run_test(self, name: str, **kwargs) -> tuple[TestResult, TestResult]:
        """Run one scraping configuration through both strategies.

        Args:
            name: Human-readable test label recorded in each ``TestResult``.
            **kwargs: Options forwarded verbatim to
                ``_get_content_of_website_optimized``.

        Returns:
            ``(new_result, current_result)`` pair of ``TestResult`` records.
        """
        results = []
        for scraper in (self.new_scraper, self.current_scraper):
            # perf_counter is monotonic and high-resolution, which makes it
            # the correct clock for elapsed-time measurement (time.time is
            # wall-clock and can jump).
            start_time = time.perf_counter()
            result = scraper._get_content_of_website_optimized(
                url="https://en.wikipedia.org/wiki/Test",
                html=self.WIKI_HTML,
                **kwargs
            )
            execution_time = time.perf_counter() - start_time

            results.append(TestResult(
                name=name,
                success=result['success'],
                images=len(result['media']['images']),
                internal_links=len(result['links']['internal']),
                external_links=len(result['links']['external']),
                markdown_length=len(result['markdown']),
                execution_time=execution_time,
            ))

        return results[0], results[1]  # new, current

    @staticmethod
    def _metrics_row(result: TestResult) -> list:
        """Shared tail columns for CSV and table output:
        success, counts, markdown length, and formatted time."""
        return [
            result.success, result.images, result.internal_links,
            result.external_links, result.markdown_length,
            f"{result.execution_time:.3f}",
        ]

    def run_all_tests(self):
        """Execute every predefined test case, then persist and print the
        comparison. Individual test failures are reported and skipped so the
        remaining cases still run."""
        test_cases = [
            ("Basic Extraction", {}),
            ("Exclude Tags", {'excluded_tags': ['table', 'div.infobox', 'div.navbox']}),
            ("Word Threshold", {'word_count_threshold': 50}),
            ("CSS Selector", {'css_selector': 'div.mw-parser-output > p'}),
            ("Link Exclusions", {
                'exclude_external_links': True,
                'exclude_social_media_links': True,
                'exclude_domains': ['facebook.com', 'twitter.com']
            }),
            ("Media Handling", {
                'exclude_external_images': True,
                'image_description_min_word_threshold': 20
            }),
            ("Text Only", {
                'only_text': True,
                'remove_forms': True
            }),
            ("HTML Cleaning", {
                'clean_html': True,
                'keep_data_attributes': True
            }),
            ("HTML2Text Options", {
                'html2text': {
                    'skip_internal_links': True,
                    'single_line_break': True,
                    'mark_code': True,
                    'preserve_tags': ['pre', 'code']
                }
            })
        ]

        all_results = []
        for name, kwargs in test_cases:
            try:
                new_result, current_result = self.run_test(name, **kwargs)
                all_results.append((name, new_result, current_result))
            except Exception as e:
                print(f"Error in {name}: {str(e)}")

        self.save_results_to_csv(all_results)
        self.print_comparison_table(all_results)

    def save_results_to_csv(self, all_results: List[tuple]):
        """Write one CSV row per (test, strategy) pair next to this script."""
        csv_file = os.path.join(__location__, 'strategy_comparison_results.csv')
        with open(csv_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links',
                           'External Links', 'Markdown Length', 'Execution Time'])

            for name, new_result, current_result in all_results:
                writer.writerow([name, 'New'] + self._metrics_row(new_result))
                writer.writerow([name, 'Current'] + self._metrics_row(current_result))

    def print_comparison_table(self, all_results: List[tuple]):
        """Print a grid comparison of both strategies, flagging any metric
        that differs between them for the same test."""
        table_data = []
        headers = ['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links',
                  'External Links', 'Markdown Length', 'Time (s)']

        for name, new_result, current_result in all_results:
            # Collect the display labels of every metric that disagrees.
            differences = [
                label for label, attr in (
                    ('images', 'images'),
                    ('internal_links', 'internal_links'),
                    ('external_links', 'external_links'),
                    ('markdown', 'markdown_length'),
                )
                if getattr(new_result, attr) != getattr(current_result, attr)
            ]

            # One row per strategy; the test name appears only on the first.
            table_data.append([name, 'New'] + self._metrics_row(new_result))
            table_data.append(['', 'Current'] + self._metrics_row(current_result))

            # Flag disagreements, if any.
            if differences:
                table_data.append(['', '⚠️ Differences', ', '.join(differences), '', '', '', '', ''])

            # Blank spacer row for readability.
            table_data.append([''] * len(headers))

        print("\nStrategy Comparison Results:")
        print(tabulate(table_data, headers=headers, tablefmt='grid'))

if __name__ == "__main__":
    # Script entry point: run the full strategy comparison suite.
    StrategyTester().run_all_tests()