# Page header captured together with this file (not part of the module):
# Spaces: Running | File size: 6,505 Bytes | a9d5552
import unittest
from bs4 import BeautifulSoup
from ..scrapers.article_scraper import ArticleScraper
class TestArticleScraper(unittest.TestCase):
    """Formatting tests for ArticleScraper.

    Every test parses an inline HTML fixture with BeautifulSoup, passes it to
    the scraper, and compares the result with the expected text. The expected
    text uses ``##``/``###`` for headings, ``**bold**``, ``_italic_``,
    ``[text](href)`` for links, ``•`` for bullet items and ``1.`` style
    numbering. Comparisons ignore indentation and blank lines.
    """

    def setUp(self):
        """Create a fresh scraper for each test."""
        self.scraper = ArticleScraper()
        # The compared values are multi-line strings; never truncate the diff
        # that unittest prints when a comparison fails.
        self.maxDiff = None

    @staticmethod
    def _normalize_whitespace(text):
        """Return ``text`` with every line stripped and empty lines dropped.

        Args:
            text: Multi-line string to normalize.

        Returns:
            The non-blank lines of ``text``, each stripped of leading and
            trailing whitespace, joined with a single newline.
        """
        stripped_lines = (line.strip() for line in text.split('\n'))
        return '\n'.join(line for line in stripped_lines if line)

    def _assert_same_content(self, actual, expected):
        """Assert that two texts are equal after whitespace normalization.

        Args:
            actual: Text produced by the scraper.
            expected: Text the test expects.
        """
        self.assertEqual(
            self._normalize_whitespace(actual),
            self._normalize_whitespace(expected),
        )

    def test_process_element_formatting(self):
        """Test that _process_element preserves various HTML formatting."""
        # Nested fixture mixing a heading, inline markup, both list kinds and
        # <br> line breaks. The literal is kept flush-left so its text is
        # exactly what the parser receives.
        html = """
<div>
<h1>Main Title</h1>
<p>This is a <strong>bold</strong> and <em>italic</em> text.</p>
<p>This is a <a href="https://example.com">link</a> in a paragraph.</p>
<ul>
<li>First <strong>important</strong> item</li>
<li>Second item with <em>emphasis</em></li>
</ul>
<ol>
<li>Numbered item <a href="test.com">with link</a></li>
<li>Another numbered item</li>
</ol>
<div>
Nested <br/>content with<br />line breaks
</div>
</div>
"""
        soup = BeautifulSoup(html, 'html.parser')
        formatted_content = self.scraper._process_element(soup.div)
        expected_output = """
## Main Title
This is a **bold** and _italic_ text.
This is a [link](https://example.com) in a paragraph.
• First **important** item
• Second item with _emphasis_
1. Numbered item [with link](test.com)
2. Another numbered item
Nested
content with
line breaks""".strip()
        self._assert_same_content(formatted_content, expected_output)

    def test_extract_snopes_article(self):
        """Test extraction of a Snopes-style article with formatting."""
        # Headline lives in <header>; the body to extract is inside <article>.
        html = """
<html>
<body>
<header>
<h1>Fact Check: Test Claim</h1>
</header>
<article>
<h2>The Claim</h2>
<p>This is the <strong>main claim</strong> being tested.</p>
<h2>The Facts</h2>
<ul>
<li>First important fact with <em>emphasis</em></li>
<li>Second fact with a <a href="source.com">source</a></li>
</ul>
<p>Additional <strong>important</strong> context.</p>
</article>
</body>
</html>
"""
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'snopes.com')
        expected_content = """
## The Claim
This is the **main claim** being tested.
## The Facts
• First important fact with _emphasis_
• Second fact with a [source](source.com)
Additional **important** context.""".strip()
        self.assertEqual(result['headline'], 'Fact Check: Test Claim')
        self._assert_same_content(result['content'], expected_content)

    def test_extract_politifact_article(self):
        """Test extraction of a PolitiFact-style article with formatting."""
        # Fixture uses the "article__title" / "article__text" class names.
        html = """
<html>
<body>
<h1 class="article__title">Test Political Claim</h1>
<article class="article">
<div class="article__text">
<p>Here's a claim with <strong>bold text</strong> and <em>italics</em>.</p>
<h3>Our Analysis</h3>
<ul>
<li>Evidence point 1</li>
<li>Evidence point 2 with <a href="proof.com">proof</a></li>
</ul>
<p>Final assessment with <strong>key points</strong>.</p>
</div>
</article>
</body>
</html>
"""
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'politifact.com')
        expected_content = """
Here's a claim with **bold text** and _italics_.
### Our Analysis
• Evidence point 1
• Evidence point 2 with [proof](proof.com)
Final assessment with **key points**.""".strip()
        self.assertEqual(result['headline'], 'Test Political Claim')
        self._assert_same_content(result['content'], expected_content)

    def test_extract_generic_article(self):
        """Test extraction of a generic article with formatting."""
        # Body content sits in <main>, partly inside a nested content <div>.
        html = """
<html>
<body>
<h1>Generic Article Title</h1>
<main>
<p>Opening paragraph with <strong>bold</strong> text.</p>
<div class="content">
<h2>Section Title</h2>
<p>Content with <em>italic</em> text and <a href="ref.com">reference</a>.</p>
<ul>
<li>Point <strong>one</strong></li>
<li>Point <em>two</em></li>
</ul>
</div>
</main>
</body>
</html>
"""
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'generic.com')
        expected_content = """
Opening paragraph with **bold** text.
## Section Title
Content with _italic_ text and [reference](ref.com).
• Point **one**
• Point _two_""".strip()
        self.assertEqual(result['headline'], 'Generic Article Title')
        self._assert_same_content(result['content'], expected_content)
if __name__ == '__main__':
    # Run this module's tests when the file is executed directly.
    unittest.main()