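# NOTE(review): the three lines below look like page-status residue from the
# hosting UI rather than module content; kept as comments so the file parses.
# Spaces:
# Running
# Running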
import unittest | |
from bs4 import BeautifulSoup | |
from ..scrapers.article_scraper import ArticleScraper | |
class TestArticleScraper(unittest.TestCase):
    """Check that ArticleScraper renders HTML formatting as Markdown-style text.

    Every test feeds a small HTML fixture to the scraper and compares the
    result with the expected text after whitespace normalisation, so only
    the formatting markers and the order of the lines matter, not the
    indentation or the number of blank lines.
    """

    def setUp(self):
        """Create a fresh scraper instance for each test."""
        self.scraper = ArticleScraper()

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        """Return *text* with every line stripped and blank lines removed."""
        stripped_lines = (line.strip() for line in text.split('\n'))
        return '\n'.join(line for line in stripped_lines if line)

    def _assert_same_content(self, actual: str, expected: str) -> None:
        """Assert that *actual* equals *expected* once both are normalised."""
        self.assertEqual(
            self._normalize_whitespace(actual),
            self._normalize_whitespace(expected),
        )

    def test_process_element_formatting(self):
        """Test that _process_element preserves various HTML formatting."""
        # Complex nested HTML with multiple formatting elements.
        html = """
        <div>
            <h1>Main Title</h1>
            <p>This is a <strong>bold</strong> and <em>italic</em> text.</p>
            <p>This is a <a href="https://example.com">link</a> in a paragraph.</p>
            <ul>
                <li>First <strong>important</strong> item</li>
                <li>Second item with <em>emphasis</em></li>
            </ul>
            <ol>
                <li>Numbered item <a href="test.com">with link</a></li>
                <li>Another numbered item</li>
            </ol>
            <div>
                Nested <br/>content with<br />line breaks
            </div>
        </div>
        """
        soup = BeautifulSoup(html, 'html.parser')
        # soup.div is the first <div> in the document, i.e. the outer wrapper.
        formatted_content = self.scraper._process_element(soup.div)

        expected_output = """
        ## Main Title
        This is a **bold** and _italic_ text.
        This is a [link](https://example.com) in a paragraph.
        • First **important** item
        • Second item with _emphasis_
        1. Numbered item [with link](test.com)
        2. Another numbered item
        Nested
        content with
        line breaks
        """
        self._assert_same_content(formatted_content, expected_output)

    def test_extract_snopes_article(self):
        """Test extraction of a Snopes-style article with formatting."""
        html = """
        <html>
            <body>
                <header>
                    <h1>Fact Check: Test Claim</h1>
                </header>
                <article>
                    <h2>The Claim</h2>
                    <p>This is the <strong>main claim</strong> being tested.</p>
                    <h2>The Facts</h2>
                    <ul>
                        <li>First important fact with <em>emphasis</em></li>
                        <li>Second fact with a <a href="source.com">source</a></li>
                    </ul>
                    <p>Additional <strong>important</strong> context.</p>
                </article>
            </body>
        </html>
        """
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'snopes.com')

        expected_content = """
        ## The Claim
        This is the **main claim** being tested.
        ## The Facts
        • First important fact with _emphasis_
        • Second fact with a [source](source.com)
        Additional **important** context.
        """
        self.assertEqual(result['headline'], 'Fact Check: Test Claim')
        self._assert_same_content(result['content'], expected_content)

    def test_extract_politifact_article(self):
        """Test extraction of a PolitiFact-style article with formatting."""
        html = """
        <html>
            <body>
                <h1 class="article__title">Test Political Claim</h1>
                <article class="article">
                    <div class="article__text">
                        <p>Here's a claim with <strong>bold text</strong> and <em>italics</em>.</p>
                        <h3>Our Analysis</h3>
                        <ul>
                            <li>Evidence point 1</li>
                            <li>Evidence point 2 with <a href="proof.com">proof</a></li>
                        </ul>
                        <p>Final assessment with <strong>key points</strong>.</p>
                    </div>
                </article>
            </body>
        </html>
        """
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'politifact.com')

        expected_content = """
        Here's a claim with **bold text** and _italics_.
        ### Our Analysis
        • Evidence point 1
        • Evidence point 2 with [proof](proof.com)
        Final assessment with **key points**.
        """
        self.assertEqual(result['headline'], 'Test Political Claim')
        self._assert_same_content(result['content'], expected_content)

    def test_extract_generic_article(self):
        """Test extraction of a generic article with formatting."""
        html = """
        <html>
            <body>
                <h1>Generic Article Title</h1>
                <main>
                    <p>Opening paragraph with <strong>bold</strong> text.</p>
                    <div class="content">
                        <h2>Section Title</h2>
                        <p>Content with <em>italic</em> text and <a href="ref.com">reference</a>.</p>
                        <ul>
                            <li>Point <strong>one</strong></li>
                            <li>Point <em>two</em></li>
                        </ul>
                    </div>
                </main>
            </body>
        </html>
        """
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'generic.com')

        expected_content = """
        Opening paragraph with **bold** text.
        ## Section Title
        Content with _italic_ text and [reference](ref.com).
        • Point **one**
        • Point _two_
        """
        self.assertEqual(result['headline'], 'Generic Article Title')
        self._assert_same_content(result['content'], expected_content)
# Allow the tests to be launched as a program. Because this module uses a
# relative import (`from ..scrapers...`), that only works when it is started
# as part of its package (e.g. with `python -m`), not as a bare script path.
if __name__ == '__main__':
    unittest.main()