"""Unit tests for the Markdown-style formatting produced by ArticleScraper."""

import unittest

from bs4 import BeautifulSoup

from mediaunmasked.scrapers.article_scraper import ArticleScraper


def _normalize_lines(text):
    """Return *text* with every line stripped and blank lines removed.

    The tests compare scraper output against hand-written expectations, so
    indentation and empty lines are deliberately ignored on both sides.
    """
    stripped = (line.strip() for line in text.split('\n'))
    return '\n'.join(line for line in stripped if line)


class TestArticleScraper(unittest.TestCase):
    """Checks the formatted text returned by ArticleScraper's private helpers."""

    def setUp(self):
        # A fresh scraper per test keeps the tests independent of each other.
        self.scraper = ArticleScraper()

    def test_process_element_formatting(self):
        """Test that _process_element preserves various HTML formatting."""
        # Test complex nested HTML with multiple formatting elements.
        # NOTE(review): as seen here the fixture contains no HTML tags, yet the
        # test reads ``soup.div`` and the expectation lists bullet items that
        # have no counterpart in the fixture text. The markup looks like it was
        # stripped somewhere upstream -- confirm against version control.
        html = """

Main Title

This is a bold and italic text.

This is a link in a paragraph.

  1. Numbered item with link
  2. Another numbered item
Nested
content with
line breaks
"""
        soup = BeautifulSoup(html, 'html.parser')
        formatted_content = self.scraper._process_element(soup.div)

        # NOTE(review): the expectation is a single line as seen here; each
        # segment below was presumably on its own line originally -- verify.
        expected_output = (
            " ## Main Title"
            " This is a **bold** and _italic_ text."
            " This is a [link](https://example.com) in a paragraph."
            " • First **important** item"
            " • Second item with _emphasis_"
            " 1. Numbered item [with link](test.com)"
            " 2. Another numbered item"
            " Nested content with line breaks"
        ).strip()

        # Normalize whitespace for comparison
        formatted_content = _normalize_lines(formatted_content)
        expected_output = _normalize_lines(expected_output)
        self.assertEqual(formatted_content, expected_output)

    def test_extract_snopes_article(self):
        """Test extraction of a Snopes-style article with formatting."""
        # NOTE(review): fixture shows no HTML tags here -- markup appears to
        # have been lost; confirm against version control.
        html = """

Fact Check: Test Claim

The Claim

This is the main claim being tested.

The Facts

Additional important context.

"""
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'snopes.com')

        # NOTE(review): presumably one segment per line originally -- verify.
        expected_content = (
            " ## The Claim"
            " This is the **main claim** being tested."
            " ## The Facts"
            " • First important fact with _emphasis_"
            " • Second fact with a [source](source.com)"
            " Additional **important** context."
        ).strip()

        self.assertEqual(result['headline'], 'Fact Check: Test Claim')

        # Normalize whitespace for comparison
        actual_content = _normalize_lines(result['content'])
        expected_content = _normalize_lines(expected_content)
        self.assertEqual(actual_content, expected_content)

    def test_extract_politifact_article(self):
        """Test extraction of a PolitiFact-style article with formatting."""
        # NOTE(review): fixture shows no HTML tags here -- markup appears to
        # have been lost; confirm against version control.
        html = """

Test Political Claim

Here's a claim with bold text and italics.

Our Analysis

Final assessment with key points.

"""
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'politifact.com')

        # NOTE(review): presumably one segment per line originally -- verify.
        expected_content = (
            " Here's a claim with **bold text** and _italics_."
            " ### Our Analysis"
            " • Evidence point 1"
            " • Evidence point 2 with [proof](proof.com)"
            " Final assessment with **key points**."
        ).strip()

        self.assertEqual(result['headline'], 'Test Political Claim')

        # Normalize whitespace for comparison
        actual_content = _normalize_lines(result['content'])
        expected_content = _normalize_lines(expected_content)
        self.assertEqual(actual_content, expected_content)

    def test_extract_generic_article(self):
        """Test extraction of a generic article with formatting."""
        # NOTE(review): fixture shows no HTML tags here -- markup appears to
        # have been lost; confirm against version control.
        html = """

Generic Article Title

Opening paragraph with bold text.

Section Title

Content with italic text and reference.

"""
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'generic.com')

        # NOTE(review): presumably one segment per line originally -- verify.
        expected_content = (
            " Opening paragraph with **bold** text."
            " ## Section Title"
            " Content with _italic_ text and [reference](ref.com)."
            " • Point **one**"
            " • Point _two_"
        ).strip()

        self.assertEqual(result['headline'], 'Generic Article Title')

        # Normalize whitespace for comparison
        actual_content = _normalize_lines(result['content'])
        expected_content = _normalize_lines(expected_content)
        self.assertEqual(actual_content, expected_content)


if __name__ == '__main__':
    unittest.main()