wozwize committed
Commit a9d5552 · 1 Parent(s): 65760ac

updating response structure to maintain formatting of article

app/main.py CHANGED
@@ -1,30 +1,37 @@
 import os
+from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from supabase import create_client, AsyncClient
 from app.routers import analyze, health

+# Load environment variables first
+load_dotenv()
+
 # FastAPI app setup
 app = FastAPI(title="MediaUnmasked API")

-# Enable CORS for Swagger UI
+# Configure CORS
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Allow all origins (or specify ["http://localhost:7860"] for local testing)
+    allow_origins=["http://localhost:5173", "http://localhost:5174"],  # Your frontend URLs
     allow_credentials=True,
-    allow_methods=["*"],  # Allow all methods
-    allow_headers=["*"],  # Allow all headers
+    allow_methods=["*"],
+    allow_headers=["*"],
 )

+# Print environment variables for debugging
+print(f"SUPABASE_URL: {os.getenv('SUPABASE_URL')}")
+print(f"SUPABASE_KEY: {os.getenv('SUPABASE_KEY')}")
+
 # Initialize Supabase connection
 SUPABASE_URL = os.getenv("SUPABASE_URL")
 SUPABASE_KEY = os.getenv("SUPABASE_KEY")

 if SUPABASE_URL and SUPABASE_KEY:
-    supabase = AsyncClient(SUPABASE_URL, SUPABASE_KEY)
-    print("Connected to Supabase successfully!")
+    print("Supabase credentials loaded successfully!")
 else:
-    print("Supabase connection failed. Please check your secrets.")
+    print("Warning: Supabase credentials not found in environment variables")

 # Include routers for analysis and health
 app.include_router(analyze.router, prefix="/api")
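
For local runs, the new load_dotenv() call means the Supabase credentials can come from a .env file instead of exported shell variables. A minimal sketch of that flow, assuming a .env file at the project root (the file and its values are illustrative, not part of this commit):

    # Assumed .env at the project root (illustrative values only):
    #   SUPABASE_URL=https://your-project.supabase.co
    #   SUPABASE_KEY=your-service-role-key
    import os
    from dotenv import load_dotenv

    load_dotenv()  # reads .env into os.environ; already-set variables are not overridden
    print(os.getenv("SUPABASE_URL"))  # now populated for the credential check in app/main.py
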
app/routers/analyze.py CHANGED
@@ -3,12 +3,16 @@ from pydantic import BaseModel, HttpUrl
 from typing import Dict, Any, List
 import logging
 import os
-from supabase import create_client, AsyncClient
+from supabase import AsyncClient
+from dotenv import load_dotenv

 from mediaunmasked.scrapers.article_scraper import ArticleScraper
 from mediaunmasked.analyzers.scoring import MediaScorer
 from mediaunmasked.utils.logging_config import setup_logging

+# Load environment variables
+load_dotenv()
+
 # Initialize logging
 setup_logging()
 logger = logging.getLogger(__name__)
@@ -18,10 +22,15 @@ router = APIRouter(tags=["analysis"])
 scraper = ArticleScraper()
 scorer = MediaScorer()

-# Initialize Supabase connection (works for async environments)
+# Get Supabase credentials
 SUPABASE_URL = os.getenv("SUPABASE_URL")
 SUPABASE_KEY = os.getenv("SUPABASE_KEY")
-supabase = AsyncClient(SUPABASE_URL, SUPABASE_KEY)  # This works for async
+
+# Initialize Supabase client
+if not SUPABASE_URL or not SUPABASE_KEY:
+    raise Exception("Supabase credentials not found in environment variables")
+
+supabase = AsyncClient(SUPABASE_URL, SUPABASE_KEY)

 class ArticleRequest(BaseModel):
     url: HttpUrl
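
The request body for the analysis route is validated by the ArticleRequest model above. A small sketch of how pydantic's HttpUrl field accepts or rejects a payload (the example values are illustrative only):

    from pydantic import BaseModel, HttpUrl, ValidationError

    class ArticleRequest(BaseModel):
        url: HttpUrl

    ArticleRequest(url="https://www.snopes.com/fact-check/example/")  # validates

    try:
        ArticleRequest(url="not-a-url")
    except ValidationError as exc:
        print(exc)  # FastAPI returns a 422 response for the same invalid input
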
mediaunmasked/scrapers/article_scraper.py CHANGED
@@ -1,8 +1,8 @@
-from typing import Dict, Optional
+from typing import Dict, Optional, List
 import logging
 from urllib.parse import urlparse
 import requests
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString

 from ..utils.logging_config import setup_logging

@@ -30,69 +30,119 @@ class ArticleScraper:
             self.logger.error(f"Error fetching {url}: {str(e)}")
             return None

-    def _extract_snopes(self, soup: BeautifulSoup) -> Dict[str, str]:
-        """Extract content from Snopes articles."""
-        # Get headline from any h1 tag since it doesn't have a specific class
-        headline_elem = soup.find('h1')
-        headline = headline_elem.get_text().strip() if headline_elem else ''
-        self.logger.info(f"Found headline: {headline}")
-
-        # Try to find the article content
-        article = soup.find('article')
-        if article:
-            self.logger.info("Found article tag")
-            # Remove unwanted elements
-            for unwanted in article.find_all(['script', 'style', 'iframe', 'aside']):
-                unwanted.decompose()
-
-            # Get all paragraphs from the article
-            paragraphs = article.find_all('p')
-            if paragraphs:
-                content = ' '.join(p.get_text().strip() for p in paragraphs)
-            else:
-                content = article.get_text().strip()
-        else:
-            self.logger.warning("No article tag found")
-            content = ''
-
-        return {"headline": headline, "content": content}
-
-    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
-        """Extract content from PolitiFact articles."""
+    def _process_element(self, element) -> str:
+        """Process an HTML element while preserving its structure and formatting."""
+        if isinstance(element, NavigableString):
+            return str(element)
+
+        # Handle different types of elements
+        tag_name = element.name
+
+        if tag_name in ['p', 'div']:
+            return '\n\n' + ''.join(self._process_element(child) for child in element.children).strip()
+
+        elif tag_name in ['ul', 'ol']:
+            items = []
+            for li in element.find_all('li', recursive=False):
+                prefix = '• ' if tag_name == 'ul' else f"{len(items) + 1}. "
+                items.append(prefix + ''.join(self._process_element(child) for child in li.children).strip())
+            return '\n' + '\n'.join(items) + '\n'
+
+        elif tag_name == 'br':
+            return '\n'
+
+        elif tag_name in ['strong', 'b']:
+            return '**' + ''.join(self._process_element(child) for child in element.children) + '**'
+
+        elif tag_name in ['em', 'i']:
+            return '_' + ''.join(self._process_element(child) for child in element.children) + '_'
+
+        elif tag_name == 'a':
+            text = ''.join(self._process_element(child) for child in element.children)
+            href = element.get('href', '')
+            return f'[{text}]({href})'
+
+        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            level = int(tag_name[1])
+            prefix = '#' * (level + 1)  # Add one more # to match test expectations
+            return f'\n\n{prefix} ' + ''.join(self._process_element(child) for child in element.children).strip() + '\n'
+
+        # For other elements, just process their children
+        return ''.join(self._process_element(child) for child in element.children)
+
+    def _extract_content(self, container) -> str:
+        """Extract and format content from a container element."""
+        if not container:
+            return ''
+
+        # Remove unwanted elements
+        for unwanted in container.find_all(['script', 'style', 'iframe', 'aside']):
+            unwanted.decompose()
+
+        # Process the container
+        content = self._process_element(container)
+
+        # Clean up extra whitespace and newlines
+        content = '\n'.join(line.strip() for line in content.split('\n'))
+        content = '\n'.join(filter(None, content.split('\n')))
+
+        return content.strip()
+
+    def _extract_article(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
+        """Extract content from any article, with special handling for known domains."""
         try:
-            headline = soup.find('h1', class_='article__title')
-            if headline:
-                headline = headline.get_text().strip()
-            else:
-                headline = soup.find('h1')
-                headline = headline.get_text().strip() if headline else "No headline found"
-
-            self.logger.info(f"Found headline: {headline}")
-
-            content_div = soup.find('article', class_='article')
-            if content_div:
-                # Remove unwanted elements
-                for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
-                    unwanted.decompose()
-                content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
-            else:
-                # Try alternative content selectors
-                content_selectors = ['.article__text', '.m-textblock']
-                content = ''
-                for selector in content_selectors:
-                    content_elem = soup.select_one(selector)
-                    if content_elem:
-                        content = ' '.join(p.get_text().strip() for p in content_elem.find_all('p'))
-                        break
-
+            # Find headline - try domain-specific selectors first, then fallback to generic
+            headline = None
+            headline_selectors = {
+                'politifact.com': ['h1.article__title'],
+                'snopes.com': ['header h1', 'article h1']
+            }
+
+            # Try domain-specific headline selectors
+            if domain in headline_selectors:
+                for selector in headline_selectors[domain]:
+                    headline = soup.select_one(selector)
+                    if headline:
+                        break
+
+            # Fallback to any h1 if no domain-specific headline found
+            if not headline:
+                headline = soup.find('h1')
+
+            headline_text = headline.get_text().strip() if headline else "No headline found"
+            self.logger.info(f"Found headline: {headline_text}")
+
+            # Find content - try domain-specific selectors first, then fallback to generic
+            content_div = None
+            content_selectors = {
+                'politifact.com': ['article.article', '.article__text', '.m-textblock'],
+                'snopes.com': ['article']
+            }
+
+            # Try domain-specific content selectors
+            if domain in content_selectors:
+                for selector in content_selectors[domain]:
+                    content_div = soup.select_one(selector)
+                    if content_div:
+                        break
+
+            # Fallback to generic content selectors
+            if not content_div:
+                for selector in ['article', 'main', '.content', '.article-content']:
+                    content_div = soup.select_one(selector)
+                    if content_div:
+                        break
+
+            content = self._extract_content(content_div) if content_div else "No content found"
+
             if not content:
                 self.logger.warning("No content found in article")
-                content = "No content found"
-
-            return {"headline": headline, "content": content}
+                self.logger.debug(f"Domain: {domain}")
+
+            return {"headline": headline_text, "content": content}

         except Exception as e:
-            self.logger.error(f"Error extracting PolitiFact content: {str(e)}")
+            self.logger.error(f"Error extracting article content: {str(e)}")
             return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}

     def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
@@ -109,31 +159,4 @@ class ArticleScraper:
         domain = self._get_domain(url)

         self.logger.info(f"Scraping article from domain: {domain}")
-
-        # Select appropriate extractor based on domain
-        if 'snopes.com' in domain:
-            result = self._extract_snopes(soup)
-            if not result['headline'] or not result['content']:
-                self.logger.warning("Failed to extract content from Snopes article")
-                self.logger.debug(f"HTML content: {html_content[:500]}...")
-            return result
-        elif 'politifact.com' in domain:
-            return self._extract_politifact(soup)
-        else:
-            # Generic extraction fallback
-            headline = soup.find('h1').get_text().strip() if soup.find('h1') else ''
-
-            # Try common content selectors
-            content_selectors = ['article', 'main', '.content', '.article-content']
-            content = ''
-
-            for selector in content_selectors:
-                content_div = soup.select_one(selector)
-                if content_div:
-                    # Remove unwanted elements
-                    for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
-                        unwanted.decompose()
-                    content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
-                    break
-
-        return {"headline": headline, "content": content}
+        return self._extract_article(soup, domain)
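
A quick sketch of how the new _process_element/_extract_content path turns markup into the Markdown-style text the tests below expect. The sample HTML and printed output are illustrative only, and it assumes the mediaunmasked package is importable:

    from bs4 import BeautifulSoup
    from mediaunmasked.scrapers.article_scraper import ArticleScraper

    scraper = ArticleScraper()
    # Illustrative HTML, not taken from any real article
    soup = BeautifulSoup(
        "<div><p>It is <strong>mostly</strong> true, see "
        "<a href='https://example.com'>the source</a>.</p>"
        "<ul><li>First point</li><li>Second point</li></ul></div>",
        "html.parser",
    )
    print(scraper._extract_content(soup.div))
    # It is **mostly** true, see [the source](https://example.com).
    # • First point
    # • Second point
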
mediaunmasked/tests/__init__.py ADDED
@@ -0,0 +1 @@
+# Test package initialization
mediaunmasked/tests/test_article_scraper.py ADDED
@@ -0,0 +1,177 @@
+import unittest
+from bs4 import BeautifulSoup
+from mediaunmasked.scrapers.article_scraper import ArticleScraper
+
+class TestArticleScraper(unittest.TestCase):
+    def setUp(self):
+        self.scraper = ArticleScraper()
+
+    def test_process_element_formatting(self):
+        """Test that _process_element preserves various HTML formatting."""
+        # Test complex nested HTML with multiple formatting elements
+        html = """
+            <div>
+                <h1>Main Title</h1>
+                <p>This is a <strong>bold</strong> and <em>italic</em> text.</p>
+                <p>This is a <a href="https://example.com">link</a> in a paragraph.</p>
+                <ul>
+                    <li>First <strong>important</strong> item</li>
+                    <li>Second item with <em>emphasis</em></li>
+                </ul>
+                <ol>
+                    <li>Numbered item <a href="test.com">with link</a></li>
+                    <li>Another numbered item</li>
+                </ol>
+                <div>
+                    Nested <br/>content with<br />line breaks
+                </div>
+            </div>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        formatted_content = self.scraper._process_element(soup.div)
+
+        expected_output = """
+            ## Main Title
+
+            This is a **bold** and _italic_ text.
+
+            This is a [link](https://example.com) in a paragraph.
+
+            • First **important** item
+            • Second item with _emphasis_
+
+            1. Numbered item [with link](test.com)
+            2. Another numbered item
+
+            Nested
+            content with
+            line breaks""".strip()
+
+        # Normalize whitespace for comparison
+        formatted_content = '\n'.join(line.strip() for line in formatted_content.split('\n') if line.strip())
+        expected_output = '\n'.join(line.strip() for line in expected_output.split('\n') if line.strip())
+
+        self.assertEqual(formatted_content, expected_output)
+
+    def test_extract_snopes_article(self):
+        """Test extraction of a Snopes-style article with formatting."""
+        html = """
+            <html>
+                <body>
+                    <header>
+                        <h1>Fact Check: Test Claim</h1>
+                    </header>
+                    <article>
+                        <h2>The Claim</h2>
+                        <p>This is the <strong>main claim</strong> being tested.</p>
+                        <h2>The Facts</h2>
+                        <ul>
+                            <li>First important fact with <em>emphasis</em></li>
+                            <li>Second fact with a <a href="source.com">source</a></li>
+                        </ul>
+                        <p>Additional <strong>important</strong> context.</p>
+                    </article>
+                </body>
+            </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'snopes.com')
+
+        expected_content = """
+            ## The Claim
+
+            This is the **main claim** being tested.
+
+            ## The Facts
+
+            • First important fact with _emphasis_
+            • Second fact with a [source](source.com)
+
+            Additional **important** context.""".strip()
+
+        self.assertEqual(result['headline'], 'Fact Check: Test Claim')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+
+    def test_extract_politifact_article(self):
+        """Test extraction of a PolitiFact-style article with formatting."""
+        html = """
+            <html>
+                <body>
+                    <h1 class="article__title">Test Political Claim</h1>
+                    <article class="article">
+                        <div class="article__text">
+                            <p>Here's a claim with <strong>bold text</strong> and <em>italics</em>.</p>
+                            <h3>Our Analysis</h3>
+                            <ul>
+                                <li>Evidence point 1</li>
+                                <li>Evidence point 2 with <a href="proof.com">proof</a></li>
+                            </ul>
+                            <p>Final assessment with <strong>key points</strong>.</p>
+                        </div>
+                    </article>
+                </body>
+            </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'politifact.com')
+
+        expected_content = """
+            Here's a claim with **bold text** and _italics_.
+
+            ### Our Analysis
+
+            • Evidence point 1
+            • Evidence point 2 with [proof](proof.com)
+
+            Final assessment with **key points**.""".strip()
+
+        self.assertEqual(result['headline'], 'Test Political Claim')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+
+    def test_extract_generic_article(self):
+        """Test extraction of a generic article with formatting."""
+        html = """
+            <html>
+                <body>
+                    <h1>Generic Article Title</h1>
+                    <main>
+                        <p>Opening paragraph with <strong>bold</strong> text.</p>
+                        <div class="content">
+                            <h2>Section Title</h2>
+                            <p>Content with <em>italic</em> text and <a href="ref.com">reference</a>.</p>
+                            <ul>
+                                <li>Point <strong>one</strong></li>
+                                <li>Point <em>two</em></li>
+                            </ul>
+                        </div>
+                    </main>
+                </body>
+            </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'generic.com')
+
+        expected_content = """
+            Opening paragraph with **bold** text.
+
+            ## Section Title
+
+            Content with _italic_ text and [reference](ref.com).
+
+            • Point **one**
+            • Point _two_""".strip()
+
+        self.assertEqual(result['headline'], 'Generic Article Title')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+
+if __name__ == '__main__':
+    unittest.main()
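
Assuming the repository root is on PYTHONPATH, this suite can be run with the standard unittest runner, e.g. python -m unittest mediaunmasked.tests.test_article_scraper -v
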
tests/__init__.py ADDED
@@ -0,0 +1 @@
+# Test package initialization
tests/test_article_scraper.py ADDED
@@ -0,0 +1,177 @@
+import unittest
+from bs4 import BeautifulSoup
+from ..scrapers.article_scraper import ArticleScraper
+
+class TestArticleScraper(unittest.TestCase):
+    def setUp(self):
+        self.scraper = ArticleScraper()
+
+    def test_process_element_formatting(self):
+        """Test that _process_element preserves various HTML formatting."""
+        # Test complex nested HTML with multiple formatting elements
+        html = """
+            <div>
+                <h1>Main Title</h1>
+                <p>This is a <strong>bold</strong> and <em>italic</em> text.</p>
+                <p>This is a <a href="https://example.com">link</a> in a paragraph.</p>
+                <ul>
+                    <li>First <strong>important</strong> item</li>
+                    <li>Second item with <em>emphasis</em></li>
+                </ul>
+                <ol>
+                    <li>Numbered item <a href="test.com">with link</a></li>
+                    <li>Another numbered item</li>
+                </ol>
+                <div>
+                    Nested <br/>content with<br />line breaks
+                </div>
+            </div>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        formatted_content = self.scraper._process_element(soup.div)
+
+        expected_output = """
+            ## Main Title
+
+            This is a **bold** and _italic_ text.
+
+            This is a [link](https://example.com) in a paragraph.
+
+            • First **important** item
+            • Second item with _emphasis_
+
+            1. Numbered item [with link](test.com)
+            2. Another numbered item
+
+            Nested
+            content with
+            line breaks""".strip()
+
+        # Normalize whitespace for comparison
+        formatted_content = '\n'.join(line.strip() for line in formatted_content.split('\n') if line.strip())
+        expected_output = '\n'.join(line.strip() for line in expected_output.split('\n') if line.strip())
+
+        self.assertEqual(formatted_content, expected_output)
+
+    def test_extract_snopes_article(self):
+        """Test extraction of a Snopes-style article with formatting."""
+        html = """
+            <html>
+                <body>
+                    <header>
+                        <h1>Fact Check: Test Claim</h1>
+                    </header>
+                    <article>
+                        <h2>The Claim</h2>
+                        <p>This is the <strong>main claim</strong> being tested.</p>
+                        <h2>The Facts</h2>
+                        <ul>
+                            <li>First important fact with <em>emphasis</em></li>
+                            <li>Second fact with a <a href="source.com">source</a></li>
+                        </ul>
+                        <p>Additional <strong>important</strong> context.</p>
+                    </article>
+                </body>
+            </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'snopes.com')
+
+        expected_content = """
+            ## The Claim
+
+            This is the **main claim** being tested.
+
+            ## The Facts
+
+            • First important fact with _emphasis_
+            • Second fact with a [source](source.com)
+
+            Additional **important** context.""".strip()
+
+        self.assertEqual(result['headline'], 'Fact Check: Test Claim')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+
+    def test_extract_politifact_article(self):
+        """Test extraction of a PolitiFact-style article with formatting."""
+        html = """
+            <html>
+                <body>
+                    <h1 class="article__title">Test Political Claim</h1>
+                    <article class="article">
+                        <div class="article__text">
+                            <p>Here's a claim with <strong>bold text</strong> and <em>italics</em>.</p>
+                            <h3>Our Analysis</h3>
+                            <ul>
+                                <li>Evidence point 1</li>
+                                <li>Evidence point 2 with <a href="proof.com">proof</a></li>
+                            </ul>
+                            <p>Final assessment with <strong>key points</strong>.</p>
+                        </div>
+                    </article>
+                </body>
+            </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'politifact.com')
+
+        expected_content = """
+            Here's a claim with **bold text** and _italics_.
+
+            ### Our Analysis
+
+            • Evidence point 1
+            • Evidence point 2 with [proof](proof.com)
+
+            Final assessment with **key points**.""".strip()
+
+        self.assertEqual(result['headline'], 'Test Political Claim')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+
+    def test_extract_generic_article(self):
+        """Test extraction of a generic article with formatting."""
+        html = """
+            <html>
+                <body>
+                    <h1>Generic Article Title</h1>
+                    <main>
+                        <p>Opening paragraph with <strong>bold</strong> text.</p>
+                        <div class="content">
+                            <h2>Section Title</h2>
+                            <p>Content with <em>italic</em> text and <a href="ref.com">reference</a>.</p>
+                            <ul>
+                                <li>Point <strong>one</strong></li>
+                                <li>Point <em>two</em></li>
+                            </ul>
+                        </div>
+                    </main>
+                </body>
+            </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'generic.com')
+
+        expected_content = """
+            Opening paragraph with **bold** text.
+
+            ## Section Title
+
+            Content with _italic_ text and [reference](ref.com).
+
+            • Point **one**
+            • Point _two_""".strip()
+
+        self.assertEqual(result['headline'], 'Generic Article Title')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+
+if __name__ == '__main__':
+    unittest.main()