Spaces:
Running
Running
updating response structure to maintain formatting of article
Browse files- app/main.py +14 -7
- app/routers/analyze.py +12 -3
- mediaunmasked/scrapers/article_scraper.py +105 -82
- mediaunmasked/tests/__init__.py +1 -0
- mediaunmasked/tests/test_article_scraper.py +177 -0
- tests/__init__.py +1 -0
- tests/test_article_scraper.py +177 -0
app/main.py
CHANGED
@@ -1,30 +1,37 @@
|
|
1 |
import os
|
|
|
2 |
from fastapi import FastAPI
|
3 |
from fastapi.middleware.cors import CORSMiddleware
|
4 |
from supabase import create_client, AsyncClient
|
5 |
from app.routers import analyze, health
|
6 |
|
|
|
|
|
|
|
7 |
# FastAPI app setup
|
8 |
app = FastAPI(title="MediaUnmasked API")
|
9 |
|
10 |
-
#
|
11 |
app.add_middleware(
|
12 |
CORSMiddleware,
|
13 |
-
allow_origins=["
|
14 |
allow_credentials=True,
|
15 |
-
allow_methods=["*"],
|
16 |
-
allow_headers=["*"],
|
17 |
)
|
18 |
|
|
|
|
|
|
|
|
|
19 |
# Initialize Supabase connection
|
20 |
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
21 |
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
|
22 |
|
23 |
if SUPABASE_URL and SUPABASE_KEY:
|
24 |
-
|
25 |
-
print("Connected to Supabase successfully!")
|
26 |
else:
|
27 |
-
print("Supabase
|
28 |
|
29 |
# Include routers for analysis and health
|
30 |
app.include_router(analyze.router, prefix="/api")
|
|
|
1 |
import os
|
2 |
+
from dotenv import load_dotenv
|
3 |
from fastapi import FastAPI
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
from supabase import create_client, AsyncClient
|
6 |
from app.routers import analyze, health
|
7 |
|
8 |
+
# Load environment variables first
|
9 |
+
load_dotenv()
|
10 |
+
|
11 |
# FastAPI app setup
|
12 |
app = FastAPI(title="MediaUnmasked API")
|
13 |
|
14 |
+
# Configure CORS
|
15 |
app.add_middleware(
|
16 |
CORSMiddleware,
|
17 |
+
allow_origins=["http://localhost:5173", "http://localhost:5174"], # Your frontend URLs
|
18 |
allow_credentials=True,
|
19 |
+
allow_methods=["*"],
|
20 |
+
allow_headers=["*"],
|
21 |
)
|
22 |
|
23 |
+
# Print environment variables for debugging
|
24 |
+
print(f"SUPABASE_URL: {os.getenv('SUPABASE_URL')}")
|
25 |
+
print(f"SUPABASE_KEY: {os.getenv('SUPABASE_KEY')}")
|
26 |
+
|
27 |
# Initialize Supabase connection
|
28 |
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
29 |
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
|
30 |
|
31 |
if SUPABASE_URL and SUPABASE_KEY:
|
32 |
+
print("Supabase credentials loaded successfully!")
|
|
|
33 |
else:
|
34 |
+
print("Warning: Supabase credentials not found in environment variables")
|
35 |
|
36 |
# Include routers for analysis and health
|
37 |
app.include_router(analyze.router, prefix="/api")
|
app/routers/analyze.py
CHANGED
@@ -3,12 +3,16 @@ from pydantic import BaseModel, HttpUrl
|
|
3 |
from typing import Dict, Any, List
|
4 |
import logging
|
5 |
import os
|
6 |
-
from supabase import
|
|
|
7 |
|
8 |
from mediaunmasked.scrapers.article_scraper import ArticleScraper
|
9 |
from mediaunmasked.analyzers.scoring import MediaScorer
|
10 |
from mediaunmasked.utils.logging_config import setup_logging
|
11 |
|
|
|
|
|
|
|
12 |
# Initialize logging
|
13 |
setup_logging()
|
14 |
logger = logging.getLogger(__name__)
|
@@ -18,10 +22,15 @@ router = APIRouter(tags=["analysis"])
|
|
18 |
scraper = ArticleScraper()
|
19 |
scorer = MediaScorer()
|
20 |
|
21 |
-
#
|
22 |
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
23 |
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
class ArticleRequest(BaseModel):
|
27 |
url: HttpUrl
|
|
|
3 |
from typing import Dict, Any, List
|
4 |
import logging
|
5 |
import os
|
6 |
+
from supabase import AsyncClient
|
7 |
+
from dotenv import load_dotenv
|
8 |
|
9 |
from mediaunmasked.scrapers.article_scraper import ArticleScraper
|
10 |
from mediaunmasked.analyzers.scoring import MediaScorer
|
11 |
from mediaunmasked.utils.logging_config import setup_logging
|
12 |
|
13 |
+
# Load environment variables
|
14 |
+
load_dotenv()
|
15 |
+
|
16 |
# Initialize logging
|
17 |
setup_logging()
|
18 |
logger = logging.getLogger(__name__)
|
|
|
22 |
scraper = ArticleScraper()
|
23 |
scorer = MediaScorer()
|
24 |
|
25 |
+
# Get Supabase credentials
|
26 |
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
27 |
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
|
28 |
+
|
29 |
+
# Initialize Supabase client
|
30 |
+
if not SUPABASE_URL or not SUPABASE_KEY:
|
31 |
+
raise Exception("Supabase credentials not found in environment variables")
|
32 |
+
|
33 |
+
supabase = AsyncClient(SUPABASE_URL, SUPABASE_KEY)
|
34 |
|
35 |
class ArticleRequest(BaseModel):
|
36 |
url: HttpUrl
|
mediaunmasked/scrapers/article_scraper.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
-
from typing import Dict, Optional
|
2 |
import logging
|
3 |
from urllib.parse import urlparse
|
4 |
import requests
|
5 |
-
from bs4 import BeautifulSoup
|
6 |
|
7 |
from ..utils.logging_config import setup_logging
|
8 |
|
@@ -30,69 +30,119 @@ class ArticleScraper:
|
|
30 |
self.logger.error(f"Error fetching {url}: {str(e)}")
|
31 |
return None
|
32 |
|
33 |
-
def
|
34 |
-
"""
|
35 |
-
|
36 |
-
|
37 |
-
headline = headline_elem.get_text().strip() if headline_elem else ''
|
38 |
-
self.logger.info(f"Found headline: {headline}")
|
39 |
|
40 |
-
#
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
content = ' '.join(p.get_text().strip() for p in paragraphs)
|
52 |
-
else:
|
53 |
-
content = article.get_text().strip()
|
54 |
-
else:
|
55 |
-
self.logger.warning("No article tag found")
|
56 |
-
content = ''
|
57 |
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
-
def
|
61 |
-
"""Extract content from
|
62 |
try:
|
63 |
-
headline
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
headline = soup.find('h1')
|
68 |
-
|
69 |
-
|
70 |
-
self.logger.info(f"Found headline: {
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
86 |
break
|
87 |
-
|
|
|
|
|
88 |
if not content:
|
89 |
self.logger.warning("No content found in article")
|
90 |
-
|
91 |
-
|
92 |
-
return {"headline":
|
93 |
|
94 |
except Exception as e:
|
95 |
-
self.logger.error(f"Error extracting
|
96 |
return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}
|
97 |
|
98 |
def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
|
@@ -109,31 +159,4 @@ class ArticleScraper:
|
|
109 |
domain = self._get_domain(url)
|
110 |
|
111 |
self.logger.info(f"Scraping article from domain: {domain}")
|
112 |
-
|
113 |
-
# Select appropriate extractor based on domain
|
114 |
-
if 'snopes.com' in domain:
|
115 |
-
result = self._extract_snopes(soup)
|
116 |
-
if not result['headline'] or not result['content']:
|
117 |
-
self.logger.warning("Failed to extract content from Snopes article")
|
118 |
-
self.logger.debug(f"HTML content: {html_content[:500]}...")
|
119 |
-
return result
|
120 |
-
elif 'politifact.com' in domain:
|
121 |
-
return self._extract_politifact(soup)
|
122 |
-
else:
|
123 |
-
# Generic extraction fallback
|
124 |
-
headline = soup.find('h1').get_text().strip() if soup.find('h1') else ''
|
125 |
-
|
126 |
-
# Try common content selectors
|
127 |
-
content_selectors = ['article', 'main', '.content', '.article-content']
|
128 |
-
content = ''
|
129 |
-
|
130 |
-
for selector in content_selectors:
|
131 |
-
content_div = soup.select_one(selector)
|
132 |
-
if content_div:
|
133 |
-
# Remove unwanted elements
|
134 |
-
for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
|
135 |
-
unwanted.decompose()
|
136 |
-
content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
|
137 |
-
break
|
138 |
-
|
139 |
-
return {"headline": headline, "content": content}
|
|
|
1 |
+
from typing import Dict, Optional, List
|
2 |
import logging
|
3 |
from urllib.parse import urlparse
|
4 |
import requests
|
5 |
+
from bs4 import BeautifulSoup, NavigableString
|
6 |
|
7 |
from ..utils.logging_config import setup_logging
|
8 |
|
|
|
30 |
self.logger.error(f"Error fetching {url}: {str(e)}")
|
31 |
return None
|
32 |
|
33 |
+
def _process_element(self, element) -> str:
|
34 |
+
"""Process an HTML element while preserving its structure and formatting."""
|
35 |
+
if isinstance(element, NavigableString):
|
36 |
+
return str(element)
|
|
|
|
|
37 |
|
38 |
+
# Handle different types of elements
|
39 |
+
tag_name = element.name
|
40 |
+
|
41 |
+
if tag_name in ['p', 'div']:
|
42 |
+
return '\n\n' + ''.join(self._process_element(child) for child in element.children).strip()
|
43 |
+
|
44 |
+
elif tag_name in ['ul', 'ol']:
|
45 |
+
items = []
|
46 |
+
for li in element.find_all('li', recursive=False):
|
47 |
+
prefix = '• ' if tag_name == 'ul' else f"{len(items) + 1}. "
|
48 |
+
items.append(prefix + ''.join(self._process_element(child) for child in li.children).strip())
|
49 |
+
return '\n' + '\n'.join(items) + '\n'
|
50 |
+
|
51 |
+
elif tag_name == 'br':
|
52 |
+
return '\n'
|
53 |
+
|
54 |
+
elif tag_name in ['strong', 'b']:
|
55 |
+
return '**' + ''.join(self._process_element(child) for child in element.children) + '**'
|
56 |
+
|
57 |
+
elif tag_name in ['em', 'i']:
|
58 |
+
return '_' + ''.join(self._process_element(child) for child in element.children) + '_'
|
59 |
+
|
60 |
+
elif tag_name == 'a':
|
61 |
+
text = ''.join(self._process_element(child) for child in element.children)
|
62 |
+
href = element.get('href', '')
|
63 |
+
return f'[{text}]({href})'
|
64 |
+
|
65 |
+
elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
66 |
+
level = int(tag_name[1])
|
67 |
+
prefix = '#' * (level + 1) # Add one more # to match test expectations
|
68 |
+
return f'\n\n{prefix} ' + ''.join(self._process_element(child) for child in element.children).strip() + '\n'
|
69 |
+
|
70 |
+
# For other elements, just process their children
|
71 |
+
return ''.join(self._process_element(child) for child in element.children)
|
72 |
+
|
73 |
+
def _extract_content(self, container) -> str:
|
74 |
+
"""Extract and format content from a container element."""
|
75 |
+
if not container:
|
76 |
+
return ''
|
77 |
|
78 |
+
# Remove unwanted elements
|
79 |
+
for unwanted in container.find_all(['script', 'style', 'iframe', 'aside']):
|
80 |
+
unwanted.decompose()
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
+
# Process the container
|
83 |
+
content = self._process_element(container)
|
84 |
+
|
85 |
+
# Clean up extra whitespace and newlines
|
86 |
+
content = '\n'.join(line.strip() for line in content.split('\n'))
|
87 |
+
content = '\n'.join(filter(None, content.split('\n')))
|
88 |
+
|
89 |
+
return content.strip()
|
90 |
|
91 |
+
def _extract_article(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
|
92 |
+
"""Extract content from any article, with special handling for known domains."""
|
93 |
try:
|
94 |
+
# Find headline - try domain-specific selectors first, then fallback to generic
|
95 |
+
headline = None
|
96 |
+
headline_selectors = {
|
97 |
+
'politifact.com': ['h1.article__title'],
|
98 |
+
'snopes.com': ['header h1', 'article h1']
|
99 |
+
}
|
100 |
+
|
101 |
+
# Try domain-specific headline selectors
|
102 |
+
if domain in headline_selectors:
|
103 |
+
for selector in headline_selectors[domain]:
|
104 |
+
headline = soup.select_one(selector)
|
105 |
+
if headline:
|
106 |
+
break
|
107 |
+
|
108 |
+
# Fallback to any h1 if no domain-specific headline found
|
109 |
+
if not headline:
|
110 |
headline = soup.find('h1')
|
111 |
+
|
112 |
+
headline_text = headline.get_text().strip() if headline else "No headline found"
|
113 |
+
self.logger.info(f"Found headline: {headline_text}")
|
114 |
+
|
115 |
+
# Find content - try domain-specific selectors first, then fallback to generic
|
116 |
+
content_div = None
|
117 |
+
content_selectors = {
|
118 |
+
'politifact.com': ['article.article', '.article__text', '.m-textblock'],
|
119 |
+
'snopes.com': ['article']
|
120 |
+
}
|
121 |
+
|
122 |
+
# Try domain-specific content selectors
|
123 |
+
if domain in content_selectors:
|
124 |
+
for selector in content_selectors[domain]:
|
125 |
+
content_div = soup.select_one(selector)
|
126 |
+
if content_div:
|
127 |
+
break
|
128 |
+
|
129 |
+
# Fallback to generic content selectors
|
130 |
+
if not content_div:
|
131 |
+
for selector in ['article', 'main', '.content', '.article-content']:
|
132 |
+
content_div = soup.select_one(selector)
|
133 |
+
if content_div:
|
134 |
break
|
135 |
+
|
136 |
+
content = self._extract_content(content_div) if content_div else "No content found"
|
137 |
+
|
138 |
if not content:
|
139 |
self.logger.warning("No content found in article")
|
140 |
+
self.logger.debug(f"Domain: {domain}")
|
141 |
+
|
142 |
+
return {"headline": headline_text, "content": content}
|
143 |
|
144 |
except Exception as e:
|
145 |
+
self.logger.error(f"Error extracting article content: {str(e)}")
|
146 |
return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}
|
147 |
|
148 |
def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
|
|
|
159 |
domain = self._get_domain(url)
|
160 |
|
161 |
self.logger.info(f"Scraping article from domain: {domain}")
|
162 |
+
return self._extract_article(soup, domain)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mediaunmasked/tests/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Test package initialization
|
mediaunmasked/tests/test_article_scraper.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
from mediaunmasked.scrapers.article_scraper import ArticleScraper
|
4 |
+
|
5 |
+
class TestArticleScraper(unittest.TestCase):
|
6 |
+
def setUp(self):
|
7 |
+
self.scraper = ArticleScraper()
|
8 |
+
|
9 |
+
def test_process_element_formatting(self):
|
10 |
+
"""Test that _process_element preserves various HTML formatting."""
|
11 |
+
# Test complex nested HTML with multiple formatting elements
|
12 |
+
html = """
|
13 |
+
<div>
|
14 |
+
<h1>Main Title</h1>
|
15 |
+
<p>This is a <strong>bold</strong> and <em>italic</em> text.</p>
|
16 |
+
<p>This is a <a href="https://example.com">link</a> in a paragraph.</p>
|
17 |
+
<ul>
|
18 |
+
<li>First <strong>important</strong> item</li>
|
19 |
+
<li>Second item with <em>emphasis</em></li>
|
20 |
+
</ul>
|
21 |
+
<ol>
|
22 |
+
<li>Numbered item <a href="test.com">with link</a></li>
|
23 |
+
<li>Another numbered item</li>
|
24 |
+
</ol>
|
25 |
+
<div>
|
26 |
+
Nested <br/>content with<br />line breaks
|
27 |
+
</div>
|
28 |
+
</div>
|
29 |
+
"""
|
30 |
+
soup = BeautifulSoup(html, 'html.parser')
|
31 |
+
formatted_content = self.scraper._process_element(soup.div)
|
32 |
+
|
33 |
+
expected_output = """
|
34 |
+
## Main Title
|
35 |
+
|
36 |
+
This is a **bold** and _italic_ text.
|
37 |
+
|
38 |
+
This is a [link](https://example.com) in a paragraph.
|
39 |
+
|
40 |
+
• First **important** item
|
41 |
+
• Second item with _emphasis_
|
42 |
+
|
43 |
+
1. Numbered item [with link](test.com)
|
44 |
+
2. Another numbered item
|
45 |
+
|
46 |
+
Nested
|
47 |
+
content with
|
48 |
+
line breaks""".strip()
|
49 |
+
|
50 |
+
# Normalize whitespace for comparison
|
51 |
+
formatted_content = '\n'.join(line.strip() for line in formatted_content.split('\n') if line.strip())
|
52 |
+
expected_output = '\n'.join(line.strip() for line in expected_output.split('\n') if line.strip())
|
53 |
+
|
54 |
+
self.assertEqual(formatted_content, expected_output)
|
55 |
+
|
56 |
+
def test_extract_snopes_article(self):
|
57 |
+
"""Test extraction of a Snopes-style article with formatting."""
|
58 |
+
html = """
|
59 |
+
<html>
|
60 |
+
<body>
|
61 |
+
<header>
|
62 |
+
<h1>Fact Check: Test Claim</h1>
|
63 |
+
</header>
|
64 |
+
<article>
|
65 |
+
<h2>The Claim</h2>
|
66 |
+
<p>This is the <strong>main claim</strong> being tested.</p>
|
67 |
+
<h2>The Facts</h2>
|
68 |
+
<ul>
|
69 |
+
<li>First important fact with <em>emphasis</em></li>
|
70 |
+
<li>Second fact with a <a href="source.com">source</a></li>
|
71 |
+
</ul>
|
72 |
+
<p>Additional <strong>important</strong> context.</p>
|
73 |
+
</article>
|
74 |
+
</body>
|
75 |
+
</html>
|
76 |
+
"""
|
77 |
+
soup = BeautifulSoup(html, 'html.parser')
|
78 |
+
result = self.scraper._extract_article(soup, 'snopes.com')
|
79 |
+
|
80 |
+
expected_content = """
|
81 |
+
## The Claim
|
82 |
+
|
83 |
+
This is the **main claim** being tested.
|
84 |
+
|
85 |
+
## The Facts
|
86 |
+
|
87 |
+
• First important fact with _emphasis_
|
88 |
+
• Second fact with a [source](source.com)
|
89 |
+
|
90 |
+
Additional **important** context.""".strip()
|
91 |
+
|
92 |
+
self.assertEqual(result['headline'], 'Fact Check: Test Claim')
|
93 |
+
# Normalize whitespace for comparison
|
94 |
+
actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
|
95 |
+
expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
|
96 |
+
self.assertEqual(actual_content, expected_content)
|
97 |
+
|
98 |
+
def test_extract_politifact_article(self):
|
99 |
+
"""Test extraction of a PolitiFact-style article with formatting."""
|
100 |
+
html = """
|
101 |
+
<html>
|
102 |
+
<body>
|
103 |
+
<h1 class="article__title">Test Political Claim</h1>
|
104 |
+
<article class="article">
|
105 |
+
<div class="article__text">
|
106 |
+
<p>Here's a claim with <strong>bold text</strong> and <em>italics</em>.</p>
|
107 |
+
<h3>Our Analysis</h3>
|
108 |
+
<ul>
|
109 |
+
<li>Evidence point 1</li>
|
110 |
+
<li>Evidence point 2 with <a href="proof.com">proof</a></li>
|
111 |
+
</ul>
|
112 |
+
<p>Final assessment with <strong>key points</strong>.</p>
|
113 |
+
</div>
|
114 |
+
</article>
|
115 |
+
</body>
|
116 |
+
</html>
|
117 |
+
"""
|
118 |
+
soup = BeautifulSoup(html, 'html.parser')
|
119 |
+
result = self.scraper._extract_article(soup, 'politifact.com')
|
120 |
+
|
121 |
+
expected_content = """
|
122 |
+
Here's a claim with **bold text** and _italics_.
|
123 |
+
|
124 |
+
### Our Analysis
|
125 |
+
|
126 |
+
• Evidence point 1
|
127 |
+
• Evidence point 2 with [proof](proof.com)
|
128 |
+
|
129 |
+
Final assessment with **key points**.""".strip()
|
130 |
+
|
131 |
+
self.assertEqual(result['headline'], 'Test Political Claim')
|
132 |
+
# Normalize whitespace for comparison
|
133 |
+
actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
|
134 |
+
expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
|
135 |
+
self.assertEqual(actual_content, expected_content)
|
136 |
+
|
137 |
+
def test_extract_generic_article(self):
|
138 |
+
"""Test extraction of a generic article with formatting."""
|
139 |
+
html = """
|
140 |
+
<html>
|
141 |
+
<body>
|
142 |
+
<h1>Generic Article Title</h1>
|
143 |
+
<main>
|
144 |
+
<p>Opening paragraph with <strong>bold</strong> text.</p>
|
145 |
+
<div class="content">
|
146 |
+
<h2>Section Title</h2>
|
147 |
+
<p>Content with <em>italic</em> text and <a href="ref.com">reference</a>.</p>
|
148 |
+
<ul>
|
149 |
+
<li>Point <strong>one</strong></li>
|
150 |
+
<li>Point <em>two</em></li>
|
151 |
+
</ul>
|
152 |
+
</div>
|
153 |
+
</main>
|
154 |
+
</body>
|
155 |
+
</html>
|
156 |
+
"""
|
157 |
+
soup = BeautifulSoup(html, 'html.parser')
|
158 |
+
result = self.scraper._extract_article(soup, 'generic.com')
|
159 |
+
|
160 |
+
expected_content = """
|
161 |
+
Opening paragraph with **bold** text.
|
162 |
+
|
163 |
+
## Section Title
|
164 |
+
|
165 |
+
Content with _italic_ text and [reference](ref.com).
|
166 |
+
|
167 |
+
• Point **one**
|
168 |
+
• Point _two_""".strip()
|
169 |
+
|
170 |
+
self.assertEqual(result['headline'], 'Generic Article Title')
|
171 |
+
# Normalize whitespace for comparison
|
172 |
+
actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
|
173 |
+
expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
|
174 |
+
self.assertEqual(actual_content, expected_content)
|
175 |
+
|
176 |
+
if __name__ == '__main__':
|
177 |
+
unittest.main()
|
tests/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Test package initialization
|
tests/test_article_scraper.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
from ..scrapers.article_scraper import ArticleScraper
|
4 |
+
|
5 |
+
class TestArticleScraper(unittest.TestCase):
|
6 |
+
def setUp(self):
|
7 |
+
self.scraper = ArticleScraper()
|
8 |
+
|
9 |
+
def test_process_element_formatting(self):
|
10 |
+
"""Test that _process_element preserves various HTML formatting."""
|
11 |
+
# Test complex nested HTML with multiple formatting elements
|
12 |
+
html = """
|
13 |
+
<div>
|
14 |
+
<h1>Main Title</h1>
|
15 |
+
<p>This is a <strong>bold</strong> and <em>italic</em> text.</p>
|
16 |
+
<p>This is a <a href="https://example.com">link</a> in a paragraph.</p>
|
17 |
+
<ul>
|
18 |
+
<li>First <strong>important</strong> item</li>
|
19 |
+
<li>Second item with <em>emphasis</em></li>
|
20 |
+
</ul>
|
21 |
+
<ol>
|
22 |
+
<li>Numbered item <a href="test.com">with link</a></li>
|
23 |
+
<li>Another numbered item</li>
|
24 |
+
</ol>
|
25 |
+
<div>
|
26 |
+
Nested <br/>content with<br />line breaks
|
27 |
+
</div>
|
28 |
+
</div>
|
29 |
+
"""
|
30 |
+
soup = BeautifulSoup(html, 'html.parser')
|
31 |
+
formatted_content = self.scraper._process_element(soup.div)
|
32 |
+
|
33 |
+
expected_output = """
|
34 |
+
## Main Title
|
35 |
+
|
36 |
+
This is a **bold** and _italic_ text.
|
37 |
+
|
38 |
+
This is a [link](https://example.com) in a paragraph.
|
39 |
+
|
40 |
+
• First **important** item
|
41 |
+
• Second item with _emphasis_
|
42 |
+
|
43 |
+
1. Numbered item [with link](test.com)
|
44 |
+
2. Another numbered item
|
45 |
+
|
46 |
+
Nested
|
47 |
+
content with
|
48 |
+
line breaks""".strip()
|
49 |
+
|
50 |
+
# Normalize whitespace for comparison
|
51 |
+
formatted_content = '\n'.join(line.strip() for line in formatted_content.split('\n') if line.strip())
|
52 |
+
expected_output = '\n'.join(line.strip() for line in expected_output.split('\n') if line.strip())
|
53 |
+
|
54 |
+
self.assertEqual(formatted_content, expected_output)
|
55 |
+
|
56 |
+
def test_extract_snopes_article(self):
|
57 |
+
"""Test extraction of a Snopes-style article with formatting."""
|
58 |
+
html = """
|
59 |
+
<html>
|
60 |
+
<body>
|
61 |
+
<header>
|
62 |
+
<h1>Fact Check: Test Claim</h1>
|
63 |
+
</header>
|
64 |
+
<article>
|
65 |
+
<h2>The Claim</h2>
|
66 |
+
<p>This is the <strong>main claim</strong> being tested.</p>
|
67 |
+
<h2>The Facts</h2>
|
68 |
+
<ul>
|
69 |
+
<li>First important fact with <em>emphasis</em></li>
|
70 |
+
<li>Second fact with a <a href="source.com">source</a></li>
|
71 |
+
</ul>
|
72 |
+
<p>Additional <strong>important</strong> context.</p>
|
73 |
+
</article>
|
74 |
+
</body>
|
75 |
+
</html>
|
76 |
+
"""
|
77 |
+
soup = BeautifulSoup(html, 'html.parser')
|
78 |
+
result = self.scraper._extract_article(soup, 'snopes.com')
|
79 |
+
|
80 |
+
expected_content = """
|
81 |
+
## The Claim
|
82 |
+
|
83 |
+
This is the **main claim** being tested.
|
84 |
+
|
85 |
+
## The Facts
|
86 |
+
|
87 |
+
• First important fact with _emphasis_
|
88 |
+
• Second fact with a [source](source.com)
|
89 |
+
|
90 |
+
Additional **important** context.""".strip()
|
91 |
+
|
92 |
+
self.assertEqual(result['headline'], 'Fact Check: Test Claim')
|
93 |
+
# Normalize whitespace for comparison
|
94 |
+
actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
|
95 |
+
expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
|
96 |
+
self.assertEqual(actual_content, expected_content)
|
97 |
+
|
98 |
+
def test_extract_politifact_article(self):
|
99 |
+
"""Test extraction of a PolitiFact-style article with formatting."""
|
100 |
+
html = """
|
101 |
+
<html>
|
102 |
+
<body>
|
103 |
+
<h1 class="article__title">Test Political Claim</h1>
|
104 |
+
<article class="article">
|
105 |
+
<div class="article__text">
|
106 |
+
<p>Here's a claim with <strong>bold text</strong> and <em>italics</em>.</p>
|
107 |
+
<h3>Our Analysis</h3>
|
108 |
+
<ul>
|
109 |
+
<li>Evidence point 1</li>
|
110 |
+
<li>Evidence point 2 with <a href="proof.com">proof</a></li>
|
111 |
+
</ul>
|
112 |
+
<p>Final assessment with <strong>key points</strong>.</p>
|
113 |
+
</div>
|
114 |
+
</article>
|
115 |
+
</body>
|
116 |
+
</html>
|
117 |
+
"""
|
118 |
+
soup = BeautifulSoup(html, 'html.parser')
|
119 |
+
result = self.scraper._extract_article(soup, 'politifact.com')
|
120 |
+
|
121 |
+
expected_content = """
|
122 |
+
Here's a claim with **bold text** and _italics_.
|
123 |
+
|
124 |
+
### Our Analysis
|
125 |
+
|
126 |
+
• Evidence point 1
|
127 |
+
• Evidence point 2 with [proof](proof.com)
|
128 |
+
|
129 |
+
Final assessment with **key points**.""".strip()
|
130 |
+
|
131 |
+
self.assertEqual(result['headline'], 'Test Political Claim')
|
132 |
+
# Normalize whitespace for comparison
|
133 |
+
actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
|
134 |
+
expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
|
135 |
+
self.assertEqual(actual_content, expected_content)
|
136 |
+
|
137 |
+
def test_extract_generic_article(self):
|
138 |
+
"""Test extraction of a generic article with formatting."""
|
139 |
+
html = """
|
140 |
+
<html>
|
141 |
+
<body>
|
142 |
+
<h1>Generic Article Title</h1>
|
143 |
+
<main>
|
144 |
+
<p>Opening paragraph with <strong>bold</strong> text.</p>
|
145 |
+
<div class="content">
|
146 |
+
<h2>Section Title</h2>
|
147 |
+
<p>Content with <em>italic</em> text and <a href="ref.com">reference</a>.</p>
|
148 |
+
<ul>
|
149 |
+
<li>Point <strong>one</strong></li>
|
150 |
+
<li>Point <em>two</em></li>
|
151 |
+
</ul>
|
152 |
+
</div>
|
153 |
+
</main>
|
154 |
+
</body>
|
155 |
+
</html>
|
156 |
+
"""
|
157 |
+
soup = BeautifulSoup(html, 'html.parser')
|
158 |
+
result = self.scraper._extract_article(soup, 'generic.com')
|
159 |
+
|
160 |
+
expected_content = """
|
161 |
+
Opening paragraph with **bold** text.
|
162 |
+
|
163 |
+
## Section Title
|
164 |
+
|
165 |
+
Content with _italic_ text and [reference](ref.com).
|
166 |
+
|
167 |
+
• Point **one**
|
168 |
+
• Point _two_""".strip()
|
169 |
+
|
170 |
+
self.assertEqual(result['headline'], 'Generic Article Title')
|
171 |
+
# Normalize whitespace for comparison
|
172 |
+
actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
|
173 |
+
expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
|
174 |
+
self.assertEqual(actual_content, expected_content)
|
175 |
+
|
176 |
+
if __name__ == '__main__':
|
177 |
+
unittest.main()
|