Spaces:

wozwize
/

media-unmasked-api

Running

App Files Files Community

wozwize commited on Feb 25

Commit

a9d5552

1 Parent(s): 65760ac

updating response structure to maintain formatting of article

Browse files

Files changed (7) hide show

app/main.py +14 -7
app/routers/analyze.py +12 -3
mediaunmasked/scrapers/article_scraper.py +105 -82
mediaunmasked/tests/__init__.py +1 -0
mediaunmasked/tests/test_article_scraper.py +177 -0
tests/__init__.py +1 -0
tests/test_article_scraper.py +177 -0

app/main.py CHANGED Viewed

@@ -1,30 +1,37 @@
 import os
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from supabase import create_client, AsyncClient
 from app.routers import analyze, health
 # FastAPI app setup
 app = FastAPI(title="MediaUnmasked API")
-# ✅ Enable CORS for Swagger UI
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Allow all origins (or specify ["http://localhost:7860"] for local testing)
     allow_credentials=True,
-    allow_methods=["*"],  # Allow all methods
-    allow_headers=["*"],  # Allow all headers
 )
 # Initialize Supabase connection
 SUPABASE_URL = os.getenv("SUPABASE_URL")
 SUPABASE_KEY = os.getenv("SUPABASE_KEY")
 if SUPABASE_URL and SUPABASE_KEY:
-    supabase = AsyncClient(SUPABASE_URL, SUPABASE_KEY)
-    print("Connected to Supabase successfully!")
 else:
-    print("Supabase connection failed. Please check your secrets.")
 # Include routers for analysis and health
 app.include_router(analyze.router, prefix="/api")

 import os
+from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from supabase import create_client, AsyncClient
 from app.routers import analyze, health
+# Load environment variables first
+load_dotenv()
 # FastAPI app setup
 app = FastAPI(title="MediaUnmasked API")
+# Configure CORS
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["http://localhost:5173", "http://localhost:5174"],  # Your frontend URLs
     allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
 )
+# Print environment variables for debugging
+print(f"SUPABASE_URL: {os.getenv('SUPABASE_URL')}")
+print(f"SUPABASE_KEY: {os.getenv('SUPABASE_KEY')}")
 # Initialize Supabase connection
 SUPABASE_URL = os.getenv("SUPABASE_URL")
 SUPABASE_KEY = os.getenv("SUPABASE_KEY")
 if SUPABASE_URL and SUPABASE_KEY:
+    print("Supabase credentials loaded successfully!")
 else:
+    print("Warning: Supabase credentials not found in environment variables")
 # Include routers for analysis and health
 app.include_router(analyze.router, prefix="/api")

app/routers/analyze.py CHANGED Viewed

@@ -3,12 +3,16 @@ from pydantic import BaseModel, HttpUrl
 from typing import Dict, Any, List
 import logging
 import os
-from supabase import create_client, AsyncClient
 from mediaunmasked.scrapers.article_scraper import ArticleScraper
 from mediaunmasked.analyzers.scoring import MediaScorer
 from mediaunmasked.utils.logging_config import setup_logging
 # Initialize logging
 setup_logging()
 logger = logging.getLogger(__name__)
@@ -18,10 +22,15 @@ router = APIRouter(tags=["analysis"])
 scraper = ArticleScraper()
 scorer = MediaScorer()
-# Initialize Supabase connection (works for async environments)
 SUPABASE_URL = os.getenv("SUPABASE_URL")
 SUPABASE_KEY = os.getenv("SUPABASE_KEY")
-supabase = AsyncClient(SUPABASE_URL, SUPABASE_KEY)  # This works for async
 class ArticleRequest(BaseModel):
     url: HttpUrl

 from typing import Dict, Any, List
 import logging
 import os
+from supabase import AsyncClient
+from dotenv import load_dotenv
 from mediaunmasked.scrapers.article_scraper import ArticleScraper
 from mediaunmasked.analyzers.scoring import MediaScorer
 from mediaunmasked.utils.logging_config import setup_logging
+# Load environment variables
+load_dotenv()
 # Initialize logging
 setup_logging()
 logger = logging.getLogger(__name__)
 scraper = ArticleScraper()
 scorer = MediaScorer()
+# Get Supabase credentials
 SUPABASE_URL = os.getenv("SUPABASE_URL")
 SUPABASE_KEY = os.getenv("SUPABASE_KEY")
+# Initialize Supabase client
+if not SUPABASE_URL or not SUPABASE_KEY:
+    raise Exception("Supabase credentials not found in environment variables")
+supabase = AsyncClient(SUPABASE_URL, SUPABASE_KEY)
 class ArticleRequest(BaseModel):
     url: HttpUrl

mediaunmasked/scrapers/article_scraper.py CHANGED Viewed

@@ -1,8 +1,8 @@
-from typing import Dict, Optional
 import logging
 from urllib.parse import urlparse
 import requests
-from bs4 import BeautifulSoup
 from ..utils.logging_config import setup_logging
@@ -30,69 +30,119 @@ class ArticleScraper:
             self.logger.error(f"Error fetching {url}: {str(e)}")
             return None
-    def _extract_snopes(self, soup: BeautifulSoup) -> Dict[str, str]:
-        """Extract content from Snopes articles."""
-        # Get headline from any h1 tag since it doesn't have a specific class
-        headline_elem = soup.find('h1')
-        headline = headline_elem.get_text().strip() if headline_elem else ''
-        self.logger.info(f"Found headline: {headline}")
-        # Try to find the article content
-        article = soup.find('article')
-        if article:
-            self.logger.info("Found article tag")
-            # Remove unwanted elements
-            for unwanted in article.find_all(['script', 'style', 'iframe', 'aside']):
-                unwanted.decompose()
-            # Get all paragraphs from the article
-            paragraphs = article.find_all('p')
-            if paragraphs:
-                content = ' '.join(p.get_text().strip() for p in paragraphs)
-            else:
-                content = article.get_text().strip()
-        else:
-            self.logger.warning("No article tag found")
-            content = ''
-        return {"headline": headline, "content": content}
-    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
-        """Extract content from PolitiFact articles."""
         try:
-            headline = soup.find('h1', class_='article__title')
-            if headline:
-                headline = headline.get_text().strip()
-            else:
                 headline = soup.find('h1')
-                headline = headline.get_text().strip() if headline else "No headline found"
-            self.logger.info(f"Found headline: {headline}")
-            content_div = soup.find('article', class_='article')
-            if content_div:
-                # Remove unwanted elements
-                for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
-                    unwanted.decompose()
-                content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
-            else:
-                # Try alternative content selectors
-                content_selectors = ['.article__text', '.m-textblock']
-                content = ''
-                for selector in content_selectors:
-                    content_elem = soup.select_one(selector)
-                    if content_elem:
-                        content = ' '.join(p.get_text().strip() for p in content_elem.find_all('p'))
                         break
             if not content:
                 self.logger.warning("No content found in article")
-                content = "No content found"
-            return {"headline": headline, "content": content}
         except Exception as e:
-            self.logger.error(f"Error extracting PolitiFact content: {str(e)}")
             return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}
     def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
@@ -109,31 +159,4 @@ class ArticleScraper:
         domain = self._get_domain(url)
         self.logger.info(f"Scraping article from domain: {domain}")
-        # Select appropriate extractor based on domain
-        if 'snopes.com' in domain:
-            result = self._extract_snopes(soup)
-            if not result['headline'] or not result['content']:
-                self.logger.warning("Failed to extract content from Snopes article")
-                self.logger.debug(f"HTML content: {html_content[:500]}...")
-            return result
-        elif 'politifact.com' in domain:
-            return self._extract_politifact(soup)
-        else:
-            # Generic extraction fallback
-            headline = soup.find('h1').get_text().strip() if soup.find('h1') else ''
-            # Try common content selectors
-            content_selectors = ['article', 'main', '.content', '.article-content']
-            content = ''
-            for selector in content_selectors:
-                content_div = soup.select_one(selector)
-                if content_div:
-                    # Remove unwanted elements
-                    for unwanted in content_div.find_all(['script', 'style', 'iframe', 'aside']):
-                        unwanted.decompose()
-                    content = ' '.join(p.get_text().strip() for p in content_div.find_all('p'))
-                    break
-            return {"headline": headline, "content": content}

+from typing import Dict, Optional, List
 import logging
 from urllib.parse import urlparse
 import requests
+from bs4 import BeautifulSoup, NavigableString
 from ..utils.logging_config import setup_logging
             self.logger.error(f"Error fetching {url}: {str(e)}")
             return None
+    def _process_element(self, element) -> str:
+        """Process an HTML element while preserving its structure and formatting."""
+        if isinstance(element, NavigableString):
+            return str(element)
+        # Handle different types of elements
+        tag_name = element.name
+        if tag_name in ['p', 'div']:
+            return '\n\n' + ''.join(self._process_element(child) for child in element.children).strip()
+        elif tag_name in ['ul', 'ol']:
+            items = []
+            for li in element.find_all('li', recursive=False):
+                prefix = '• ' if tag_name == 'ul' else f"{len(items) + 1}. "
+                items.append(prefix + ''.join(self._process_element(child) for child in li.children).strip())
+            return '\n' + '\n'.join(items) + '\n'
+        elif tag_name == 'br':
+            return '\n'
+        elif tag_name in ['strong', 'b']:
+            return '**' + ''.join(self._process_element(child) for child in element.children) + '**'
+        elif tag_name in ['em', 'i']:
+            return '_' + ''.join(self._process_element(child) for child in element.children) + '_'
+        elif tag_name == 'a':
+            text = ''.join(self._process_element(child) for child in element.children)
+            href = element.get('href', '')
+            return f'[{text}]({href})'
+        elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            level = int(tag_name[1])
+            prefix = '#' * (level + 1)  # Add one more # to match test expectations
+            return f'\n\n{prefix} ' + ''.join(self._process_element(child) for child in element.children).strip() + '\n'
+        # For other elements, just process their children
+        return ''.join(self._process_element(child) for child in element.children)
+    def _extract_content(self, container) -> str:
+        """Extract and format content from a container element."""
+        if not container:
+            return ''
+        # Remove unwanted elements
+        for unwanted in container.find_all(['script', 'style', 'iframe', 'aside']):
+            unwanted.decompose()
+        # Process the container
+        content = self._process_element(container)
+        # Clean up extra whitespace and newlines
+        content = '\n'.join(line.strip() for line in content.split('\n'))
+        content = '\n'.join(filter(None, content.split('\n')))
+        return content.strip()
+    def _extract_article(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
+        """Extract content from any article, with special handling for known domains."""
         try:
+            # Find headline - try domain-specific selectors first, then fallback to generic
+            headline = None
+            headline_selectors = {
+                'politifact.com': ['h1.article__title'],
+                'snopes.com': ['header h1', 'article h1']
+            }
+            # Try domain-specific headline selectors
+            if domain in headline_selectors:
+                for selector in headline_selectors[domain]:
+                    headline = soup.select_one(selector)
+                    if headline:
+                        break
+            # Fallback to any h1 if no domain-specific headline found
+            if not headline:
                 headline = soup.find('h1')
+            headline_text = headline.get_text().strip() if headline else "No headline found"
+            self.logger.info(f"Found headline: {headline_text}")
+            # Find content - try domain-specific selectors first, then fallback to generic
+            content_div = None
+            content_selectors = {
+                'politifact.com': ['article.article', '.article__text', '.m-textblock'],
+                'snopes.com': ['article']
+            }
+            # Try domain-specific content selectors
+            if domain in content_selectors:
+                for selector in content_selectors[domain]:
+                    content_div = soup.select_one(selector)
+                    if content_div:
+                        break
+            # Fallback to generic content selectors
+            if not content_div:
+                for selector in ['article', 'main', '.content', '.article-content']:
+                    content_div = soup.select_one(selector)
+                    if content_div:
                         break
+            content = self._extract_content(content_div) if content_div else "No content found"
             if not content:
                 self.logger.warning("No content found in article")
+                self.logger.debug(f"Domain: {domain}")
+            return {"headline": headline_text, "content": content}
         except Exception as e:
+            self.logger.error(f"Error extracting article content: {str(e)}")
             return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}
     def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
         domain = self._get_domain(url)
         self.logger.info(f"Scraping article from domain: {domain}")
+        return self._extract_article(soup, domain)

mediaunmasked/tests/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Test package initialization

mediaunmasked/tests/test_article_scraper.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import unittest
+from bs4 import BeautifulSoup
+from mediaunmasked.scrapers.article_scraper import ArticleScraper
+class TestArticleScraper(unittest.TestCase):
+    def setUp(self):
+        self.scraper = ArticleScraper()
+    def test_process_element_formatting(self):
+        """Test that _process_element preserves various HTML formatting."""
+        # Test complex nested HTML with multiple formatting elements
+        html = """
+        <div>
+            <h1>Main Title</h1>
+            <p>This is a <strong>bold</strong> and <em>italic</em> text.</p>
+            <p>This is a <a href="https://example.com">link</a> in a paragraph.</p>
+            <ul>
+                <li>First <strong>important</strong> item</li>
+                <li>Second item with <em>emphasis</em></li>
+            </ul>
+            <ol>
+                <li>Numbered item <a href="test.com">with link</a></li>
+                <li>Another numbered item</li>
+            </ol>
+            <div>
+                Nested <br/>content with<br />line breaks
+            </div>
+        </div>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        formatted_content = self.scraper._process_element(soup.div)
+        expected_output = """
+## Main Title
+This is a **bold** and _italic_ text.
+This is a [link](https://example.com) in a paragraph.
+• First **important** item
+• Second item with _emphasis_
+1. Numbered item [with link](test.com)
+2. Another numbered item
+Nested
+content with
+line breaks""".strip()
+        # Normalize whitespace for comparison
+        formatted_content = '\n'.join(line.strip() for line in formatted_content.split('\n') if line.strip())
+        expected_output = '\n'.join(line.strip() for line in expected_output.split('\n') if line.strip())
+        self.assertEqual(formatted_content, expected_output)
+    def test_extract_snopes_article(self):
+        """Test extraction of a Snopes-style article with formatting."""
+        html = """
+        <html>
+            <body>
+                <header>
+                    <h1>Fact Check: Test Claim</h1>
+                </header>
+                <article>
+                    <h2>The Claim</h2>
+                    <p>This is the <strong>main claim</strong> being tested.</p>
+                    <h2>The Facts</h2>
+                    <ul>
+                        <li>First important fact with <em>emphasis</em></li>
+                        <li>Second fact with a <a href="source.com">source</a></li>
+                    </ul>
+                    <p>Additional <strong>important</strong> context.</p>
+                </article>
+            </body>
+        </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'snopes.com')
+        expected_content = """
+## The Claim
+This is the **main claim** being tested.
+## The Facts
+• First important fact with _emphasis_
+• Second fact with a [source](source.com)
+Additional **important** context.""".strip()
+        self.assertEqual(result['headline'], 'Fact Check: Test Claim')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+    def test_extract_politifact_article(self):
+        """Test extraction of a PolitiFact-style article with formatting."""
+        html = """
+        <html>
+            <body>
+                <h1 class="article__title">Test Political Claim</h1>
+                <article class="article">
+                    <div class="article__text">
+                        <p>Here's a claim with <strong>bold text</strong> and <em>italics</em>.</p>
+                        <h3>Our Analysis</h3>
+                        <ul>
+                            <li>Evidence point 1</li>
+                            <li>Evidence point 2 with <a href="proof.com">proof</a></li>
+                        </ul>
+                        <p>Final assessment with <strong>key points</strong>.</p>
+                    </div>
+                </article>
+            </body>
+        </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'politifact.com')
+        expected_content = """
+Here's a claim with **bold text** and _italics_.
+### Our Analysis
+• Evidence point 1
+• Evidence point 2 with [proof](proof.com)
+Final assessment with **key points**.""".strip()
+        self.assertEqual(result['headline'], 'Test Political Claim')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+    def test_extract_generic_article(self):
+        """Test extraction of a generic article with formatting."""
+        html = """
+        <html>
+            <body>
+                <h1>Generic Article Title</h1>
+                <main>
+                    <p>Opening paragraph with <strong>bold</strong> text.</p>
+                    <div class="content">
+                        <h2>Section Title</h2>
+                        <p>Content with <em>italic</em> text and <a href="ref.com">reference</a>.</p>
+                        <ul>
+                            <li>Point <strong>one</strong></li>
+                            <li>Point <em>two</em></li>
+                        </ul>
+                    </div>
+                </main>
+            </body>
+        </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'generic.com')
+        expected_content = """
+Opening paragraph with **bold** text.
+## Section Title
+Content with _italic_ text and [reference](ref.com).
+• Point **one**
+• Point _two_""".strip()
+        self.assertEqual(result['headline'], 'Generic Article Title')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+if __name__ == '__main__':
+    unittest.main()

tests/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Test package initialization

tests/test_article_scraper.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import unittest
+from bs4 import BeautifulSoup
+from ..scrapers.article_scraper import ArticleScraper
+class TestArticleScraper(unittest.TestCase):
+    def setUp(self):
+        self.scraper = ArticleScraper()
+    def test_process_element_formatting(self):
+        """Test that _process_element preserves various HTML formatting."""
+        # Test complex nested HTML with multiple formatting elements
+        html = """
+        <div>
+            <h1>Main Title</h1>
+            <p>This is a <strong>bold</strong> and <em>italic</em> text.</p>
+            <p>This is a <a href="https://example.com">link</a> in a paragraph.</p>
+            <ul>
+                <li>First <strong>important</strong> item</li>
+                <li>Second item with <em>emphasis</em></li>
+            </ul>
+            <ol>
+                <li>Numbered item <a href="test.com">with link</a></li>
+                <li>Another numbered item</li>
+            </ol>
+            <div>
+                Nested <br/>content with<br />line breaks
+            </div>
+        </div>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        formatted_content = self.scraper._process_element(soup.div)
+        expected_output = """
+## Main Title
+This is a **bold** and _italic_ text.
+This is a [link](https://example.com) in a paragraph.
+• First **important** item
+• Second item with _emphasis_
+1. Numbered item [with link](test.com)
+2. Another numbered item
+Nested
+content with
+line breaks""".strip()
+        # Normalize whitespace for comparison
+        formatted_content = '\n'.join(line.strip() for line in formatted_content.split('\n') if line.strip())
+        expected_output = '\n'.join(line.strip() for line in expected_output.split('\n') if line.strip())
+        self.assertEqual(formatted_content, expected_output)
+    def test_extract_snopes_article(self):
+        """Test extraction of a Snopes-style article with formatting."""
+        html = """
+        <html>
+            <body>
+                <header>
+                    <h1>Fact Check: Test Claim</h1>
+                </header>
+                <article>
+                    <h2>The Claim</h2>
+                    <p>This is the <strong>main claim</strong> being tested.</p>
+                    <h2>The Facts</h2>
+                    <ul>
+                        <li>First important fact with <em>emphasis</em></li>
+                        <li>Second fact with a <a href="source.com">source</a></li>
+                    </ul>
+                    <p>Additional <strong>important</strong> context.</p>
+                </article>
+            </body>
+        </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'snopes.com')
+        expected_content = """
+## The Claim
+This is the **main claim** being tested.
+## The Facts
+• First important fact with _emphasis_
+• Second fact with a [source](source.com)
+Additional **important** context.""".strip()
+        self.assertEqual(result['headline'], 'Fact Check: Test Claim')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+    def test_extract_politifact_article(self):
+        """Test extraction of a PolitiFact-style article with formatting."""
+        html = """
+        <html>
+            <body>
+                <h1 class="article__title">Test Political Claim</h1>
+                <article class="article">
+                    <div class="article__text">
+                        <p>Here's a claim with <strong>bold text</strong> and <em>italics</em>.</p>
+                        <h3>Our Analysis</h3>
+                        <ul>
+                            <li>Evidence point 1</li>
+                            <li>Evidence point 2 with <a href="proof.com">proof</a></li>
+                        </ul>
+                        <p>Final assessment with <strong>key points</strong>.</p>
+                    </div>
+                </article>
+            </body>
+        </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'politifact.com')
+        expected_content = """
+Here's a claim with **bold text** and _italics_.
+### Our Analysis
+• Evidence point 1
+• Evidence point 2 with [proof](proof.com)
+Final assessment with **key points**.""".strip()
+        self.assertEqual(result['headline'], 'Test Political Claim')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+    def test_extract_generic_article(self):
+        """Test extraction of a generic article with formatting."""
+        html = """
+        <html>
+            <body>
+                <h1>Generic Article Title</h1>
+                <main>
+                    <p>Opening paragraph with <strong>bold</strong> text.</p>
+                    <div class="content">
+                        <h2>Section Title</h2>
+                        <p>Content with <em>italic</em> text and <a href="ref.com">reference</a>.</p>
+                        <ul>
+                            <li>Point <strong>one</strong></li>
+                            <li>Point <em>two</em></li>
+                        </ul>
+                    </div>
+                </main>
+            </body>
+        </html>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        result = self.scraper._extract_article(soup, 'generic.com')
+        expected_content = """
+Opening paragraph with **bold** text.
+## Section Title
+Content with _italic_ text and [reference](ref.com).
+• Point **one**
+• Point _two_""".strip()
+        self.assertEqual(result['headline'], 'Generic Article Title')
+        # Normalize whitespace for comparison
+        actual_content = '\n'.join(line.strip() for line in result['content'].split('\n') if line.strip())
+        expected_content = '\n'.join(line.strip() for line in expected_content.split('\n') if line.strip())
+        self.assertEqual(actual_content, expected_content)
+if __name__ == '__main__':
+    unittest.main()