Upload 3 files
- rich_card_builder.py +71 -0
- scraper.py +73 -0
- summarizer.py +76 -0
rich_card_builder.py
ADDED
@@ -0,0 +1,71 @@
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def select_relevant_image(images: list, text: str) -> str:
    """Select a contextually relevant image from the list based on text."""
    if not images:
        logging.info("No images available, using empty media URL.")
        return ""

    # Prioritize images that are likely content-related (avoid logos, icons)
    for img in images:
        if not any(keyword in img.lower() for keyword in ["logo", "icon", "banner", "ad"]):
            logging.info(f"Selected image: {img}")
            return img

    # Fallback to first image if no clear content image
    logging.info(f"No content image found, using first image: {images[0]}")
    return images[0]

def build_rich_card(scraped_data: dict, summary: dict) -> dict:
    """Build the rich card JSON using only scraped data and summary."""
    logging.info(f"Building rich card with scraped_data: {scraped_data}, summary: {summary}")

    # Select relevant image
    media_url = select_relevant_image(scraped_data.get("images", []), scraped_data.get("text", ""))

    # Use scraped URL
    page_url = scraped_data.get("url", "")

    # Use summary description
    description = summary.get("description", "Explore news and insights.")

    rich_card = {
        "targets": [{"ids": [1368], "targetType": "humans"}],
        "text": description,
        "mediaType": "image",
        "media": media_url,
        "buttons": [
            {
                "type": "weburl",
                "title": "View Now",
                "payload": page_url
            },
            {
                "type": "postback",
                "title": "Learn More",
                "payload": "learn_more",
                "execute": None
            }
        ],
        "quickReplies": [
            {
                "type": "postback",
                "title": "Show Similar",
                "payload": "similar_content"
            },
            {
                "type": "call",
                "title": "Contact Support",
                "payload": "+12345678901"
            }
        ],
        "richCard": {
            "cardOrientation": "VERTICAL",
            "mediaHeight": "MEDIUM"
        }
    }
    logging.info(f"Generated rich card: {rich_card}")
    return rich_card
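A minimal usage sketch (not part of the uploaded files) showing how build_rich_card consumes the dictionaries produced by scraper.py and summarizer.py; the sample URLs and values below are illustrative only:

# Hypothetical example: feed sample scraped data and a sample summary
# into build_rich_card and inspect the resulting payload.
from rich_card_builder import build_rich_card

scraped_data = {
    "url": "https://example.com/article",
    "text": "Example article body text...",
    "images": ["https://example.com/logo.png", "https://example.com/photo.jpg"],
}
summary = {"title": "Example Article", "description": "A short example description."}

card = build_rich_card(scraped_data, summary)
print(card["media"])  # photo.jpg is chosen; logo.png is filtered out by select_relevant_image
print(card["text"])   # the summary description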
scraper.py
ADDED
@@ -0,0 +1,73 @@
from playwright.async_api import async_playwright
from urllib.parse import urljoin, urlparse
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[dict, set]:
    """Scrape a single page for text, images, and links using Playwright."""
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
                viewport={"width": 1280, "height": 720}
            )
            page = await context.new_page()
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await page.wait_for_timeout(2000)

            # Extract text content
            text_content = await page.evaluate("document.body.innerText")
            text_content = ' '.join(text_content.split()) if text_content else ""

            # Extract images (only JPEG, PNG, WebP, exclude data URLs and SVGs)
            images = await page.evaluate(
                """() => {
                    const validExtensions = ['.jpg', '.jpeg', '.png', '.webp'];
                    const imgElements = document.querySelectorAll('img');
                    const imgUrls = new Set();
                    imgElements.forEach(img => {
                        const src = img.src || '';
                        const dataSrc = img.dataset.src || '';
                        const srcset = img.srcset || '';
                        // Check src
                        if (src && !src.startsWith('data:') && validExtensions.some(ext => src.toLowerCase().endsWith(ext))) {
                            imgUrls.add(src);
                        }
                        // Check data-src
                        if (dataSrc && !dataSrc.startsWith('data:') && validExtensions.some(ext => dataSrc.toLowerCase().endsWith(ext))) {
                            imgUrls.add(dataSrc);
                        }
                        // Check srcset
                        if (srcset) {
                            srcset.split(',').forEach(src => {
                                const url = src.trim().split(' ')[0];
                                if (url && !url.startsWith('data:') && validExtensions.some(ext => url.toLowerCase().endsWith(ext))) {
                                    imgUrls.add(url);
                                }
                            });
                        }
                    });
                    return Array.from(imgUrls);
                }"""
            )
            images = [urljoin(url, img) for img in images if img]

            # Extract links
            links = await page.evaluate("Array.from(document.querySelectorAll('a')).map(a => a.href)")
            links = set(urljoin(url, link) for link in links
                        if urlparse(urljoin(url, link)).netloc == base_domain
                        and urljoin(url, link) not in visited)

            await browser.close()

            page_data = {"url": url, "text": text_content, "images": images}
            logging.info(f"Scraped data: url={url}, text_length={len(text_content)}, images={images}")
            return page_data, links

    except Exception as e:
        logging.error(f"Error scraping {url}: {e}")
        return {}, set()
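A minimal driver sketch (not part of the uploaded files), assuming Playwright and its Chromium browser are installed (python -m playwright install chromium); the start URL is a placeholder:

# Hypothetical driver: scrape one page and print what was collected,
# plus the unvisited same-domain links discovered on it.
import asyncio
from urllib.parse import urlparse

from scraper import scrape_page

async def main():
    start_url = "https://example.com"           # placeholder URL
    base_domain = urlparse(start_url).netloc    # restrict link discovery to this domain
    page_data, links = await scrape_page(start_url, visited=set(), base_domain=base_domain)
    print(len(page_data.get("text", "")), "characters of text")
    print(page_data.get("images", []))
    print(f"{len(links)} unvisited same-domain links")

if __name__ == "__main__":
    asyncio.run(main())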
summarizer.py
ADDED
@@ -0,0 +1,76 @@
import os
from openai import AsyncOpenAI
import logging
from dotenv import load_dotenv
from urllib.parse import urlparse

# Load environment variables
load_dotenv()

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

async def summarize_text(text: str, url: str = "") -> dict:
    """Summarize text into a title and description using OpenAI's API."""
    try:
        # Get OpenAI API key
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            logging.error("OpenAI API key not found. Please set OPENAI_API_KEY in .env file.")
            raise ValueError("OpenAI API key is required for summarization.")

        # Initialize OpenAI client
        client = AsyncOpenAI(api_key=api_key)

        # Handle empty or short text
        if not text or len(text.strip()) < 20:
            logging.warning(f"Input text is empty or too short: '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Simplified prompt
        prompt = (
            f"Summarize the following text into a title (up to 50 characters) and a description (up to 100 characters) "
            f"for RCS messaging. Ensure both are complete and relevant. If the text is short, use the URL ({url}) "
            f"to infer context for a news, product, or service site. Format the output as:\nTitle: [title]\nDescription: [description]\n\n{text}"
        )
        response = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an assistant crafting summaries for RCS messaging."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.6
        )

        # Log raw response
        raw_content = response.choices[0].message.content.strip()
        logging.info(f"Raw LLM response: {raw_content}")

        # Parse response
        lines = raw_content.split("\n")
        title = "News Summary"
        description = f"Explore content from {urlparse(url).netloc}."[:100]

        for line in lines:
            if line.startswith("Title:"):
                title = line.replace("Title:", "").strip()[:50]
            elif line.startswith("Description:"):
                description = line.replace("Description:", "").strip()[:100]

        # Ensure non-empty description
        if not description or description == f"Explore content from {urlparse(url).netloc}.":
            logging.warning("Description is empty or default. Using fallback.")
            description = f"Discover news and insights from {urlparse(url).netloc}."[:100]

        logging.info(f"Parsed summary - Title: {title}, Description: {description}")
        return {"title": title, "description": description}

    except Exception as e:
        logging.error(f"Error summarizing text with OpenAI: {e}")
        # Fallback
        description = f"Discover news and insights from {urlparse(url).netloc}."[:100]
        logging.info(f"Using fallback - Title: News Summary, Description: {description}")
        return {
            "title": "News Summary",
            "description": description
        }
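A sketch (not part of the uploaded files) of how the three modules could be wired together end to end: scrape a page, summarize its text, then build the rich card payload. The main.py below is hypothetical and assumes OPENAI_API_KEY is available in the environment or a .env file; the start URL is a placeholder:

# Hypothetical main.py: crawl one page with scraper.py, summarize its text
# with summarizer.py, and build the RCS rich card with rich_card_builder.py.
import asyncio
import json
from urllib.parse import urlparse

from scraper import scrape_page
from summarizer import summarize_text
from rich_card_builder import build_rich_card

async def run(start_url: str) -> dict:
    base_domain = urlparse(start_url).netloc
    scraped_data, _links = await scrape_page(start_url, visited={start_url}, base_domain=base_domain)
    if not scraped_data:
        raise RuntimeError(f"Scraping failed for {start_url}")
    summary = await summarize_text(scraped_data.get("text", ""), url=start_url)
    return build_rich_card(scraped_data, summary)

if __name__ == "__main__":
    card = asyncio.run(run("https://example.com"))  # placeholder URL
    print(json.dumps(card, indent=2))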