AiDeveloper1 committed
Commit 5908412 · verified · 1 Parent(s): 0bebdd8

Upload 3 files

Files changed (3)
  1. rich_card_builder.py +71 -0
  2. scraper.py +73 -0
  3. summarizer.py +76 -0
rich_card_builder.py ADDED
@@ -0,0 +1,71 @@
+ import logging
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ def select_relevant_image(images: list, text: str) -> str:
+     """Select a contextually relevant image from the list based on text."""
+     if not images:
+         logging.info("No images available, using empty media URL.")
+         return ""
+
+     # Prioritize images that are likely content-related (avoid logos, icons)
+     for img in images:
+         if not any(keyword in img.lower() for keyword in ["logo", "icon", "banner", "ad"]):
+             logging.info(f"Selected image: {img}")
+             return img
+
+     # Fallback to first image if no clear content image
+     logging.info(f"No content image found, using first image: {images[0]}")
+     return images[0]
+
+ def build_rich_card(scraped_data: dict, summary: dict) -> dict:
+     """Build the rich card JSON using only scraped data and summary."""
+     logging.info(f"Building rich card with scraped_data: {scraped_data}, summary: {summary}")
+
+     # Select relevant image
+     media_url = select_relevant_image(scraped_data.get("images", []), scraped_data.get("text", ""))
+
+     # Use scraped URL
+     page_url = scraped_data.get("url", "")
+
+     # Use summary description
+     description = summary.get("description", "Explore news and insights.")
+
+     rich_card = {
+         "targets": [{"ids": [1368], "targetType": "humans"}],
+         "text": description,
+         "mediaType": "image",
+         "media": media_url,
+         "buttons": [
+             {
+                 "type": "weburl",
+                 "title": "View Now",
+                 "payload": page_url
+             },
+             {
+                 "type": "postback",
+                 "title": "Learn More",
+                 "payload": "learn_more",
+                 "execute": None
+             }
+         ],
+         "quickReplies": [
+             {
+                 "type": "postback",
+                 "title": "Show Similar",
+                 "payload": "similar_content"
+             },
+             {
+                 "type": "call",
+                 "title": "Contact Support",
+                 "payload": "+12345678901"
+             }
+         ],
+         "richCard": {
+             "cardOrientation": "VERTICAL",
+             "mediaHeight": "MEDIUM"
+         }
+     }
+     logging.info(f"Generated rich card: {rich_card}")
+     return rich_card
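
A minimal usage sketch of build_rich_card follows. It is not part of the commit; the scraped_data and summary payloads are illustrative placeholders rather than output captured from the committed scraper or summarizer.

# Usage sketch (illustrative inputs, not real scraper/summarizer output).
from rich_card_builder import build_rich_card

scraped_data = {
    "url": "https://example.com/article",            # assumed example URL
    "text": "Example article body text...",
    "images": [
        "https://example.com/logo.png",               # skipped: filename contains "logo"
        "https://example.com/photo.jpg",              # selected as the content image
    ],
}
summary = {"title": "Example Title", "description": "Short description for the card."}

card = build_rich_card(scraped_data, summary)
print(card["media"])   # -> https://example.com/photo.jpg
print(card["text"])    # -> Short description for the card.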
scraper.py ADDED
@@ -0,0 +1,73 @@
+ from playwright.async_api import async_playwright
+ from urllib.parse import urljoin, urlparse
+ import logging
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[dict, set]:
+     """Scrape a single page for text, images, and links using Playwright."""
+     try:
+         async with async_playwright() as p:
+             browser = await p.chromium.launch(headless=True)
+             context = await browser.new_context(
+                 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+                 viewport={"width": 1280, "height": 720}
+             )
+             page = await context.new_page()
+             await page.goto(url, wait_until="networkidle", timeout=30000)
+             await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+             await page.wait_for_timeout(2000)
+
+             # Extract text content
+             text_content = await page.evaluate("document.body.innerText")
+             text_content = ' '.join(text_content.split()) if text_content else ""
+
+             # Extract images (only JPEG, PNG, WebP, exclude data URLs and SVGs)
+             images = await page.evaluate(
+                 """() => {
+                     const validExtensions = ['.jpg', '.jpeg', '.png', '.webp'];
+                     const imgElements = document.querySelectorAll('img');
+                     const imgUrls = new Set();
+                     imgElements.forEach(img => {
+                         const src = img.src || '';
+                         const dataSrc = img.dataset.src || '';
+                         const srcset = img.srcset || '';
+                         // Check src
+                         if (src && !src.startsWith('data:') && validExtensions.some(ext => src.toLowerCase().endsWith(ext))) {
+                             imgUrls.add(src);
+                         }
+                         // Check data-src
+                         if (dataSrc && !dataSrc.startsWith('data:') && validExtensions.some(ext => dataSrc.toLowerCase().endsWith(ext))) {
+                             imgUrls.add(dataSrc);
+                         }
+                         // Check srcset
+                         if (srcset) {
+                             srcset.split(',').forEach(src => {
+                                 const url = src.trim().split(' ')[0];
+                                 if (url && !url.startsWith('data:') && validExtensions.some(ext => url.toLowerCase().endsWith(ext))) {
+                                     imgUrls.add(url);
+                                 }
+                             });
+                         }
+                     });
+                     return Array.from(imgUrls);
+                 }"""
+             )
+             images = [urljoin(url, img) for img in images if img]
+
+             # Extract links
+             links = await page.evaluate("Array.from(document.querySelectorAll('a')).map(a => a.href)")
+             links = set(urljoin(url, link) for link in links
+                         if urlparse(urljoin(url, link)).netloc == base_domain
+                         and urljoin(url, link) not in visited)
+
+             await browser.close()
+
+             page_data = {"url": url, "text": text_content, "images": images}
+             logging.info(f"Scraped data: url={url}, text_length={len(text_content)}, images={images}")
+             return page_data, links
+
+     except Exception as e:
+         logging.error(f"Error scraping {url}: {e}")
+         return {}, set()
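
A hedged driver for scrape_page, assuming Playwright and its Chromium browser are installed (pip install playwright, then playwright install chromium). The start URL and the single-page call are illustrative; the crawl loop that would feed visited and base_domain across pages is not part of this commit.

# Usage sketch: single-page scrape (assumes Playwright's Chromium is installed).
import asyncio
from urllib.parse import urlparse

from scraper import scrape_page

async def main():
    start_url = "https://example.com"                 # assumed example URL
    base_domain = urlparse(start_url).netloc
    visited = {start_url}                             # exclude the page we are already on
    page_data, links = await scrape_page(start_url, visited, base_domain)
    print(f"text length: {len(page_data.get('text', ''))}")
    print(f"images found: {len(page_data.get('images', []))}")
    print(f"same-domain links not yet visited: {len(links)}")

if __name__ == "__main__":
    asyncio.run(main())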
summarizer.py ADDED
@@ -0,0 +1,76 @@
+ import os
+ from openai import AsyncOpenAI
+ import logging
+ from dotenv import load_dotenv
+ from urllib.parse import urlparse
+
+ # Load environment variables
+ load_dotenv()
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ async def summarize_text(text: str, url: str = "") -> dict:
+     """Summarize text into a title and description using OpenAI's API."""
+     try:
+         # Get OpenAI API key
+         api_key = os.getenv("OPENAI_API_KEY")
+         if not api_key:
+             logging.error("OpenAI API key not found. Please set OPENAI_API_KEY in .env file.")
+             raise ValueError("OpenAI API key is required for summarization.")
+
+         # Initialize OpenAI client
+         client = AsyncOpenAI(api_key=api_key)
+
+         # Handle empty or short text
+         if not text or len(text.strip()) < 20:
+             logging.warning(f"Input text is empty or too short: '{text}'. Using URL context.")
+             text = f"Content from {url} about news, products, or services."
+
+         # Simplified prompt
+         prompt = (
+             f"Summarize the following text into a title (up to 50 characters) and a description (up to 100 characters) "
+             f"for RCS messaging. Ensure both are complete and relevant. If the text is short, use the URL ({url}) "
+             f"to infer context for a news, product, or service site. Format the output as:\nTitle: [title]\nDescription: [description]\n\n{text}"
+         )
+         response = await client.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[
+                 {"role": "system", "content": "You are an assistant crafting summaries for RCS messaging."},
+                 {"role": "user", "content": prompt}
+             ],
+             temperature=0.6
+         )
+
+         # Log raw response
+         raw_content = response.choices[0].message.content.strip()
+         logging.info(f"Raw LLM response: {raw_content}")
+
+         # Parse response
+         lines = raw_content.split("\n")
+         title = "News Summary"
+         description = f"Explore content from {urlparse(url).netloc}."[:100]
+
+         for line in lines:
+             if line.startswith("Title:"):
+                 title = line.replace("Title:", "").strip()[:50]
+             elif line.startswith("Description:"):
+                 description = line.replace("Description:", "").strip()[:100]
+
+         # Ensure non-empty description
+         if not description or description == f"Explore content from {urlparse(url).netloc}.":
+             logging.warning("Description is empty or default. Using fallback.")
+             description = f"Discover news and insights from {urlparse(url).netloc}."[:100]
+
+         logging.info(f"Parsed summary - Title: {title}, Description: {description}")
+         return {"title": title, "description": description}
+
+     except Exception as e:
+         logging.error(f"Error summarizing text with OpenAI: {e}")
+         # Fallback
+         description = f"Discover news and insights from {urlparse(url).netloc}."[:100]
+         logging.info(f"Using fallback - Title: News Summary, Description: {description}")
+         return {
+             "title": "News Summary",
+             "description": description
+         }
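
The three modules compose into a scrape, summarize, build pipeline. The sketch below is one way to wire them together and is not part of the commit: the start URL is an assumption, OPENAI_API_KEY must be set in a .env file for summarize_text to succeed, and Playwright's Chromium must be installed for scrape_page.

# End-to-end sketch: scrape one page, summarize it, build the rich card.
# Assumes OPENAI_API_KEY is set in .env and Playwright's Chromium is installed.
import asyncio
from urllib.parse import urlparse

from scraper import scrape_page
from summarizer import summarize_text
from rich_card_builder import build_rich_card

async def run_pipeline(start_url: str) -> dict:
    base_domain = urlparse(start_url).netloc
    page_data, _links = await scrape_page(start_url, {start_url}, base_domain)
    if not page_data:
        raise RuntimeError(f"Scraping failed for {start_url}")
    summary = await summarize_text(page_data.get("text", ""), url=start_url)
    return build_rich_card(page_data, summary)

if __name__ == "__main__":
    card = asyncio.run(run_pipeline("https://example.com"))  # assumed example URL
    print(card)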