AiDeveloper1 committed on
Commit
95f63e4
·
verified ·
1 Parent(s): 211677f

Upload 5 files

Browse files
Files changed (5) hide show
  1. main.py +338 -0
  2. requirements.txt +0 -0
  3. rich_card_builder.py +71 -0
  4. scraper.py +73 -0
  5. summarizer.py +205 -0
main.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, Request
2
+ from fastapi.responses import HTMLResponse
3
+ from fastapi.templating import Jinja2Templates
4
+ from fastapi.staticfiles import StaticFiles
5
+ from pydantic import HttpUrl
6
+ from scraper import scrape_page
7
+ from summarizer import quick_summarize
8
+ from rich_card_builder import build_rich_card
9
+ import asyncio
10
+ from urllib.parse import urlparse
11
+ import logging
12
+ import http.client
13
+ import json
14
+ from dotenv import load_dotenv
15
+ import os
16
+ import google.generativeai as genai
17
+ from typing import Optional, List, Dict
18
+
19
# Load variables from a .env file into the process environment; the API
# keys/tokens used below are read with os.getenv().
load_dotenv()

# Configure the root logger: INFO and above, timestamped lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# The ASGI application; routes are registered on it further down.
app = FastAPI(title="Website Scraper API (Enhanced for RCS)")

# Expose the ./static directory under the /static URL prefix.
app.mount(
    "/static",
    StaticFiles(directory="static"),
    name="static",
)

# Template loader rooted at ./templates (used by the home page route).
templates = Jinja2Templates(directory="templates")
32
+
33
# Canned rich cards returned instead of scraping when the caller asks for
# sample data. Key order matters for the serialized payload, so it is kept
# as: title, text, media, url, buttons, quickReplies.
SAMPLE_RICH_CARDS = [
    dict(
        title="Summer Collection 2025",
        text="Discover vibrant summer styles at Pantaloons.",
        media="https://example.com/summer.jpg",
        url="https://example.com/summer",
        buttons=[
            dict(type="weburl", title="View Now", payload="https://example.com/summer"),
            dict(type="postback", title="Learn More", payload="learn_more_1"),
        ],
        quickReplies=[
            dict(type="postback", title="Show Similar", payload="similar_content_1"),
            dict(type="call", title="Contact Support", payload="+12345678901"),
        ],
    ),
    dict(
        title="Yu Collection",
        text="Explore trendy youth fashion with Yu Collection.",
        media="https://example.com/yu.jpg",
        url="https://example.com/yu_collection",
        buttons=[
            dict(type="weburl", title="View Now", payload="https://example.com/yu_collection"),
            dict(type="postback", title="Learn More", payload="learn_more_2"),
        ],
        quickReplies=[
            dict(type="postback", title="Show Similar", payload="similar_content_2"),
            dict(type="call", title="Contact Support", payload="+12345678901"),
        ],
    ),
]
64
+
65
def _fallback_quick_replies(next_interaction: Optional[str]) -> List[Dict]:
    """Return the static "Explore More" reply, or no replies without a target interaction."""
    if not next_interaction:
        return []
    return [{
        "type": "postback",
        "title": "Explore More",
        "payload": f"goto_{next_interaction}",
        "execute": next_interaction,
    }]


def _strip_code_fence(raw: str) -> str:
    """Remove a surrounding Markdown code fence (optionally tagged ``json``) from *raw*."""
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


async def generate_dynamic_buttons(title: str, description: str, url: str, next_interaction: str = None) -> List[Dict]:
    """Generate quick-reply buttons that lead to the next interaction, using Gemini-1.5 Flash.

    Args:
        title: Title of the content the buttons should advertise; blank values
            fall back to "News Summary".
        description: Description of that content; blank values fall back to a
            generic sentence.
        url: URL of that content; blank values fall back to "https://example.com".
        next_interaction: Name of the interaction each button executes.

    Returns:
        Up to two postback quick replies whose titles were suggested by the
        model (each truncated to 20 characters). If the API key is missing,
        all attempts fail, or any other error occurs, a single static
        "Explore More" reply is returned instead. Without ``next_interaction``
        the result is always an empty list, because a button with no target
        cannot be executed.
    """
    try:
        # No target interaction means there is nothing a button could lead to;
        # skip the model call instead of emitting "goto_None_*" payloads.
        if not next_interaction:
            return []

        # Validate inputs
        title = title.strip() if title and title.strip() else "News Summary"
        description = description.strip() if description and description.strip() else "Explore news and insights."
        url = url.strip() if url and url.strip() else "https://example.com"

        logging.info(f"Generating buttons for: title={title}, description={description[:30]}..., url={url}")

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            return _fallback_quick_replies(next_interaction)

        # Configure Gemini client
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Combine inputs (no chunking, as input is small)
        input_text = f"Title: {title}\nDescription: {description}\nURL: {url}"
        input_text = input_text[:500]  # Truncate to 500 chars to stay within limits

        prompt = (
            f"Based on the following content for the next interaction, suggest up to two concise (3-8 words) quick reply button titles that are action-oriented and invite the user to explore this content. The buttons should be engaging, relevant to the content, and avoid generic terms like 'Show Next'. Return the titles as a JSON array of strings.\n\n"
            f"{input_text}\n\n"
            f"Example output: [\"Discover Yu Collection\", \"Shop Youth Styles\"]\n"
            f"Return only the JSON array, no markdown or extra text."
        )

        # Retry the call; both API errors and unusable responses count as a failed attempt.
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = await model.generate_content_async(prompt)
                raw_content = response.text.strip()
                logging.info(f"Gemini response: {raw_content}")

                # The model sometimes wraps the array in a Markdown fence despite the prompt.
                raw_content = _strip_code_fence(raw_content)

                button_titles = json.loads(raw_content)
                if not isinstance(button_titles, list) or not all(isinstance(t, str) for t in button_titles):
                    logging.warning(f"Invalid Gemini response format: {raw_content}")
                    raise ValueError("Response is not a list of strings")

                # Keep only non-empty titles of 3 to 8 words.
                valid_buttons = [t.strip() for t in button_titles if t.strip() and 3 <= len(t.strip().split()) <= 8]
                if not valid_buttons:
                    logging.warning("No valid button titles in response")
                    raise ValueError("No valid button titles")

                quick_replies = [
                    {
                        "type": "postback",
                        "title": button_title[:20],  # Ensure concise title
                        "payload": f"goto_{next_interaction}_{i}",
                        "execute": next_interaction
                    }
                    for i, button_title in enumerate(valid_buttons[:2])
                ]
                logging.info(f"Generated quick replies: {quick_replies}")
                return quick_replies

            except Exception as e:
                logging.warning(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < max_retries - 1:
                    await asyncio.sleep(1)  # Wait before retrying

        # Fallback if all retries fail
        logging.error("All retries failed for button generation")
        return _fallback_quick_replies(next_interaction)

    except Exception as e:
        logging.error(f"Error generating dynamic buttons: {str(e)}")
        return _fallback_quick_replies(next_interaction)
146
+
147
def _post_bot_payload(payload: Dict, api_token: str) -> tuple:
    """POST *payload* to ``/v1/bots`` on api.nativemsg.com and return ``(status, body_text)``.

    This call blocks, so it is meant to run in a worker thread. The connection
    has a timeout and is always closed, even when the request fails.
    """
    connection = http.client.HTTPSConnection("api.nativemsg.com", timeout=30)
    try:
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json"
        }
        connection.request("POST", "/v1/bots", json.dumps(payload), headers)
        response = connection.getresponse()
        return response.status, response.read().decode('utf-8')
    finally:
        connection.close()


async def create_nativemsg_bot(rich_cards: List[Dict], url: str, bot_name: str, api_token: str) -> Dict:
    """Create a bot on NativeMSG whose interactions are chained from the given rich cards.

    One interaction is built per card ("Interaction #1", "#2", ...). Each
    card's quick replies lead to the following card (titles generated from
    that next card) plus a fixed "Contact Support" call reply. A welcome
    interaction pointing at "Interaction #1" is placed first.

    Args:
        rich_cards: Card dicts; ``title``, ``text``, ``media``, ``url`` and
            ``buttons`` are read from each.
        url: Source site URL; its host names the bot when ``bot_name`` is
            empty and appears in the welcome text.
        bot_name: Explicit bot name, or a falsy value to derive one from ``url``.
        api_token: Bearer token for the NativeMSG API.

    Returns:
        The decoded JSON body of the NativeMSG response.

    Raises:
        HTTPException: Status 500 when the token is missing, the request
            fails, the API answers with a non-2xx status, or the response
            body is not valid JSON.
    """
    try:
        # Validate API token
        if not api_token:
            logging.error("NativeMSG API token not provided and not found in .env file.")
            raise ValueError("NativeMSG API token is required.")

        # Use provided bot name or default to dynamic name
        final_bot_name = bot_name or f"Bot for {urlparse(url).netloc}"

        interactions = []

        for idx, card in enumerate(rich_cards, 1):
            # idx is 1-based, so rich_cards[idx] is the card *after* the current one.
            has_next = idx < len(rich_cards)
            next_interaction = f"Interaction #{idx + 1}" if has_next else None
            next_card = rich_cards[idx] if has_next else None

            buttons = card.get("buttons", [])

            # Quick replies advertise the *next* card; the last card gets none.
            dynamic_quick_replies = []
            if next_card:
                dynamic_quick_replies = await generate_dynamic_buttons(
                    title=next_card.get("title", "News Summary"),
                    description=next_card.get("text", "Explore news and insights."),
                    url=next_card.get("url", ""),
                    next_interaction=next_interaction
                )

            quick_replies = dynamic_quick_replies + [
                {
                    "type": "call",
                    "title": "Contact Support",
                    "payload": "+12345678901"
                }
            ]

            # .get() keeps a card that lacks a title or text from aborting the whole bot.
            card_title = card.get("title", "News Summary")
            card_text = card.get("text", "Explore news and insights.")
            message = {
                "text": f"{card_title}\n\n{card_text}",
                "mediaType": "image",
                "media": card.get("media", "") or "https://example.com/placeholder.jpg",
                "richCard": {
                    "cardOrientation": "VERTICAL",
                    "mediaHeight": "MEDIUM"
                },
                "buttons": buttons,
                "quickReplies": quick_replies
            }

            interactions.append({
                "name": f"Interaction #{idx}",
                "intents": ["show_content", f"content_{idx}"],
                "actions": [
                    {
                        "send": {
                            "message": message
                        },
                        "type": "RichCard",
                        "name": f"Send Rich Card #{idx}"
                    }
                ]
            })

        # Welcome interaction goes first and hands over to the first card.
        welcome_message = {
            "text": f"Welcome to the {urlparse(url).netloc} RCS Bot! Explore the latest content.",
            "richCard": {
                "cardOrientation": "VERTICAL",
                "mediaHeight": "MEDIUM"
            },
            "quickReplies": [
                {
                    "type": "postback",
                    "title": "Start Exploring",
                    "payload": "start_exploring",
                    "execute": "Interaction #1"
                }
            ]
        }
        welcome_interaction = {
            "name": "Welcome Interaction",
            "intents": ["start", "welcome"],
            "actions": [
                {
                    "send": {
                        "message": welcome_message
                    },
                    "type": "RichCard",
                    "name": "Send Welcome Message"
                }
            ]
        }
        interactions.insert(0, welcome_interaction)

        payload = {
            "name": final_bot_name,
            "interactions": interactions
        }

        # Log the payload for debugging
        logging.info(f"NativeMSG bot payload: {json.dumps(payload, indent=2)}")

        # http.client is blocking; run it in the default executor so the
        # event loop keeps serving other requests meanwhile.
        loop = asyncio.get_running_loop()
        status, response_data = await loop.run_in_executor(None, _post_bot_payload, payload, api_token)

        logging.info(f"NativeMSG bot creation response: Status {status}, Data: {response_data}")

        # Any 2xx status is a success (e.g. 201 for a created resource).
        if not 200 <= status < 300:
            logging.error(f"Failed to create bot: {response_data}")
            raise HTTPException(status_code=500, detail=f"Failed to create bot: {response_data}")

        return json.loads(response_data)

    except HTTPException:
        # Already a well-formed API error; re-wrapping would duplicate the detail text.
        raise
    except Exception as e:
        logging.error(f"Error creating NativeMSG bot: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Failed to create bot: {str(e)}")
277
+
278
@app.get("/scrape")
async def crawl_website(
    url: HttpUrl,
    use_sample: bool = False,
    bot_name: Optional[str] = None,
    nativemsg_token: Optional[str] = None
):
    """Crawl a website (or use sample data), build rich cards, and create a NativeMSG bot.

    Args:
        url: Start page; only links on the same host are followed.
        use_sample: When true, skip scraping and use ``SAMPLE_RICH_CARDS``.
        bot_name: Optional bot name passed on to bot creation.
        nativemsg_token: Optional API token; falls back to the
            ``NATIVEMSG_API_TOKEN`` environment variable.

    Returns:
        ``{"rich_cards": [...], "bot_response": {...}}``.

    Raises:
        HTTPException: 400 when scraping produced no cards; 500 for bot
            creation failures and any unexpected error.
    """
    # Upper bound on pages visited per request.
    max_pages = 20

    try:
        # Determine API token
        api_token = nativemsg_token or os.getenv("NATIVEMSG_API_TOKEN")

        if use_sample:
            results = SAMPLE_RICH_CARDS
        else:
            visited = set()
            to_visit = {str(url)}
            base_domain = urlparse(str(url)).netloc
            results = []

            while to_visit and len(visited) < max_pages:
                current_url = to_visit.pop()
                if current_url in visited:
                    continue
                visited.add(current_url)

                logging.info(f"Scraping page: {current_url}")
                page_data, new_links = await scrape_page(current_url, visited, base_domain)
                if page_data:
                    logging.info(f"Scraped data: {page_data}")
                    summary = await quick_summarize(page_data["text"], page_data["url"])
                    rich_card = build_rich_card(page_data, summary)
                    rich_card["title"] = summary.get("title", "News Summary")
                    rich_card["url"] = page_data.get("url", str(url))
                    results.append(rich_card)

                to_visit.update(new_links)
                # Brief pause between pages to avoid hammering the target site.
                await asyncio.sleep(0.5)

        if not results:
            logging.error("No rich cards generated from scraping.")
            raise HTTPException(status_code=400, detail="No content scraped from the provided URL.")

        # Create NativeMSG bot with the rich cards
        bot_response = await create_nativemsg_bot(results, str(url), bot_name, api_token)

        logging.info(f"Final response: {results}, Bot: {bot_response}")
        return {"rich_cards": results, "bot_response": bot_response}

    except HTTPException:
        # Keep the intended status code and detail (e.g. the 400 above)
        # instead of collapsing every HTTP error into a generic 500.
        raise
    except Exception as e:
        logging.error(f"Scraping or bot creation failed: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Scraping or bot creation failed: {str(e)}")
330
+
331
@app.get("/", response_class=HTMLResponse)
async def serve_home(request: Request):
    """Render the ``index.html`` template as the landing page."""
    # The incoming request is handed to the template under the "request" key.
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
335
+
336
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on every interface, port 8001.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8001,
    )
requirements.txt ADDED
Binary file (194 Bytes). View file
 
rich_card_builder.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
# Configure the root logger: INFO and above, timestamped lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
5
+
6
def select_relevant_image(images: list, text: str) -> str:
    """Pick the first image URL that does not look like a logo, icon, banner or advert.

    Args:
        images: Candidate image URLs, in page order.
        text: Page text. Currently unused; kept so callers need not change.

    Returns:
        The first URL that passes the filter, the first URL overall when
        every candidate is filtered out, or ``""`` when ``images`` is empty.
    """
    if not images:
        logging.info("No images available, using empty media URL.")
        return ""

    # Distinctive markers are safe to match anywhere in the URL.
    substring_markers = ("logo", "icon", "banner", "advert")
    # "ad" is far too short for substring matching: it occurs inside
    # "uploads", "header", "download", "shadow", ... and would reject most
    # real content images. Require it to be a whole path/name token instead.
    token_markers = {"ad", "ads"}

    for img in images:
        lowered = img.lower()
        if any(marker in lowered for marker in substring_markers):
            continue
        # Split the URL on every non-alphanumeric character to get its tokens.
        tokens = "".join(ch if ch.isalnum() else " " for ch in lowered).split()
        if token_markers.intersection(tokens):
            continue
        logging.info(f"Selected image: {img}")
        return img

    # Fallback to first image if no clear content image
    logging.info(f"No content image found, using first image: {images[0]}")
    return images[0]
21
+
22
def build_rich_card(scraped_data: dict, summary: dict) -> dict:
    """Assemble the rich card dict for one scraped page.

    ``scraped_data`` supplies ``images``, ``text`` and ``url``; ``summary``
    supplies ``description``. Missing keys fall back to empty values (or a
    generic description). The card itself carries no title.
    """
    logging.info(f"Building rich card with scraped_data: {scraped_data}, summary: {summary}")

    # Image choice comes first because the helper logs its decision.
    candidate_images = scraped_data.get("images", [])
    page_text = scraped_data.get("text", "")
    media_url = select_relevant_image(candidate_images, page_text)

    view_button = {
        "type": "weburl",
        "title": "View Now",
        "payload": scraped_data.get("url", ""),
    }
    learn_more_button = {
        "type": "postback",
        "title": "Learn More",
        "payload": "learn_more",
        "execute": None,
    }
    show_similar_reply = {
        "type": "postback",
        "title": "Show Similar",
        "payload": "similar_content",
    }
    support_reply = {
        "type": "call",
        "title": "Contact Support",
        "payload": "+12345678901",
    }

    rich_card = {
        "targets": [{"ids": [1368], "targetType": "humans"}],
        "text": summary.get("description", "Explore news and insights."),
        "mediaType": "image",
        "media": media_url,
        "buttons": [view_button, learn_more_button],
        "quickReplies": [show_similar_reply, support_reply],
        "richCard": {
            "cardOrientation": "VERTICAL",
            "mediaHeight": "MEDIUM",
        },
    }

    logging.info(f"Generated rich card: {rich_card}")
    return rich_card
scraper.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from playwright.async_api import async_playwright
2
+ from urllib.parse import urljoin, urlparse
3
+ import logging
4
+
5
# Configure the root logger: INFO and above, timestamped lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
7
+
8
async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[dict, set]:
    """Scrape one page with headless Chromium and return its content plus crawlable links.

    Args:
        url: Absolute URL of the page to load.
        visited: URLs already crawled; links found in this set are not returned.
        base_domain: Network location (``netloc``) a link must have to be returned.

    Returns:
        ``(page_data, links)``. ``page_data`` is
        ``{"url": ..., "text": ..., "images": [...]}`` with whitespace-collapsed
        text and absolute JPEG/PNG/WebP image URLs. ``links`` is a set of
        absolute, fragment-free, same-domain URLs not present in ``visited``.
        On any failure the error is logged and ``({}, set())`` is returned.
    """
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
                    viewport={"width": 1280, "height": 720}
                )
                page = await context.new_page()
                await page.goto(url, wait_until="networkidle", timeout=30000)
                # Scroll to the bottom and pause (presumably to let lazy-loaded content appear).
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(2000)

                text_content = await page.evaluate("document.body.innerText")

                # Collect image URLs (JPEG, PNG, WebP only; no data URLs).
                # The extension is checked on the URL *path* so that URLs with
                # a query string or fragment (photo.jpg?w=800) are not dropped.
                images = await page.evaluate(
                    """() => {
                        const validExtensions = ['.jpg', '.jpeg', '.png', '.webp'];
                        const hasValidExtension = (candidate) => {
                            let path;
                            try {
                                path = new URL(candidate, document.baseURI).pathname;
                            } catch (err) {
                                path = candidate.split(/[?#]/)[0];
                            }
                            path = path.toLowerCase();
                            return validExtensions.some(ext => path.endsWith(ext));
                        };
                        const imgUrls = new Set();
                        const consider = (candidate) => {
                            if (candidate && !candidate.startsWith('data:') && hasValidExtension(candidate)) {
                                imgUrls.add(candidate);
                            }
                        };
                        document.querySelectorAll('img').forEach(img => {
                            consider(img.src || '');
                            consider(img.dataset.src || '');
                            (img.srcset || '').split(',').forEach(entry => {
                                consider(entry.trim().split(' ')[0]);
                            });
                        });
                        return Array.from(imgUrls);
                    }"""
                )

                raw_links = await page.evaluate("Array.from(document.querySelectorAll('a')).map(a => a.href)")
            finally:
                # Close the browser even when navigation or evaluation fails.
                await browser.close()

        text_content = ' '.join(text_content.split()) if text_content else ""
        images = [urljoin(url, img) for img in images if img]

        links = set()
        for raw_link in raw_links:
            absolute = urljoin(url, raw_link)
            if urlparse(absolute).netloc != base_domain:
                continue
            # "#section" anchors point at the same document; strip them so a
            # page is not crawled once per anchor.
            absolute = absolute.partition('#')[0]
            if absolute and absolute not in visited:
                links.add(absolute)

        page_data = {"url": url, "text": text_content, "images": images}
        logging.info(f"Scraped data: url={url}, text_length={len(text_content)}, images={images}")
        return page_data, links

    except Exception as e:
        logging.error(f"Error scraping {url}: {e}")
        return {}, set()
summarizer.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from typing import Dict, Optional
4
+ import google.generativeai as genai
5
+ import logging
6
+ from dotenv import load_dotenv
7
+ from urllib.parse import urlparse
8
+ from cachetools import TTLCache
9
+
10
# Load variables from a .env file into the process environment.
load_dotenv()

# Configure the root logger: INFO and above, timestamped lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Shared in-memory summary cache: at most 1000 entries, each kept for
# 3600 seconds (one hour).
cache = TTLCache(ttl=3600, maxsize=1000)
18
+
19
async def summarize_text(text: str, url: str = "") -> Dict[str, str]:
    """Summarize text into a title and description using Gemini-1.5 Flash.

    Up to the first two 1000-character chunks of ``text`` are tried in order;
    the first response that parses is used and later chunks are not sent.

    Args:
        text: Page text to summarize. Text shorter than 20 characters is
            replaced by a generic sentence built from ``url``.
        url: Source URL, used for prompt context and for the fallback
            description. Empty values become "https://example.com".

    Returns:
        ``{"title": ..., "description": ...}`` with the title capped at 50
        and the description at 100 characters. Model-derived summaries are
        cached; on any failure a generic "News Summary" result is returned
        and *not* cached, so a transient error does not stick for the TTL.
    """
    # Resolved before the try block so the error handler can always use it.
    domain = "example.com"
    fallback_title = "News Summary"

    try:
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except Exception:
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"
        fallback_description = f"Discover news and insights from {domain}."[:100]

        text = text.strip() if text else ""

        # Check cache
        cache_key = f"summarize_{hash(text + url)}"
        if cache_key in cache:
            logging.info(f"Cache hit for {cache_key}")
            return cache[cache_key]

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Split text into chunks to avoid quota issues (1000 chars per chunk)
        chunk_size = 1000
        text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

        result = None
        for chunk in text_chunks[:2]:  # Limit to first 2000 chars for efficiency
            prompt = (
                f"Summarize the following text into a title (30-50 characters) and a description (80-100 characters) "
                f"for RCS messaging. Ensure titles are catchy and descriptions are engaging, relevant to the content, "
                f"and suitable for a news, product, or service context inferred from the URL ({url}). "
                f"Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
            )

            response = await model.generate_content_async(prompt)
            raw_content = response.text.strip()
            logging.info(f"Raw Gemini response: {raw_content}")

            # Parse response with regex (tolerates text around the JSON object)
            match = re.search(r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}', raw_content)
            if not match:
                logging.warning("Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk.")
                continue

            # Enforce the same length limits the prompt asks for.
            result = {"title": match.group(1)[:50], "description": match.group(2)[:100]}
            # Only the first valid summary is used, so stop calling the API.
            break

        if result is None:
            logging.warning("No valid summaries generated. Using fallback.")
            return {"title": fallback_title, "description": fallback_description}

        # Ensure non-empty outputs
        if not result["title"].strip():
            result["title"] = fallback_title
        if not result["description"].strip():
            result["description"] = fallback_description

        cache[cache_key] = result
        logging.info(f"Summary - Title: {result['title']}, Description: {result['description']}")
        return result

    except Exception as e:
        logging.error(f"Error summarizing text: {e}")
        # Deliberately not cached, and built only from names that are always bound.
        return {
            "title": fallback_title,
            "description": f"Discover news and insights from {domain}."[:100]
        }
113
+
114
async def quick_summarize(text: str, url: str = "") -> Dict[str, str]:
    """Quickly summarize text with a lightweight prompt using Gemini-1.5 Flash.

    Only the first 1000 characters of ``text`` are sent to the model.

    Args:
        text: Page text to summarize. Text shorter than 20 characters is
            replaced by a generic sentence built from ``url``.
        url: Source URL, used for prompt context and for the fallback
            description. Empty values become "https://example.com".

    Returns:
        ``{"title": ..., "description": ...}`` with the title capped at 50
        and the description at 100 characters. Model-derived summaries are
        cached; on any failure a generic "Quick Summary" result is returned
        and *not* cached, so a transient error does not stick for the TTL.
    """
    # Resolved before the try block so the error handler can always use it.
    domain = "example.com"
    fallback_title = "Quick Summary"

    try:
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except Exception:
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"
        fallback_description = f"Check out content from {domain}."[:100]

        text = text.strip() if text else ""

        # Check cache
        cache_key = f"quick_summarize_{hash(text + url)}"
        if cache_key in cache:
            logging.info(f"Cache hit for {cache_key}")
            return cache[cache_key]

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Quick summary: a single request over the first 1000 characters.
        chunk = text[:1000]
        prompt = (
            f"Create a title (30-50 chars) and description (80-100 chars) for RCS messaging from this text. "
            f"Keep it engaging and relevant to {url}. Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
        )

        response = await model.generate_content_async(prompt)
        raw_content = response.text.strip()
        logging.info(f"Raw Gemini response (quick): {raw_content}")

        # Parse response with regex (tolerates text around the JSON object)
        match = re.search(r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}', raw_content)
        if not match:
            logging.warning("Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk.")
            logging.warning("No valid summaries generated. Using fallback.")
            return {"title": fallback_title, "description": fallback_description}

        result = {"title": match.group(1)[:50], "description": match.group(2)[:100]}

        # Ensure non-empty outputs
        if not result["title"].strip():
            result["title"] = fallback_title
        if not result["description"].strip():
            result["description"] = fallback_description

        cache[cache_key] = result
        logging.info(f"Quick summary - Title: {result['title']}, Description: {result['description']}")
        return result

    except Exception as e:
        logging.error(f"Error in quick summarize: {e}")
        # Deliberately not cached, and built only from names that are always bound.
        return {
            "title": fallback_title,
            "description": f"Check out content from {domain}."[:100]
        }