AiDeveloper1 committed on
Commit
5c7501f
·
verified ·
1 Parent(s): 0bcc68d

Update summarizer.py

Browse files
Files changed (1) hide show
  1. summarizer.py +204 -204
summarizer.py CHANGED
@@ -1,205 +1,205 @@
1
- import os
2
- import re
3
- from typing import Dict, Optional
4
- import google.generativeai as genai
5
- import logging
6
- from dotenv import load_dotenv
7
- from urllib.parse import urlparse
8
- from cachetools import TTLCache
9
-
10
- # Load environment variables
11
- load_dotenv()
12
-
13
- # Set up logging
14
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
15
-
16
- # In-memory cache: 1000 items, 1-hour TTL
17
- cache = TTLCache(maxsize=1000, ttl=3600)
18
-
19
- async def summarize_text(text: str, url: str = "") -> Dict[str, str]:
20
- """Summarize text into a title and description using Gemini-1.5 Flash."""
21
- try:
22
- # Validate inputs
23
- text = text.strip() if text else ""
24
- if not url:
25
- url = "https://example.com"
26
- try:
27
- parsed_url = urlparse(url)
28
- domain = parsed_url.netloc or "example.com"
29
- except Exception:
30
- logging.warning(f"Invalid URL: {url}. Using default domain.")
31
- domain = "example.com"
32
-
33
- # Check cache
34
- cache_key = f"summarize_{hash(text + url)}"
35
- if cache_key in cache:
36
- logging.info(f"Cache hit for {cache_key}")
37
- return cache[cache_key]
38
-
39
- # Get Gemini API key
40
- api_key = os.getenv("GEMINI_API_KEY")
41
- if not api_key:
42
- logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
43
- raise ValueError("Gemini API key is required for summarization.")
44
-
45
- # Configure Gemini client
46
- genai.configure(api_key=api_key)
47
- model = genai.GenerativeModel('gemini-1.5-flash')
48
-
49
- # Handle short or empty text
50
- if len(text) < 20:
51
- logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
52
- text = f"Content from {url} about news, products, or services."
53
-
54
- # Split text into chunks to avoid quota issues (e.g., 1000 chars per chunk)
55
- chunk_size = 1000
56
- text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
57
- summaries = []
58
-
59
- for chunk in text_chunks[:2]: # Limit to first 2000 chars for efficiency
60
- prompt = (
61
- f"Summarize the following text into a title (30-50 characters) and a description (80-100 characters) "
62
- f"for RCS messaging. Ensure titles are catchy and descriptions are engaging, relevant to the content, "
63
- f"and suitable for a news, product, or service context inferred from the URL ({url}). "
64
- f"Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
65
- )
66
-
67
- response = await model.generate_content_async(prompt)
68
- raw_content = response.text.strip()
69
- logging.info(f"Raw Gemini response: {raw_content}")
70
-
71
- # Parse response with regex
72
- try:
73
- match = re.search(r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}', raw_content)
74
- if match:
75
- title = match.group(1)
76
- description = match.group(2)
77
- summaries.append({"title": title, "description": description})
78
- else:
79
- raise ValueError("Invalid JSON format in Gemini response")
80
- except Exception as e:
81
- logging.warning(f"Failed to parse Gemini response: {e}. Skipping chunk.")
82
- continue
83
-
84
- # Combine summaries (prioritize first valid summary)
85
- if summaries:
86
- result = summaries[0]
87
- else:
88
- logging.warning("No valid summaries generated. Using fallback.")
89
- result = {
90
- "title": "News Summary",
91
- "description": f"Discover news and insights from {domain}."[:100]
92
- }
93
-
94
- # Ensure non-empty outputs
95
- if not result["title"].strip():
96
- result["title"] = "News Summary"
97
- if not result["description"].strip():
98
- result["description"] = f"Discover news and insights from {domain}."[:100]
99
-
100
- cache[cache_key] = result
101
- logging.info(f"Summary - Title: {result['title']}, Description: {result['description']}")
102
- return result
103
-
104
- except Exception as e:
105
- logging.error(f"Error summarizing text: {e}")
106
- domain = urlparse(url).netloc or "example.com"
107
- result = {
108
- "title": "News Summary",
109
- "description": f"Discover news and insights from {domain}."[:100]
110
- }
111
- cache[cache_key] = result
112
- return result
113
-
114
- async def quick_summarize(text: str, url: str = "") -> Dict[str, str]:
115
- """Quickly summarize text with a lightweight prompt using Gemini-1.5 Flash."""
116
- try:
117
- # Validate inputs
118
- text = text.strip() if text else ""
119
- if not url:
120
- url = "https://example.com"
121
- try:
122
- parsed_url = urlparse(url)
123
- domain = parsed_url.netloc or "example.com"
124
- except Exception:
125
- logging.warning(f"Invalid URL: {url}. Using default domain.")
126
- domain = "example.com"
127
-
128
- # Check cache
129
- cache_key = f"quick_summarize_{hash(text + url)}"
130
- if cache_key in cache:
131
- logging.info(f"Cache hit for {cache_key}")
132
- return cache[cache_key]
133
-
134
- # Get Gemini API key
135
- api_key = os.getenv("GEMINI_API_KEY")
136
- if not api_key:
137
- logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
138
- raise ValueError("Gemini API key is required for summarization.")
139
-
140
- # Configure Gemini client
141
- genai.configure(api_key=api_key)
142
- model = genai.GenerativeModel('gemini-1.5-flash')
143
-
144
- # Handle short or empty text
145
- if len(text) < 20:
146
- logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
147
- text = f"Content from {url} about news, products, or services."
148
-
149
- # Lightweight prompt with chunking
150
- chunk_size = 1000
151
- text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
152
- summaries = []
153
-
154
- for chunk in text_chunks[:1]: # Limit to first 1000 chars for quick summary
155
- prompt = (
156
- f"Create a title (30-50 chars) and description (80-100 chars) for RCS messaging from this text. "
157
- f"Keep it engaging and relevant to {url}. Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
158
- )
159
-
160
- response = await model.generate_content_async(prompt)
161
- raw_content = response.text.strip()
162
- logging.info(f"Raw Gemini response (quick): {raw_content}")
163
-
164
- # Parse response with regex
165
- try:
166
- match = re.search(r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}', raw_content)
167
- if match:
168
- title = match.group(1)[:50]
169
- description = match.group(2)[:100]
170
- summaries.append({"title": title, "description": description})
171
- else:
172
- raise ValueError("Invalid JSON format in Gemini response")
173
- except Exception as e:
174
- logging.warning(f"Failed to parse Gemini response: {e}. Skipping chunk.")
175
- continue
176
-
177
- # Use first valid summary or fallback
178
- if summaries:
179
- result = summaries[0]
180
- else:
181
- logging.warning("No valid summaries generated. Using fallback.")
182
- result = {
183
- "title": "Quick Summary",
184
- "description": f"Check out content from {domain}."[:100]
185
- }
186
-
187
- # Ensure non-empty outputs
188
- if not result["title"].strip():
189
- result["title"] = "Quick Summary"
190
- if not result["description"].strip():
191
- result["description"] = f"Check out content from {domain}."[:100]
192
-
193
- cache[cache_key] = result
194
- logging.info(f"Quick summary - Title: {result['title']}, Description: {result['description']}")
195
- return result
196
-
197
- except Exception as e:
198
- logging.error(f"Error in quick summarize: {e}")
199
- domain = urlparse(url).netloc or "example.com"
200
- result = {
201
- "title": "Quick Summary",
202
- "description": f"Check out content from {domain}."[:100]
203
- }
204
- cache[cache_key] = result
205
  return result
 
1
+ import os
2
+ import re
3
+ from typing import Dict, Optional
4
+ import google.generativeai as genai
5
+ import logging
6
+ from dotenv import load_dotenv
7
+ from urllib.parse import urlparse
8
+ from cachetools import TTLCache
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
+ # Set up logging
14
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
15
+
16
+ # In-memory cache: 1000 items, 1-hour TTL
17
+ cache = TTLCache(maxsize=1000, ttl=3600)
18
+
19
def _news_fallback(domain: str) -> Dict[str, str]:
    """Build the generic title/description returned when no summary is available."""
    return {
        "title": "News Summary",
        "description": f"Discover news and insights from {domain}."[:100],
    }


def _parse_summary_payload(raw_content: str) -> Optional[Dict[str, str]]:
    """Extract a title/description pair from a raw Gemini reply.

    The reply is first decoded as JSON so that escaped quotes and key order
    are handled correctly; if that fails, the original regex is used as a
    best-effort fallback.

    Returns:
        A dict with non-empty ``title`` and ``description`` strings, or
        ``None`` when the reply contains no usable pair.
    """
    import json  # local import: only this helper needs it

    # Greedy match so a reply wrapped in code fences or prose still yields the object.
    braced = re.search(r'\{[\s\S]*\}', raw_content)
    if braced:
        try:
            payload = json.loads(braced.group(0))
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            payload = None
        if isinstance(payload, dict):
            title = payload.get("title")
            description = payload.get("description")
            if isinstance(title, str) and title and isinstance(description, str) and description:
                return {"title": title, "description": description}

    match = re.search(r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}', raw_content)
    if match:
        return {"title": match.group(1), "description": match.group(2)}
    return None


async def summarize_text(text: str, url: str = "") -> Dict[str, str]:
    """Summarize text into a title and description using Gemini-1.5 Flash.

    At most the first two 1000-character chunks of ``text`` are sent to the
    model, one request per chunk, stopping at the first reply that parses.
    Results are stored in the module-level ``cache``.

    Args:
        text: Content to summarize. Text shorter than 20 characters is
            replaced by a generic sentence built from ``url``.
        url: Source URL, used in the prompt and to derive the domain shown
            in fallback descriptions. Defaults to ``https://example.com``
            when empty.

    Returns:
        A dict with ``title`` and ``description`` keys. This function never
        raises: on any error (missing ``GEMINI_API_KEY``, API failure,
        invalid input) a generic fallback is returned instead.
    """
    # Bound before the try block so the error handler can always use it.
    domain = "example.com"
    try:
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except Exception:
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"

        text = text.strip() if text else ""

        # Hash the pair rather than the concatenation so ("ab", "c") and
        # ("a", "bc") cannot share a key.
        cache_key = f"summarize_{hash((text, url))}"
        cached = cache.get(cache_key)
        if cached is not None:
            logging.info(f"Cache hit for {cache_key}")
            return cached

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Only the first 2000 characters are considered, in 1000-char chunks.
        chunk_size = 1000
        max_chunks = 2
        limit = min(len(text), chunk_size * max_chunks)
        text_chunks = [text[i:i + chunk_size] for i in range(0, limit, chunk_size)]

        result = None
        for chunk in text_chunks:
            prompt = (
                f"Summarize the following text into a title (30-50 characters) and a description (80-100 characters) "
                f"for RCS messaging. Ensure titles are catchy and descriptions are engaging, relevant to the content, "
                f"and suitable for a news, product, or service context inferred from the URL ({url}). "
                f"Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
            )

            response = await model.generate_content_async(prompt)
            raw_content = response.text.strip()
            logging.info(f"Raw Gemini response: {raw_content}")

            result = _parse_summary_payload(raw_content)
            if result is not None:
                # The first valid summary wins; skip the remaining API calls.
                break
            logging.warning(
                "Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk."
            )

        fallback = _news_fallback(domain)
        if result is None:
            logging.warning("No valid summaries generated. Using fallback.")
            result = fallback

        # Ensure non-empty outputs
        if not result["title"].strip():
            result["title"] = fallback["title"]
        if not result["description"].strip():
            result["description"] = fallback["description"]

        cache[cache_key] = result
        logging.info(f"Summary - Title: {result['title']}, Description: {result['description']}")
        return result

    except Exception as e:
        logging.error(f"Error summarizing text: {e}")
        # Not cached: the failure may be transient, and cache_key may not
        # exist yet if the error happened during input validation.
        return _news_fallback(domain)
113
+
114
def _quick_fallback(domain: str) -> Dict[str, str]:
    """Build the generic title/description returned when no quick summary is available."""
    return {
        "title": "Quick Summary",
        "description": f"Check out content from {domain}.",
    }


def _parse_quick_summary_payload(raw_content: str) -> Optional[Dict[str, str]]:
    """Extract a title/description pair from a raw Gemini reply.

    The reply is first decoded as JSON so that escaped quotes and key order
    are handled correctly; if that fails, the original regex is used as a
    best-effort fallback.

    Returns:
        A dict with non-empty ``title`` and ``description`` strings, or
        ``None`` when the reply contains no usable pair.
    """
    import json  # local import: only this helper needs it

    # Greedy match so a reply wrapped in code fences or prose still yields the object.
    braced = re.search(r'\{[\s\S]*\}', raw_content)
    if braced:
        try:
            payload = json.loads(braced.group(0))
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            payload = None
        if isinstance(payload, dict):
            title = payload.get("title")
            description = payload.get("description")
            if isinstance(title, str) and title and isinstance(description, str) and description:
                return {"title": title, "description": description}

    match = re.search(r'\{[\s\S]*"title":\s*"([^"]+)"[\s\S]*"description":\s*"([^"]+)"[\s\S]*\}', raw_content)
    if match:
        return {"title": match.group(1), "description": match.group(2)}
    return None


async def quick_summarize(text: str, url: str = "") -> Dict[str, str]:
    """Quickly summarize text with a lightweight prompt using gemini-1.5-pro.

    Only the first 1000 characters of ``text`` are sent to the model, in a
    single request. Results are stored in the module-level ``cache``.

    Args:
        text: Content to summarize. Text shorter than 20 characters is
            replaced by a generic sentence built from ``url``.
        url: Source URL, used in the prompt and to derive the domain shown
            in fallback descriptions. Defaults to ``https://example.com``
            when empty.

    Returns:
        A dict with ``title`` and ``description`` keys. This function never
        raises: on any error (missing ``GEMINI_API_KEY``, API failure,
        invalid input) a generic fallback is returned instead.
    """
    # Bound before the try block so the error handler can always use it.
    domain = "example.com"
    try:
        if not url:
            url = "https://example.com"
        try:
            domain = urlparse(url).netloc or "example.com"
        except Exception:
            logging.warning(f"Invalid URL: {url}. Using default domain.")
            domain = "example.com"

        text = text.strip() if text else ""

        # Hash the pair rather than the concatenation so ("ab", "c") and
        # ("a", "bc") cannot share a key.
        cache_key = f"quick_summarize_{hash((text, url))}"
        cached = cache.get(cache_key)
        if cached is not None:
            logging.info(f"Cache hit for {cache_key}")
            return cached

        # Get Gemini API key
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            logging.error("Gemini API key not found. Please set GEMINI_API_KEY in .env file.")
            raise ValueError("Gemini API key is required for summarization.")

        # Configure Gemini client
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro')

        # Handle short or empty text
        if len(text) < 20:
            logging.warning(f"Text too short ({len(text)} chars): '{text}'. Using URL context.")
            text = f"Content from {url} about news, products, or services."

        # Quick mode looks at the first 1000 characters only. The text is
        # never empty here because short input was replaced above.
        chunk = text[:1000]
        prompt = (
            f"Create a title (30-50 chars) and description (80-100 chars) for RCS messaging from this text. "
            f"Keep it engaging and relevant to {url}. Output as JSON:\n{{\"title\": \"[title]\", \"description\": \"[description]\"}}\n\nText: {chunk}"
        )

        response = await model.generate_content_async(prompt)
        raw_content = response.text.strip()
        logging.info(f"Raw Gemini response (quick): {raw_content}")

        fallback = _quick_fallback(domain)
        result = _parse_quick_summary_payload(raw_content)
        if result is None:
            logging.warning(
                "Failed to parse Gemini response: Invalid JSON format in Gemini response. Skipping chunk."
            )
            logging.warning("No valid summaries generated. Using fallback.")
            result = fallback

        # Ensure non-empty outputs
        if not result["title"].strip():
            result["title"] = fallback["title"]
        if not result["description"].strip():
            result["description"] = fallback["description"]

        cache[cache_key] = result
        logging.info(f"Quick summary - Title: {result['title']}, Description: {result['description']}")
        return result

    except Exception as e:
        logging.error(f"Error in quick summarize: {e}")
        # Not cached: the failure may be transient, and cache_key may not
        # exist yet if the error happened during input validation.
        return _quick_fallback(domain)