import gradio as gr
import asyncio
import aiohttp
import logging
import io
import json
import os
import numpy as np
from newspaper import Article
import PyPDF2
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from enum import Enum
from groq import Groq
from typing import List, Dict, Any, Set
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Initialize Groq client
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

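# Ranking strategies for scraped documents: classic BM25 keyword scoring, dense
# sentence-embedding cosine similarity (exposed in the UI as "TF-IDF"), or an
# equally weighted combination of the two normalised scores.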
class ScoringMethod(Enum):
    BM25 = "bm25"
    TFIDF = "tfidf"
    COMBINED = "combined"

async def get_available_engines(session, base_url, headers):
    """Fetch available search engines from SearxNG instance."""
    try:
        # First try the search endpoint to get engines
        params = {
            "q": "test",
            "format": "json",
            "engines": "all"
        }
        async with session.get(f"{base_url}/search", headers=headers, params=params) as response:
            data = await response.json()
            available_engines = set()
            # SearxNG's JSON API lists hits under "results"; each hit records the engine it came from
            for result in data.get("results", []):
                if isinstance(result, dict) and "engine" in result:
                    available_engines.add(result["engine"])
            
            # If no engines found, try alternate endpoint
            if not available_engines:
                async with session.get(f"{base_url}/engines", headers=headers) as response:
                    engines_data = await response.json()
                    available_engines = set(engine["name"] for engine in engines_data if engine.get("enabled", True))
            
            return list(available_engines)
    except Exception as e:
        logging.error(f'Error fetching search engines: {e}')
        # Return default engines if unable to fetch
        return ["google", "bing", "duckduckgo", "brave", "wikipedia"]

def select_search_engines(available_engines: List[str]) -> Set[str]:
    """Let user select search engines from available options."""
    print("\nAvailable search engines:")
    engines_list = sorted(available_engines)
    for i, engine in enumerate(engines_list, 1):
        print(f"{i}. {engine}")
    
    print("\nEnter the numbers of engines you want to use (comma-separated), or 'all' for all engines:")
    selection = input("Your selection: ").strip().lower()
    
    if selection == 'all':
        return set(engines_list)
    
    try:
        selected_indices = [int(idx.strip()) - 1 for idx in selection.split(',')]
        return {engines_list[idx] for idx in selected_indices if 0 <= idx < len(engines_list)}
    except (ValueError, IndexError):
        logging.error("Invalid selection, using all engines as fallback")
        return set(engines_list)


logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')

async def scrape_url(url, max_chars):
    logging.info(f'Scraping URL: {url}')
    if url.endswith(".pdf"):
        return await scrape_pdf(url, max_chars)
    else:
        return await scrape_html(url, max_chars)

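# Note: newspaper's download()/parse() below are synchronous, blocking calls, so the
# async wrappers here do not actually overlap network I/O; wrapping the calls in
# asyncio.to_thread() would be one way to scrape pages concurrently.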
async def scrape_html(url, max_chars):
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text[:max_chars]
        publish_date = article.publish_date
        logging.info(f'Scraped HTML content from {url}')
        return {"content": text, "publish_date": publish_date.isoformat() if publish_date else None}
    except Exception as e:
        logging.error(f'Error scraping HTML content from {url}: {e}')
        return None

async def scrape_pdf(url, max_chars):
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                pdf_bytes = await response.read()
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        text = ""
        for page in pdf_reader.pages:
            # extract_text() can return None for pages with no extractable text
            text += page.extract_text() or ""
            if len(text) >= max_chars:
                break
        text = text[:max_chars]
        logging.info(f'Scraped PDF content from {url}')
        return {"content": text, "publish_date": None}
    except Exception as e:
        logging.error(f'Error scraping PDF content from {url}: {e}')
        return None

def normalize_scores(scores):
    """Normalize scores to [0, 1] range using min-max normalization"""
    if not isinstance(scores, np.ndarray):
        scores = np.array(scores)
    
    if len(scores) == 0:
        return []
    
    min_score = np.min(scores)
    max_score = np.max(scores)
    
    if max_score - min_score > 0:
        normalized = (scores - min_score) / (max_score - min_score)
    else:
        normalized = np.ones_like(scores)
    
    return normalized.tolist()

async def calculate_bm25(query, documents):
    """Calculate BM25 scores for documents."""
    try:
        if not documents:
            return []
            
        bm25 = BM25Okapi([doc.split() for doc in documents])
        scores = bm25.get_scores(query.split())
        return normalize_scores(scores)
        
    except Exception as e:
        logging.error(f'Error calculating BM25 scores: {e}')
        return [0] * len(documents)

async def calculate_tfidf(query, documents, measure="cosine"):
    """Calculate semantic similarity scores.

    Despite the name (kept to match the "TF-IDF" option in the UI), this uses dense
    sentence-transformer embeddings and cosine similarity rather than TF-IDF weighting.
    """
    try:
        if not documents:
            return []

        # The model is re-loaded on every call; cache it at module level if this becomes a bottleneck.
        model = SentenceTransformer('all-MiniLM-L6-v2')
        query_embedding = model.encode(query)
        document_embeddings = model.encode(documents)
        
        # Normalize embeddings
        query_embedding = query_embedding / np.linalg.norm(query_embedding)
        document_embeddings = document_embeddings / np.linalg.norm(document_embeddings, axis=1)[:, np.newaxis]

        if measure == "cosine":
            # Calculate cosine similarity
            scores = np.dot(document_embeddings, query_embedding)
            return normalize_scores(scores)
        else:
            raise ValueError("Unsupported similarity measure.")
            
    except Exception as e:
        logging.error(f'Error calculating TF-IDF scores: {e}')
        return [0] * len(documents)

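# Both score lists are min-max normalised per result set (see normalize_scores), so the
# default 0.5/0.5 weights below combine values that are already on a comparable 0-1 scale.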
def combine_scores(bm25_score, tfidf_score, weights=(0.5, 0.5)):
    """Combine scores using weighted average."""
    return weights[0] * bm25_score + weights[1] * tfidf_score

async def get_document_scores(query, documents, scoring_method: ScoringMethod):
    """Calculate document scores based on the chosen scoring method."""
    if not documents:
        return []
        
    if scoring_method == ScoringMethod.BM25:
        scores = await calculate_bm25(query, documents)
        return [(score, 0) for score in scores]
    elif scoring_method == ScoringMethod.TFIDF:
        scores = await calculate_tfidf(query, documents)
        return [(0, score) for score in scores]
    else:  # COMBINED
        bm25_scores = await calculate_bm25(query, documents)
        tfidf_scores = await calculate_tfidf(query, documents)
        return list(zip(bm25_scores, tfidf_scores))

def get_total_score(scores, scoring_method: ScoringMethod):
    """Calculate total score based on the scoring method."""
    bm25_score, tfidf_score = scores
    if scoring_method == ScoringMethod.BM25:
        return bm25_score
    elif scoring_method == ScoringMethod.TFIDF:
        return tfidf_score
    else:  # COMBINED
        return combine_scores(bm25_score, tfidf_score)

async def generate_summary(query: str, articles: List[Dict[str, Any]], temperature: float = 0.7) -> str:
    """
    Generate a summary of the articles using the Llama 3.1 70B model served by Groq.
    """
    try:
        # Format the articles into a structured JSON string
        json_input = json.dumps(articles, indent=2)
        
        system_prompt = """You are Sentinel, a world-class AI model who is expert at searching the web and answering user's queries. You are also an expert at summarizing web pages or documents and searching for content in them."""
        
        user_prompt = f"""
Please provide a comprehensive summary based on the following JSON input:
{json_input}

Original Query: {query}

Instructions:
1. Analyze the query and the provided documents.
2. Write a detailed, long, and complete research document that is informative and relevant to the user's query based on provided context.
3. Use this context to answer the user's query in the best way possible.
4. Use an unbiased, professional, and journalistic tone in your response.
5. Do not repeat text verbatim from the input.
6. Provide the answer in the response itself.
7. Use markdown to format your response.
8. Use bullet points to list information where appropriate.
9. Cite the answer using [number] notation along with the appropriate source URL embedded in the notation.
10. Place these citations at the end of the relevant sentences.
11. You can cite the same sentence multiple times if it's relevant.
12. Make sure the answer is not short and is informative.
13. Your response should be detailed, informative, accurate, and directly relevant to the user's query."""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        response = groq_client.chat.completions.create(
            messages=messages,
            model="llama-3.1-70b-versatile",  # Llama 3.1 70B served by Groq
            max_tokens=5000,
            temperature=temperature,
            top_p=0.9,
            presence_penalty=1.2,
            stream=False
        )
        
        return response.choices[0].message.content.strip()
        
    except Exception as e:
        logging.error(f'Error generating summary: {e}')
        return f"Error generating summary: {str(e)}"

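# Pipeline per chat message: query SearxNG -> scrape each hit (HTML or PDF) -> score the
# texts with the selected method -> keep articles above the score threshold, deduplicate
# by content -> summarise with the Groq model and append cited sources to the reply.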
class ChatBot:
    def __init__(self):
        self.scoring_method = ScoringMethod.COMBINED
        self.num_results = 10
        self.max_chars = 10000
        self.score_threshold = 0.8
        self.temperature = 0.1
        self.history = []
        self.base_url = "http://localhost:8888"
        self.headers = {
            # Prefer reading the key from the environment (SEARXNG_API_KEY is an illustrative name);
            # the original hard-coded value is kept only as a fallback.
            "X-Searx-API-Key": os.getenv("SEARXNG_API_KEY",
                                         "f9f07f93b37b8483aadb5ba717f556f3a4ac507b281b4ca01e6c6288aa3e3ae5")
        }
        self.default_engines = ["google", "bing", "duckduckgo", "brave"]

    async def get_search_results(self, 
                               query: str,
                               num_results: int,
                               max_chars: int,
                               score_threshold: float,
                               temperature: float,
                               scoring_method_str: str,
                               selected_engines: List[str]) -> str:
        try:
            # Convert scoring method string to enum
            scoring_method_map = {
                "BM25": ScoringMethod.BM25,
                "TF-IDF": ScoringMethod.TFIDF,
                "Combined": ScoringMethod.COMBINED
            }
            self.scoring_method = scoring_method_map[scoring_method_str]

            async with aiohttp.ClientSession() as session:
                # Use the selected engines from the interface
                logging.info(f'Using engines: {", ".join(selected_engines)}')
                logging.info(f'Parameters: Results={num_results}, Chars={max_chars}, Threshold={score_threshold}, Temp={temperature}, Method={scoring_method_str}')
                
                # Perform search
                params = {
                    "q": query,
                    "format": "json",
                    "engines": ",".join(selected_engines),
                    "limit": num_results
                }
                
                try:
                    async with session.get(f"{self.base_url}/search", headers=self.headers, params=params) as response:
                        data = await response.json()
                except Exception as e:
                    return f"Error: Could not connect to search service. Please check if SearxNG is running at {self.base_url}. Error: {str(e)}"

                if "results" not in data or not data["results"]:
                    return "No results found."

                results = data["results"][:num_results]
                tasks = [scrape_url(result["url"], max_chars) for result in results]
                scraped_data = await asyncio.gather(*tasks)

                valid_results = [(result, article) 
                                for result, article in zip(results, scraped_data) 
                                if article is not None]
                
                if not valid_results:
                    return "No valid articles found after scraping."

                results, scraped_data = zip(*valid_results)
                contents = [article["content"] for article in scraped_data]
                
                scores = await get_document_scores(query, contents, self.scoring_method)

                scored_articles = []
                for i, (score_tuple, article) in enumerate(zip(scores, scraped_data)):
                    total_score = get_total_score(score_tuple, self.scoring_method)
                    if total_score >= score_threshold:  # use the threshold passed in from the UI
                        scored_articles.append({
                            "url": results[i]["url"],
                            "title": results[i]["title"],
                            "content": article["content"],
                            "publish_date": article["publish_date"],
                            "score": round(total_score, 4),
                            "bm25_score": round(score_tuple[0], 4),
                            "tfidf_score": round(score_tuple[1], 4),
                            "engine": results[i].get("engine", "unknown")
                        })

                scored_articles.sort(key=lambda x: x["score"], reverse=True)
                unique_articles = []
                seen_content = set()
                
                for article in scored_articles:
                    if article["content"] not in seen_content:
                        seen_content.add(article["content"])
                        unique_articles.append(article)

                # Generate summary using the Groq API (use the temperature passed in from the UI)
                summary = await generate_summary(query, unique_articles, temperature)

                # Format the response for chat
                response = f"**Search Parameters:**\n"
                response += f"- Results: {num_results}\n"
                response += f"- Max Characters: {max_chars}\n"
                response += f"- Score Threshold: {score_threshold}\n"
                response += f"- Temperature: {temperature}\n"
                response += f"- Scoring Method: {scoring_method_str}\n"
                response += f"- Search Engines: {', '.join(selected_engines)}\n\n"
                response += f"**Summary of Search Results:**\n\n{summary}\n\n"
                response += "\n**Sources:**\n"
                for i, article in enumerate(unique_articles, 1):
                    response += f"{i}. [{article['title']}]({article['url']}) (Score: {article['score']})\n"
                
                return response

        except Exception as e:
            logging.error(f'Error in search_and_summarize: {e}')
            return f"Error occurred: {str(e)}"

    def chat(self, 
             message: str, 
             history: List[List[str]], 
             num_results: int,
             max_chars: int,
             score_threshold: float,
             temperature: float,
             scoring_method: str,
             engines: List[str]) -> str:
        """
        Process chat messages and return responses with custom parameters.
        """
        # Run the async search function in the sync context
        response = asyncio.run(self.get_search_results(
            message,
            num_results,
            max_chars,
            score_threshold,
            temperature,
            scoring_method,
            engines
        ))
        return response

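# gr.ChatInterface passes the values of `additional_inputs` to the chat function
# positionally, after `message` and `history`, so the widget order below must match
# the parameter order of ChatBot.chat.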
def create_gradio_interface() -> gr.ChatInterface:
    chatbot = ChatBot()
    
    # Create the interface with advanced styling
    iface = gr.ChatInterface(
        chatbot.chat,
        title="Web Scraper for News with Sentinel AI",
        description="Ask Sentinel any question. It will search the web for recent information or use its knowledge base as appropriate.",
        theme=gr.Theme.from_hub("allenai/gradio-theme"),
        additional_inputs=[
            gr.Slider(minimum=5, maximum=30, value=10, step=1, label="Number of Results"),
            gr.Slider(minimum=1000, maximum=50000, value=10000, step=1000, label="Max Characters per Article"),
            gr.Slider(minimum=0.0, maximum=1.0, value=0.8, step=0.05, label="Score Threshold"),
            gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.05, label="Temperature"),
            gr.Radio(["BM25", "TF-IDF", "Combined"], value="Combined", label="Scoring Method"),
            gr.CheckboxGroup(
                choices=["google", "bing", "duckduckgo", "brave", "wikipedia"],
                value=["google", "bing", "duckduckgo"],
                label="Search Engines"
            )
        ],
        additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
        retry_btn="Retry",
        undo_btn="Undo",
        clear_btn="Clear",
        chatbot=gr.Chatbot(
            show_copy_button=True,
            likeable=True,
            layout="bubble",
            height=500,
        )
    )
    
    return iface

def create_parameter_description():
    return """
    ### Parameter Descriptions
    
    - **Number of Results**: Number of search results to fetch
    - **Max Characters**: Maximum characters to analyze per article
    - **Score Threshold**: Minimum relevance score (0-1) for including articles
    - **Temperature**: Controls creativity in summary generation (0=focused, 1=creative)
    - **Scoring Method**: Algorithm for ranking article relevance
        - BM25: Traditional keyword-based ranking
        - TF-IDF: Semantic similarity-based ranking
        - Combined: Balanced approach using both methods
    - **Search Engines**: Select which search engines to use
    """

if __name__ == "__main__":
    iface = create_gradio_interface()
    
    # Create the layout with two columns
    with gr.Blocks(theme=gr.Theme.from_hub("allenai/gradio-theme")) as demo:
        with gr.Row():
            with gr.Column(scale=3):
                iface.render()
            with gr.Column(scale=1):
                gr.Markdown(create_parameter_description())
    
    # Launch the interface
    demo.launch(server_name="0.0.0.0", server_port=7862, share=True)