from flask import Flask, request, render_template_string, Response, jsonify
from openai import OpenAI
import os
import json
from urllib.parse import quote, urljoin, urlparse
import html
import requests
from bs4 import BeautifulSoup

app = Flask(__name__)

# Initialize OpenAI client with API key and base URL from environment variables.
# NOTE(review): raises KeyError at import time if either variable is unset —
# intentional fail-fast, confirm this matches deployment expectations.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    base_url=os.environ["OPENAI_BASE_URL"]
)

# Define constants for pagination
RESULTS_PER_PAGE = 10
TOTAL_RESULTS = 40  # Increased to 40 for 4 pages


def fetch_search_results(query):
    """Fetch search results from the LLM without streaming, minimizing hallucinations.

    Args:
        query: the user's search string.

    Returns:
        (results, error) — on success `results` is a list of dicts with
        'title', 'snippet' and 'url' keys and `error` is None; on failure
        `results` is None and `error` is a human-readable message.
    """
    if not query.strip():
        return None, "Please enter a search query."

    prompt = f"""
You are a highly accurate search engine designed to provide reliable, factual, and verifiable results. For the given query '{query}', generate {TOTAL_RESULTS} search results. Each result must include: - 'title': A concise, accurate title directly relevant to the query, reflecting real-world content. - 'snippet': A short, factual summary (2-3 sentences) based strictly on real-world knowledge, avoiding any speculation. - 'url': A valid, existing URL from well-known, real websites or domains that you are certain exist based on your training data (e.g., wikipedia.org, python.org, nytimes.com, stackoverflow.com). Strictly prohibit any fictional, random, or fabricated URLs; only include URLs you can verify as real and accessible. Absolutely no made-up domains or paths are allowed—use only URLs tied to actual entities or resources. Format the response as a JSON array of objects, where each object has 'title', 'snippet', and 'url' fields. Ensure the results are diverse, directly relevant to the query, and free of any hallucinated or fabricated content. Before including a URL, verify it matches a real website or resource you know exists from your training data.
"""
    try:
        response = client.chat.completions.create(
            model="gemini-2.0-flash-lite",  # Adjust model name as needed
            messages=[
                {"role": "system", "content": "You are a helpful search engine."},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"}
        )
        content = response.choices[0].message.content
        results = json.loads(content)
        # Handle different possible JSON structures: either a bare array or an
        # object wrapping the array under a "results" key.
        if isinstance(results, dict) and "results" in results:
            results = results["results"]
        elif isinstance(results, list):
            pass
        else:
            return None, "Error: Unexpected JSON structure."
        return results, None
    except Exception as e:
        # Map the most common HTTP failures to actionable messages; everything
        # else is surfaced verbatim.
        error_msg = str(e)
        if "404" in error_msg:
            return None, f"Error 404: Model or endpoint not found. Check OPENAI_BASE_URL ({os.environ['OPENAI_BASE_URL']}) and model name."
        elif "401" in error_msg:
            return None, "Error 401: Invalid API key. Check OPENAI_API_KEY."
        else:
            return None, f"Error: {error_msg}"


@app.route('/check-url', methods=['GET'])
def check_url():
    """Check if a URL is valid (returns 200) or broken, and fetch favicon if valid.

    Returns JSON: {'broken': bool, 'favicon': str | None}. The favicon falls
    back to the '🌐' placeholder when the page is reachable but declares none.
    """
    url = request.args.get('url', '')
    if not url:
        return jsonify({'broken': True, 'favicon': None})

    # SECURITY NOTE(review): this endpoint fetches arbitrary client-supplied
    # URLs (SSRF exposure). Restricting to http(s) blocks file://, gopher://,
    # etc.; consider also denying private/internal address ranges.
    if urlparse(url).scheme not in ('http', 'https'):
        return jsonify({'broken': True, 'favicon': None})

    try:
        # Follow redirects and check the final status.
        response = requests.head(url, allow_redirects=True, timeout=10)
        if response.status_code == 200:
            # Fetch the page body only for valid URLs, to look for a favicon.
            html_response = requests.get(url, timeout=10)
            soup = BeautifulSoup(html_response.text, 'html.parser')
            favicon_tag = soup.find("link", rel=["icon", "shortcut icon"])
            favicon_url = favicon_tag['href'] if favicon_tag and 'href' in favicon_tag.attrs else None
            if favicon_url and not favicon_url.startswith('http'):
                favicon_url = urljoin(url, favicon_url)  # Resolve relative URLs
            return jsonify({'broken': False, 'favicon': favicon_url or '🌐'})
        return jsonify({'broken': True, 'favicon': None})
    except requests.RequestException:
        return jsonify({'broken': True, 'favicon': None})


def _render_page(query, body):
    """Wrap *body* HTML in the basic page layout with the search form.

    NOTE(review): the original template markup was lost when this source was
    mangled (tags stripped, only visible text survived); this is a minimal
    functional reconstruction — restore the real templates if available.
    """
    return f"""<!doctype html>
<html>
<head><title>LLM Search Engine</title></head>
<body>
<h1>LLM Search Engine</h1>
<form method="get" action="/">
  <input type="text" name="query" value="{html.escape(query, quote=True)}">
  <button type="submit" name="btn" value="LLM Search">LLM Search</button>
  <button type="submit" name="btn" value="I'm Feeling Lucky">I'm Feeling Lucky</button>
</form>
{body}
</body>
</html>"""


@app.route('/', methods=['GET'])
def search_page():
    """Serve the initial page or process search with a progress bar and URL validation.

    Query parameters:
        query: the search text (empty -> initial landing page).
        page:  1-based page number; non-numeric values fall back to 1.
        btn:   'LLM Search' (paginated results) or "I'm Feeling Lucky"
               (redirect straight to the first result's URL).
    """
    query = request.args.get('query', '')
    page = request.args.get('page', '1')
    btn = request.args.get('btn', 'LLM Search')
    try:
        page = int(page)
    except ValueError:
        page = 1

    # Initial page (no query yet).
    # FIX(review): pages are returned via Response instead of
    # render_template_string — the original passed f-string-interpolated LLM
    # output and user input through Jinja, so any '{{ ... }}' in that content
    # would have been evaluated server-side (template injection).
    if not query.strip():
        return Response(_render_page('', ''), mimetype="text/html")

    # Fetch results after showing the progress bar.
    results, error = fetch_search_results(query)
    if error:
        # FIX(review): error text is now HTML-escaped before interpolation.
        body = f'<p class="error">{html.escape(error)}</p>'
        return Response(_render_page(query, body), mimetype="text/html")

    # "I'm Feeling Lucky" redirects to the first URL.
    if btn == "I'm Feeling Lucky":
        first_url = results[0].get("url", "#") if results else "#"
        # FIX(review): escape the LLM-supplied URL before embedding it in
        # HTML attributes/text (was interpolated raw — XSS).
        safe_url = html.escape(first_url, quote=True)
        return Response(f"""<!doctype html>
<html>
<head><meta http-equiv="refresh" content="0; url={safe_url}"></head>
<body>Redirecting to <a href="{safe_url}">{safe_url}</a>...</body>
</html>""", mimetype="text/html")

    # Calculate pagination for "LLM Search".
    start_idx = (page - 1) * RESULTS_PER_PAGE
    end_idx = min(start_idx + RESULTS_PER_PAGE, len(results))
    total_pages = (len(results) + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE

    if start_idx >= len(results):
        return Response(_render_page(query, '<p>No more results to display.</p>'),
                        mimetype="text/html")

    # Generate the full results page for "LLM Search".
    parts = [f"<h2>Results for '{html.escape(query)}' (Page {page} of {total_pages})</h2>"]
    for result in results[start_idx:end_idx]:
        title = html.escape(result.get("title", "No title"))
        snippet = html.escape(result.get("snippet", "No snippet"))
        url = html.escape(result.get("url", "#"))
        parts.append(
            f'<div class="result">'
            f'<a href="{url}">{title}</a><br>'
            f'<span class="url">{url}</span>'
            f'<p>{snippet}</p>'
            f'</div>'
        )

    encoded_query = quote(query)
    prev_link = (f'<a href="/?query={encoded_query}&page={page - 1}">Previous</a>'
                 if page > 1 else '<span>Previous</span>')
    next_link = (f'<a href="/?query={encoded_query}&page={page + 1}">Next</a>'
                 if page < total_pages else '<span>Next</span>')
    parts.append(f'<div class="pagination">{prev_link} {next_link}</div>')

    return Response(_render_page(query, "\n".join(parts)), mimetype="text/html")


if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=int(os.environ.get("PORT", 5000)))