Pamudu13 committed
Commit 4dd1d1c · verified · 1 Parent(s): 4205a9b

Update app.py

Files changed (1)
  1. app.py +160 -220
app.py CHANGED
@@ -1,175 +1,134 @@
 from flask import Flask, jsonify, request
-import requests
 from bs4 import BeautifulSoup
-import os
-import re
-import urllib.parse
 import time
 import random
-import base64
 from io import BytesIO
-from urllib.parse import urlparse
-import html2text
 import json
 
 app = Flask(__name__)
 
-def get_google_search_results(query, num_results=5):
-    """Get search results from Google with rotating User-Agents"""
-    user_agents = [
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
-        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
-    ]
-
-    headers = {
-        'User-Agent': random.choice(user_agents),
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1'
-    }
-
-    # Add search parameters
-    params = {
-        'q': query,
-        'num': num_results + 5, # Request extra results in case some fail
-        'hl': 'en',
-        'safe': 'active'
-    }
-
-    try:
-        response = requests.get(
-            'https://www.google.com/search',
-            headers=headers,
-            params=params,
-            timeout=30
-        )
-        response.raise_for_status()
-        return response.text
-    except Exception as e:
-        print(f"Search error: {str(e)}")
-        return None
-
-def search_images(query, num_images=5):
-    """Enhanced image search function"""
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'Accept-Encoding': 'gzip, deflate',
-    }
-
-    # Format the query for URL
-    formatted_query = urllib.parse.quote(query)
-
-    # Multiple search URLs to try
-    search_urls = [
-        f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active",
-        f"https://www.bing.com/images/search?q={formatted_query}&safesearch=strict",
-    ]
 
     results = []
-    for search_url in search_urls:
-        if len(results) >= num_images:
-            break
-
-        try:
-            response = requests.get(search_url, headers=headers, timeout=30)
-            response.raise_for_status()
-
-            # Find image URLs using multiple regex patterns
-            patterns = [
-                r'https?://[^"\']*?(?:jpg|jpeg|png|gif)',
-                r'"ou":"(https?://[^"]*?(?:jpg|jpeg|png|gif))"',
-                r'murl":"(.*?)"'
-            ]
-
-            image_urls = []
-            for pattern in patterns:
-                found_urls = re.findall(pattern, response.text)
-                image_urls.extend(found_urls if isinstance(found_urls[0], str) else found_urls[0] for found_urls in [found_urls] if found_urls)
-
-            # Remove duplicates while preserving order
-            image_urls = list(dict.fromkeys(image_urls))
-
-            for img_url in image_urls:
-                if len(results) >= num_images:
-                    break
-
-                try:
-                    # Skip unwanted URLs
-                    if any(domain in img_url.lower() for domain in ['gstatic.com', 'google.com', 'bing.com']):
-                        continue
-
-                    # Download image with timeout
-                    img_response = requests.get(img_url, headers=headers, timeout=10)
-                    img_response.raise_for_status()
-
-                    # Verify content type
-                    content_type = img_response.headers.get('Content-Type', '')
-                    if not content_type.startswith('image/'):
-                        continue
-
-                    # Check minimum image size (1KB)
-                    if len(img_response.content) < 1024:
-                        continue
-
-                    # Convert to base64
-                    image_base64 = base64.b64encode(img_response.content).decode('utf-8')
-
-                    results.append({
-                        'image_url': img_url,
-                        'base64_data': f"data:{content_type};base64,{image_base64}",
-                        'size': len(img_response.content),
-                        'content_type': content_type
-                    })
-
-                    # Random delay between downloads
-                    time.sleep(random.uniform(0.5, 1.5))
-
-                except Exception as e:
-                    print(f"Error downloading image {img_url}: {str(e)}")
                        continue
 
-        except Exception as e:
-            print(f"Error with search URL {search_url}: {str(e)}")
-            continue
 
-    return results
 
-def scrape_website(url, headers):
-    """Enhanced website scraping function"""
-    try:
-        response = requests.get(url, headers=headers, timeout=15)
-        response.raise_for_status()
 
-        # Detect and handle encoding
-        if 'charset' in response.headers.get('content-type', '').lower():
-            response.encoding = response.apparent_encoding
 
-        soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove unwanted elements
-        for element in soup.find_all(['script', 'style', 'nav', 'footer', 'iframe', 'ad', '.advertisement']):
-            element.decompose()
 
-        # Get meta information
         meta_data = {
-            'title': '',
             'description': '',
             'keywords': '',
             'author': '',
             'published_date': ''
         }
 
-        # Title
-        if soup.title:
-            meta_data['title'] = soup.title.string
-
         # Meta tags
         meta_tags = {
             'description': ['description', 'og:description'],
@@ -185,19 +144,15 @@ def scrape_website(url, headers):
                     meta_data[key] = meta_tag.get('content')
                     break
 
-        # Extract main content
         main_content = ''
-        content_tags = soup.find_all(['p', 'article', 'section', 'div'], class_=re.compile(r'(content|article|post|entry)'))
 
         if content_tags:
-            for tag in content_tags:
-                main_content += ' ' + tag.get_text()
         else:
-            # Fallback to all paragraph tags
-            main_content = ' '.join(p.get_text() for p in soup.find_all('p'))
-
-        # Clean the text
-        main_content = clean_text(main_content)
 
         return {
            'title': clean_text(meta_data['title']),
@@ -205,7 +160,7 @@ def scrape_website(url, headers):
            'keywords': clean_text(meta_data['keywords']),
            'author': clean_text(meta_data['author']),
            'published_date': meta_data['published_date'],
-           'content': main_content[:2000], # First 2000 characters
            'url': url,
            'domain': get_domain(url)
        }
@@ -214,95 +169,75 @@ def scrape_website(url, headers):
        print(f"Error scraping {url}: {str(e)}")
        return None
 
 def clean_text(text):
-    """Enhanced text cleaning function"""
    if not text:
        return ''
 
-    # Convert to string if not already
    text = str(text)
-
-    # Remove HTML tags
-    text = re.sub(r'<[^>]+>', '', text)
-
-    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
-
-    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?-]', '', text)
-
-    # Remove multiple punctuation
-    text = re.sub(r'([.,!?])\1+', r'\1', text)
-
    return text.strip()
 
 def get_domain(url):
-    """Extract and format domain from URL"""
    try:
-        parsed_uri = urlparse(url)
-        domain = parsed_uri.netloc
-        # Remove 'www.' if present
-        domain = re.sub(r'^www\.', '', domain)
-        return domain
    except:
        return url
 
-def search_and_scrape(query, num_results=5):
-    """Enhanced search and scrape function"""
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-    }
-
-    # Get search results HTML
-    search_html = get_google_search_results(query, num_results)
-    if not search_html:
-        return []
-
-    soup = BeautifulSoup(search_html, 'html.parser')
-    search_results = []
-    seen_domains = set()
-
-    # Find all search result divs
-    for result in soup.find_all('div', class_=['g', 'tF2Cxc']):
-        if len(search_results) >= num_results:
-            break
-
-        try:
-            # Find the link
-            link = result.find('a')
-            if not link:
-                continue
-
-            href = link.get('href', '')
-
-            # Basic URL validation
-            if not href.startswith('http') or any(x in href.lower() for x in ['google.', 'youtube.', 'facebook.', 'twitter.']):
-                continue
-
-            # Check for duplicate domains
-            domain = get_domain(href)
-            if domain in seen_domains:
-                continue
-            seen_domains.add(domain)
-
-            # Random delay between requests
-            time.sleep(random.uniform(1, 2))
-
-            # Scrape the website
-            site_data = scrape_website(href, headers)
-            if site_data and site_data['content']:
-                search_results.append(site_data)
-
-        except Exception as e:
-            print(f"Error processing search result: {str(e)}")
-            continue
-
-    return search_results
-
 @app.route('/search_images', methods=['GET'])
 def api_search_images():
-    """API endpoint for image search"""
    try:
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))
@@ -330,7 +265,6 @@ def api_search_images():
 
 @app.route('/scrape_sites', methods=['GET'])
 def api_scrape_sites():
-    """API endpoint for web scraping"""
    try:
        query = request.args.get('query', '')
        num_results = int(request.args.get('num_results', 5))
@@ -356,6 +290,12 @@ def api_scrape_sites():
            'error': str(e)
        }), 500
 
 if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
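
Both the removed implementation above and the replacement listed below return each image as a base64 data: URL in the base64_data field of the results list. For reference, a minimal consumer-side sketch for turning that field back into a PIL image; only the field name comes from the diff, the helper name and sample usage are hypothetical.

import base64
from io import BytesIO
from PIL import Image

def decode_result_image(result):
    """Decode the 'base64_data' data: URL from a search_images result into a PIL Image."""
    data_url = result['base64_data']              # e.g. "data:image/jpeg;base64,/9j/4AAQ..."
    header, b64_payload = data_url.split(',', 1)  # split the "data:...;base64" prefix from the payload
    raw_bytes = base64.b64decode(b64_payload)     # undo the base64 encoding
    return Image.open(BytesIO(raw_bytes))         # reopen the original image bytes

# Hypothetical usage with one entry from the results list:
# img = decode_result_image(results[0])
# print(img.format, img.size)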
 
 
 from flask import Flask, jsonify, request
+import undetected_chromedriver as uc
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
 from bs4 import BeautifulSoup
+import base64
 import time
 import random
+import re
+import requests
 from io import BytesIO
+from PIL import Image
 import json
+import threading
+from urllib.parse import quote, urlparse
+import html2text
 
 app = Flask(__name__)
 
+# Thread-local storage for the browser instance
+thread_local = threading.local()
+
+def get_browser():
+    """Get or create thread-local browser instance"""
+    if not hasattr(thread_local, "browser"):
+        chrome_options = uc.ChromeOptions()
+        chrome_options.add_argument('--headless')
+        chrome_options.add_argument('--no-sandbox')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+        chrome_options.add_argument('--disable-gpu')
+        chrome_options.add_argument('--window-size=1920,1080')
+
+        thread_local.browser = uc.Chrome(options=chrome_options)
+    return thread_local.browser
+
+def search_images(query, num_images=5):
+    """Enhanced image search using selenium"""
+    browser = get_browser()
     results = []
+
+    try:
+        # Google Images search
+        search_url = f"https://www.google.com/search?q={quote(query)}&tbm=isch"
+        browser.get(search_url)
+
+        # Wait for images to load
+        time.sleep(2)
+
+        # Scroll to load more images
+        for _ in range(3):
+            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            time.sleep(1)
+
+        # Find image elements
+        image_elements = browser.find_elements(By.CSS_SELECTOR, 'img.rg_i')
+
+        for img in image_elements[:num_images]:
+            try:
+                # Click image to get full resolution
+                img.click()
+                time.sleep(1)
+
+                # Wait for full resolution image
+                wait = WebDriverWait(browser, 10)
+                full_img = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'img.n3VNCb')))
+
+                img_url = full_img.get_attribute('src')
+
+                # Skip data URLs and unwanted domains
+                if (img_url.startswith('data:') or
+                    any(domain in img_url.lower() for domain in ['gstatic.com', 'google.com'])):
                    continue
 
+                # Download and process image
+                response = requests.get(img_url, timeout=10)
+                img_content = response.content
+
+                # Verify it's an image and get format
+                img = Image.open(BytesIO(img_content))
+                img_format = img.format.lower()
+
+                # Convert to base64
+                buffered = BytesIO()
+                img.save(buffered, format=img_format)
+                img_base64 = base64.b64encode(buffered.getvalue()).decode()
+
+                results.append({
+                    'image_url': img_url,
+                    'base64_data': f"data:image/{img_format};base64,{img_base64}",
+                    'size': len(img_content),
+                    'dimensions': img.size,
+                    'format': img_format
+                })
+
+                time.sleep(random.uniform(0.5, 1.0))
+
+            except Exception as e:
+                print(f"Error processing image: {str(e)}")
+                continue
+
+            if len(results) >= num_images:
+                break
+
+    except Exception as e:
+        print(f"Search error: {str(e)}")
+
+    return results
 
+def scrape_website(url):
+    """Enhanced website scraping using selenium"""
+    browser = get_browser()
+
+    try:
+        browser.get(url)
+        time.sleep(2) # Wait for dynamic content
+
+        # Get page source after JavaScript execution
+        page_source = browser.page_source
+        soup = BeautifulSoup(page_source, 'html.parser')
+
+        # Extract metadata and content
         meta_data = {
+            'title': soup.title.string if soup.title else '',
            'description': '',
            'keywords': '',
            'author': '',
            'published_date': ''
        }
 
        # Meta tags
        meta_tags = {
            'description': ['description', 'og:description'],
[...]
                    meta_data[key] = meta_tag.get('content')
                    break
 
+        # Get main content
        main_content = ''
+        content_tags = soup.find_all(['article', 'main', 'div'],
+                                     class_=re.compile(r'(content|article|post|entry)'))
 
        if content_tags:
+            main_content = ' '.join(tag.get_text(strip=True) for tag in content_tags)
        else:
+            main_content = ' '.join(p.get_text(strip=True) for p in soup.find_all('p'))
 
        return {
            'title': clean_text(meta_data['title']),
[...]
            'keywords': clean_text(meta_data['keywords']),
            'author': clean_text(meta_data['author']),
            'published_date': meta_data['published_date'],
+            'content': clean_text(main_content)[:2000],
            'url': url,
            'domain': get_domain(url)
        }
[...]
        print(f"Error scraping {url}: {str(e)}")
        return None
 
+def search_and_scrape(query, num_results=5):
+    """Enhanced search and scrape using selenium"""
+    browser = get_browser()
+    results = []
+
+    try:
+        # Perform Google search
+        search_url = f"https://www.google.com/search?q={quote(query)}&num={num_results + 5}"
+        browser.get(search_url)
+        time.sleep(2)
+
+        # Get search results
+        search_results = browser.find_elements(By.CSS_SELECTOR, 'div.g')
+        seen_domains = set()
+
+        for result in search_results:
+            if len(results) >= num_results:
+                break
+
+            try:
+                link = result.find_element(By.CSS_SELECTOR, 'a')
+                href = link.get_attribute('href')
+
+                # Skip unwanted URLs
+                if not href or not href.startswith('http') or \
+                   any(x in href.lower() for x in ['google.', 'youtube.', 'facebook.', 'twitter.']):
+                    continue
+
+                # Check for duplicate domains
+                domain = get_domain(href)
+                if domain in seen_domains:
+                    continue
+                seen_domains.add(domain)
+
+                # Scrape website
+                site_data = scrape_website(href)
+                if site_data and site_data['content']:
+                    results.append(site_data)
+
+                time.sleep(random.uniform(1, 2))
+
+            except Exception as e:
+                print(f"Error processing result: {str(e)}")
+                continue
+
+    except Exception as e:
+        print(f"Search error: {str(e)}")
+
+    return results
+
 def clean_text(text):
+    """Clean extracted text"""
    if not text:
        return ''
 
    text = str(text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    return text.strip()
 
 def get_domain(url):
+    """Extract domain from URL"""
    try:
+        return urlparse(url).netloc.replace('www.', '')
    except:
        return url
 
 @app.route('/search_images', methods=['GET'])
 def api_search_images():
    try:
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))
[...]
 
 @app.route('/scrape_sites', methods=['GET'])
 def api_scrape_sites():
    try:
        query = request.args.get('query', '')
        num_results = int(request.args.get('num_results', 5))
[...]
            'error': str(e)
        }), 500
 
+@app.teardown_appcontext
+def cleanup(exception=None):
+    """Clean up browser instances"""
+    if hasattr(thread_local, "browser"):
+        thread_local.browser.quit()
+
 if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
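
The two routes kept by this commit, /search_images and /scrape_sites, read a query parameter plus num_images or num_results, and the app listens on port 5000 per app.run() above. A minimal client sketch against a locally running instance; the base URL, example queries, and the shape of the printed JSON are assumptions, only the route and parameter names come from the diff.

import requests

BASE_URL = "http://localhost:5000"  # assumes app.py is running locally on the port from app.run()

# Image search endpoint: 'query' and 'num_images' are the parameters read in api_search_images
images = requests.get(f"{BASE_URL}/search_images",
                      params={"query": "mountain landscape", "num_images": 3},
                      timeout=120)
print(images.status_code, images.json())

# Search-and-scrape endpoint: 'query' and 'num_results' are the parameters read in api_scrape_sites
sites = requests.get(f"{BASE_URL}/scrape_sites",
                     params={"query": "open source web scraping", "num_results": 3},
                     timeout=120)
print(sites.status_code, sites.json())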