import gradio as gr
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import random
import time
import os
import subprocess
import chromedriver_autoinstaller

# List of user agents to reduce the chance of bot detection
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"
]

# Install Chrome and its dependencies dynamically
def install_chrome():
    try:
        # Update the package list and install Chrome dependencies
        subprocess.run("apt-get update", shell=True, check=True)
        subprocess.run(
            "apt-get install -y libxss1 libappindicator1 libindicator7 fonts-liberation libnss3 xdg-utils unzip",
            shell=True, check=True
        )
        # Download and install Chrome
        subprocess.run(
            "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb",
            shell=True, check=True
        )
        subprocess.run(
            "dpkg -i google-chrome-stable_current_amd64.deb || apt-get install -f -y",
            shell=True, check=True
        )
        # Install a ChromeDriver matching the installed Chrome version
        chromedriver_autoinstaller.install()
    except Exception as e:
        raise Exception(f"Failed to install Chrome: {str(e)}")

# Initialize a headless Selenium Chrome driver
def get_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run without a visible browser window
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # Spoof the user agent (Selenium has no headers dict, so set it via a Chrome flag)
    chrome_options.add_argument(f"--user-agent={random.choice(USER_AGENTS)}")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

# Scrape laptop data from a Flipkart category/search page
def scrape_flipkart(url):
    try:
        # Ensure Chrome is installed
        if not os.path.exists("/usr/bin/google-chrome"):
            install_chrome()

        # Set up the Selenium driver and load the page
        driver = get_driver()
        driver.get(url)
        time.sleep(5)  # Wait for JavaScript to render the listings

        # Get the page source and parse it with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")
        driver.quit()  # Close the driver

        # Lists to store scraped data
        products = []
        prices = []
        ratings = []

        # Find laptop items (adjust class names if Flipkart changes its HTML structure)
        items = soup.find_all("div", class_="_1AtVbE")  # Parent div for each product

        for item in items:
            # Product name
            name_tag = item.find("div", class_="_4rR01T")
            name = name_tag.text.strip() if name_tag else "N/A"

            # Price
            price_tag = item.find("div", class_="_30jeq3")
            price = price_tag.text.strip() if price_tag else "N/A"

            # Rating
            rating_tag = item.find("div", class_="_3LWZlK")
            rating = rating_tag.text.strip() if rating_tag else "N/A"

            if name != "N/A":  # Only append valid entries
                products.append(name)
                prices.append(price)
                ratings.append(rating)

        # Create a DataFrame from the scraped fields
        df = pd.DataFrame({
            "Product Name": products,
            "Price": prices,
            "Rating": ratings
        })

        # Save to CSV
        csv_path = "flipkart_laptops.csv"
        df.to_csv(csv_path, index=False, encoding="utf-8")

        return f"Scraped {len(products)} laptops successfully!", csv_path

    except Exception as e:
        return f"Error: {str(e)}", None

# Gradio interface
with gr.Blocks(title="Flipkart Laptop Scraper") as demo:
    gr.Markdown("# Flipkart Laptop Scraper")
    gr.Markdown("Enter a Flipkart laptop category URL to scrape data and download it as a CSV.")

    url_input = gr.Textbox(
        label="Flipkart URL",
        placeholder="e.g., https://www.flipkart.com/laptops/pr?sid=6bo,b5g"
    )
    scrape_btn = gr.Button("Scrape Data")
    output_text = gr.Textbox(label="Status")
    output_file = gr.File(label="Download CSV")

    scrape_btn.click(
        fn=scrape_flipkart,
        inputs=url_input,
        outputs=[output_text, output_file]
    )

demo.launch()