import gradio as gr
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import random
import time
import os
import subprocess
import chromedriver_autoinstaller
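
# Pipeline: install Chrome on first use -> load the listing page in headless
# Selenium -> parse the rendered HTML with BeautifulSoup -> write the rows to
# CSV -> expose the whole thing through a small Gradio UI.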

# Desktop user-agent strings; get_driver() applies one at random per session.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
]


def install_chrome():
    """Install Google Chrome via apt/dpkg.

    Assumes a Debian/Ubuntu host with root access (e.g. a container that
    ships without a browser); raises if any install step fails.
    """
    try:
        # Chrome's runtime dependencies.
        subprocess.run("apt-get update", shell=True, check=True)
        subprocess.run(
            "apt-get install -y libxss1 libappindicator1 libindicator7 fonts-liberation libnss3 xdg-utils unzip",
            shell=True, check=True,
        )

        # Fetch the stable Chrome package; if dpkg reports missing
        # dependencies, `apt-get install -f` resolves them.
        subprocess.run(
            "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb",
            shell=True, check=True,
        )
        subprocess.run(
            "dpkg -i google-chrome-stable_current_amd64.deb || apt-get install -f -y",
            shell=True, check=True,
        )
    except Exception as e:
        raise RuntimeError(f"Failed to install Chrome: {e}") from e


def get_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # Selenium takes no `headers` argument, so the spoofed user agent has to
    # be set through Chrome's own command-line flag.
    chrome_options.add_argument(f"--user-agent={random.choice(USER_AGENTS)}")
    return webdriver.Chrome(options=chrome_options)
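

# Optional refinement, not wired in by default: an explicit wait that returns
# as soon as result cards render, instead of the fixed time.sleep(5) in
# scrape_flipkart(). A minimal sketch; it assumes Flipkart's "_1AtVbE" card
# class is still current.
def wait_for_results(driver, timeout=10):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, "_1AtVbE"))
    )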


def scrape_flipkart(url):
    try:
        # First run on a bare machine: install the browser before driving it.
        if not os.path.exists("/usr/bin/google-chrome"):
            install_chrome()
        # Runs outside the conditional so a matching chromedriver is on PATH
        # even when Chrome was already present.
        chromedriver_autoinstaller.install()

        driver = get_driver()
        try:
            driver.get(url)
            time.sleep(5)  # crude wait for the JavaScript-rendered results
            soup = BeautifulSoup(driver.page_source, "html.parser")
        finally:
            driver.quit()  # release the browser even if the page load fails

        products = []
        prices = []
        ratings = []

        # Flipkart's obfuscated class names (_1AtVbE, _4rR01T, _30jeq3,
        # _3LWZlK) change periodically; update them here if the scrape starts
        # returning zero rows.
        items = soup.find_all("div", class_="_1AtVbE")
        for item in items:
            name_tag = item.find("div", class_="_4rR01T")
            name = name_tag.text.strip() if name_tag else "N/A"

            price_tag = item.find("div", class_="_30jeq3")
            price = price_tag.text.strip() if price_tag else "N/A"

            rating_tag = item.find("div", class_="_3LWZlK")
            rating = rating_tag.text.strip() if rating_tag else "N/A"

            # "_1AtVbE" also matches layout wrappers, so keep only rows that
            # actually carry a product name.
            if name != "N/A":
                products.append(name)
                prices.append(price)
                ratings.append(rating)

        df = pd.DataFrame({
            "Product Name": products,
            "Price": prices,
            "Rating": ratings,
        })

        csv_path = "flipkart_laptops.csv"
        df.to_csv(csv_path, index=False, encoding="utf-8")

        return f"Scraped {len(products)} laptops successfully!", csv_path

    except Exception as e:
        return f"Error: {e}", None


with gr.Blocks(title="Flipkart Laptop Scraper") as demo:
    gr.Markdown("# Flipkart Laptop Scraper")
    gr.Markdown("Enter a Flipkart laptop category URL to scrape the listings and download them as a CSV.")

    url_input = gr.Textbox(label="Flipkart URL", placeholder="e.g., https://www.flipkart.com/laptops/pr?sid=6bo,b5g")
    scrape_btn = gr.Button("Scrape Data")
    output_text = gr.Textbox(label="Status")
    output_file = gr.File(label="Download CSV")

    # The scraper returns (status message, CSV path), matching the two outputs.
    scrape_btn.click(
        fn=scrape_flipkart,
        inputs=url_input,
        outputs=[output_text, output_file],
    )

demo.launch()
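
# Hosting note: inside a container you may need to bind all interfaces,
# e.g. demo.launch(server_name="0.0.0.0", server_port=7860).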