import gradio as gr
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import random
import time
import os
import subprocess
import chromedriver_autoinstaller

# User agents rotated on each driver launch (see get_driver) to reduce bot detection
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"
]

# Install Chrome and dependencies dynamically
def install_chrome():
    try:
        # Update package list and install Chrome dependencies
        subprocess.run("apt-get update", shell=True, check=True)
        subprocess.run(
            "apt-get install -y libxss1 libappindicator1 libindicator7 fonts-liberation libnss3 xdg-utils unzip", 
            shell=True, check=True
        )
        # Download and install Chrome
        subprocess.run(
            "wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb", 
            shell=True, check=True
        )
        subprocess.run("dpkg -i google-chrome-stable_current_amd64.deb || apt-get install -f -y", shell=True, check=True)
        # Install ChromeDriver
        chromedriver_autoinstaller.install()
    except Exception as e:
        raise Exception(f"Failed to install Chrome: {str(e)}")
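
# Note: install_chrome() assumes a Debian/Ubuntu environment with root access
# (it shells out to apt-get and dpkg), such as a hosted container. On other
# platforms, install Chrome and ChromeDriver manually; the
# /usr/bin/google-chrome check in scrape_flipkart() then skips this step.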

# Function to initialize a headless Selenium Chrome driver
def get_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run without a visible browser window
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # Selenium takes no headers dict the way requests does; the user agent
    # must be passed as a Chrome launch argument, so the rotation happens here
    chrome_options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

# Function to scrape Flipkart laptop data
def scrape_flipkart(url):
    try:
        # Ensure Chrome is installed
        if not os.path.exists("/usr/bin/google-chrome"):
            install_chrome()
        
        # Set up the Selenium driver (the user agent is rotated inside get_driver)
        driver = get_driver()
        try:
            # Load the page and give client-side JavaScript time to render
            driver.get(url)
            time.sleep(5)

            # Snapshot the rendered HTML and parse it with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")
        finally:
            driver.quit()  # Always release the browser, even if the page load fails
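
        # A sturdier alternative to the fixed sleep above is an explicit wait,
        # used inside the try block in place of time.sleep(5); a minimal
        # sketch, assuming Flipkart's "_1AtVbE" container class is current:
        #
        #   from selenium.webdriver.common.by import By
        #   from selenium.webdriver.support.ui import WebDriverWait
        #   from selenium.webdriver.support import expected_conditions as EC
        #   WebDriverWait(driver, 10).until(
        #       EC.presence_of_element_located((By.CSS_SELECTOR, "div._1AtVbE"))
        #   )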
        
        # Lists to store scraped data
        products = []
        prices = []
        ratings = []
        
        # Find laptop cards; Flipkart's class names are minified and change
        # periodically, so update these selectors if scraping returns no items
        items = soup.find_all("div", class_="_1AtVbE")  # Parent div for each product
        for item in items:
            # Product name
            name_tag = item.find("div", class_="_4rR01T")
            name = name_tag.text.strip() if name_tag else "N/A"
            
            # Price
            price_tag = item.find("div", class_="_30jeq3")
            price = price_tag.text.strip() if price_tag else "N/A"
            
            # Rating
            rating_tag = item.find("div", class_="_3LWZlK")
            rating = rating_tag.text.strip() if rating_tag else "N/A"
            
            if name != "N/A":  # Only append valid entries
                products.append(name)
                prices.append(price)
                ratings.append(rating)
        
        # Create DataFrame
        df = pd.DataFrame({
            "Product Name": products,
            "Price": prices,
            "Rating": ratings
        })
        
        # Save to CSV
        csv_path = "flipkart_laptops.csv"
        df.to_csv(csv_path, index=False, encoding="utf-8")
        
        return f"Scraped {len(products)} laptops successfully!", csv_path
    
    except Exception as e:
        return f"Error: {str(e)}", None

# Gradio interface
with gr.Blocks(title="Flipkart Laptop Scraper") as demo:
    gr.Markdown("# Flipkart Laptop Scraper")
    gr.Markdown("Enter a Flipkart laptop category URL to scrape data and download as CSV.")
    
    url_input = gr.Textbox(label="Flipkart URL", placeholder="e.g., https://www.flipkart.com/laptops/pr?sid=6bo,b5g")
    scrape_btn = gr.Button("Scrape Data")
    output_text = gr.Textbox(label="Status")
    output_file = gr.File(label="Download CSV")
    
    scrape_btn.click(
        fn=scrape_flipkart,
        inputs=url_input,
        outputs=[output_text, output_file]
    )

demo.launch()
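
# A minimal usage sketch, assuming the script is run locally (Gradio serves
# on port 7860 by default):
#   $ python app.py
#   # open http://127.0.0.1:7860, paste a Flipkart laptop category URL,
#   # click "Scrape Data", then download flipkart_laptops.csv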