import sys

import requests
from bs4 import BeautifulSoup
import pandas as pd

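# Browser-like request headers; a realistic User-Agent makes it less likely
# that Amazon answers with a CAPTCHA or block page instead of the review markup.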
custom_headers = {
    "Accept-Language": "en-GB,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
}


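# Fetch a URL and return a parsed BeautifulSoup document, exiting on a non-200 response.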
def get_soup(url):
    response = requests.get(url, headers=custom_headers)

    if response.status_code != 200:
        print("Error fetching webpage")
        print(f"Error: {response.status_code} - {response.reason}")
        sys.exit(1)

    soup = BeautifulSoup(response.text, "lxml")
    return soup


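# Extract the author, rating, title, content, date, verified badge, and image URL
# from every review block on the page, tolerating missing fields.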
def get_reviews(soup):
    review_elements = soup.select("div.review")

    scraped_reviews = []

    for review in review_elements:
        r_author_element = review.select_one("span.a-profile-name")
        r_author = r_author_element.text if r_author_element else None

        r_rating_element = review.select_one("i.review-rating")
        r_rating = r_rating_element.text.replace("out of 5 stars", "").strip() if r_rating_element else None

        r_title_element = review.select_one("a.review-title")
        r_title_span_element = r_title_element.select_one("span:not([class])") if r_title_element else None
        r_title = r_title_span_element.text if r_title_span_element else None

        r_content_element = review.select_one("span.review-text")
        r_content = r_content_element.text if r_content_element else None

        r_date_element = review.select_one("span.review-date")
        r_date = r_date_element.text if r_date_element else None

        r_verified_element = review.select_one("span.a-size-mini")
        r_verified = r_verified_element.text if r_verified_element else None

        r_image_element = review.select_one("img.review-image-tile")
        r_image = r_image_element.attrs["src"] if r_image_element else None

        r = {
            "author": r_author,
            "rating": r_rating,
            "title": r_title,
            "content": r_content,
            "date": r_date,
            "verified": r_verified,
            "image_url": r_image,
        }

        scraped_reviews.append(r)

    return scraped_reviews


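# Walk through the paginated review listing via the pageNumber query parameter,
# stopping at the first page that yields no reviews.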
def scrape_all_pages(url):
    all_reviews = []

    page_number = 1
    while True:
        soup = get_soup(f"{url}&pageNumber={page_number}")
        reviews = get_reviews(soup)

        if not reviews:
            break

        all_reviews.extend(reviews)
        page_number += 1

    return all_reviews


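# Example entry point: a minimal sketch (not part of the listing above) showing how
# the pieces fit together. The URL is a hypothetical placeholder; substitute a real
# product-reviews URL. Note it must already contain a query string, because
# scrape_all_pages appends "&pageNumber=" rather than "?pageNumber=".
if __name__ == "__main__":
    reviews_url = "https://www.amazon.com/product-reviews/PRODUCT_ASIN/?reviewerType=all_reviews"
    reviews = scrape_all_pages(reviews_url)

    # pandas (imported above) turns the list of review dicts into a table for export.
    df = pd.DataFrame(reviews)
    df.to_csv("reviews.csv", index=False)
    print(f"Saved {len(df)} reviews to reviews.csv")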