max-unfinity committed on
Commit
4c404f5
·
1 Parent(s): ea2c254
Files changed (4) hide show
  1. app.py +25 -0
  2. init_env.py +18 -0
  3. requirements.txt +3 -0
  4. selenium_parser.py +85 -0
app.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from init_env import init_env
2
+ from selenium_parser import load_driver
3
+ import streamlit as st
4
+
5
+
6
@st.cache_resource
def init():
    """Run the environment setup (``init_env``) once per server process.

    ``st.cache_resource`` replaces the deprecated ``st.cache``: the setup is a
    process-wide side effect rather than serialisable data, so it should be
    executed once and shared by every rerun and session.
    """
    init_env()
9
+
10
+
11
@st.cache_resource
def get_driver():
    """Return the Selenium driver built by ``load_driver``.

    The function is wrapped in ``st.cache_resource``.
    """
    browser = load_driver()
    return browser
14
+
15
+
16
def run():
    """Button callback: open the reviews page and display a screenshot of it.

    Relies on the module-level ``driver``.
    """
    reviews_url = "https://www.booking.com/hotel/th/queen-boutique.ru.html#tab-reviews"
    driver.get(reviews_url)
    st.write("page loaded")

    screenshot_png = driver.get_screenshot_as_png()
    st.image(screenshot_png, caption="screenshot")
20
+
21
+
22
# Prepare the environment before any browser is started.
init()

# Module-level driver; the `run` callback reads this global.
driver = get_driver()

# Clicking the button invokes `run`.
st.button("Run", on_click=run)
init_env.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tarfile
3
+ import requests
4
+
5
+
6
def init_env():
    """Download geckodriver v0.34.0 (linux64), unpack it and expose it on PATH.

    The archive is saved to and extracted in the current working directory,
    and that directory is then appended to the ``PATH`` environment variable
    (only if it is not already listed).

    Raises:
        requests.HTTPError: if the download returns an HTTP error status.
        requests.RequestException: on connection problems or timeout.
        tarfile.TarError: if the downloaded archive cannot be read.
    """
    archive_name = 'geckodriver-v0.34.0-linux64.tar.gz'
    url = 'https://github.com/mozilla/geckodriver/releases/download/v0.34.0/' + archive_name

    # download geckodriver
    print("Downloading geckodriver...")
    r = requests.get(url, timeout=60)
    # Fail loudly instead of writing an HTML error page to disk as a "tarball".
    r.raise_for_status()
    with open(archive_name, 'wb') as f:
        f.write(r.content)

    # extract geckodriver; the context manager closes the archive even if
    # extraction fails
    with tarfile.open(archive_name) as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters exist on this interpreter: refuse absolute
            # paths, path traversal and special files.
            tar.extractall(filter="data")
        else:
            tar.extractall()

    # add geckodriver to PATH variable (without duplicating the entry when
    # the function is called more than once)
    cwd = os.getcwd()
    current_path = os.environ.get("PATH", "")
    if cwd not in current_path.split(os.pathsep):
        os.environ["PATH"] = current_path + os.pathsep + cwd if current_path else cwd
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ selenium==4.17.2
2
+ beautifulsoup4==4.11.1
3
+ streamlit # 1.30.0
selenium_parser.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from selenium import webdriver
2
+ from selenium.webdriver import FirefoxOptions
3
+ from selenium.webdriver.common.keys import Keys
4
+ from selenium.webdriver.common.by import By
5
+ from selenium.webdriver.support.wait import WebDriverWait
6
+ from selenium.webdriver.support import expected_conditions as EC
7
+ from bs4 import BeautifulSoup
8
+ import time
9
+
10
+
11
def load_driver():
    """Create a headless Firefox WebDriver and return it."""
    print("Loading driver...")

    firefox_options = FirefoxOptions()
    firefox_options.add_argument("--headless")

    return webdriver.Firefox(options=firefox_options)
17
+
18
+
19
def parse_review(html):
    """Extract review texts, room name and stay date from one review block.

    Args:
        html: HTML markup of a single review element.

    Returns:
        A dict with the keys ``"positive"``, ``"negative"``, ``"room"`` and
        ``"time"``. Any value is ``None`` when the corresponding element is
        absent from the markup.

    Raises:
        ValueError: if a review row is labelled with something other than
            "Понравилось" or "Не понравилось".
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Review text
    positive_review = None
    negative_review = None
    for row in soup.find_all("div", class_="c-review__row"):
        # Rows containing the translation loader are skipped.
        if row.find("span", class_="c-review__translation-loader"):
            continue
        label = row.find("span", class_="bui-u-sr-only")
        body = row.find("span", class_='c-review__body')
        if label is None or body is None:
            # Row without a label or a body cannot be classified; skip it
            # rather than crash on a missing element.
            continue
        delimiter = label.text.strip()
        review_text = body.text.strip()
        if delimiter == "Понравилось":
            positive_review = review_text
        elif delimiter == "Не понравилось":
            negative_review = review_text
        else:
            raise ValueError(f"Unexpected review row label: {delimiter!r}")

    # Room name
    room_info = soup.find('div', class_='c-review-block__room-info-row')
    room_body = room_info.find('div', class_='bui-list__body') if room_info is not None else None
    room_name = room_body.get_text(strip=True) if room_body is not None else None

    # The review's own date (span.c-review-block__date) is not extracted.

    # Number of nights + date; guarded the same way as the room info so a
    # review without a stay-date list yields None instead of AttributeError.
    stay_date_info = soup.find('ul', class_='c-review-block__stay-date')
    if stay_date_info is not None:
        date_info = stay_date_info.get_text(strip=True).replace(" ·", ", ")
    else:
        date_info = None

    return {
        "positive": positive_review,
        "negative": negative_review,
        "room": room_name,
        "time": date_info,
    }
54
+
55
+
56
def scrape_page(driver: webdriver.Firefox, url: str, page_count: int = 5, wait_time: int = 1):
    """Collect parsed reviews from up to ``page_count`` pages of a review list.

    Args:
        driver: Firefox WebDriver used to load and navigate the page.
        url: Address of the page to open.
        page_count: Maximum number of review pages to read.
        wait_time: Seconds to sleep after the review list is detected on
            each page, before the reviews are read.

    Returns:
        A list with one ``parse_review`` result per review element found.
        If no "next page" control is present, scraping stops early and the
        reviews gathered so far are returned.

    Raises:
        selenium.common.exceptions.TimeoutException: if no review element
            appears within 10 seconds on a page.
    """
    # Local import: keeps this block self-contained without touching the
    # module's import section.
    from selenium.common.exceptions import WebDriverException

    review_infos = []

    driver.get(url)
    print("page loaded")

    for i in range(page_count):
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "review_list_new_item_block"))
        )
        time.sleep(wait_time)

        # Remove cookie banner (best effort: the script throws when the
        # banner is not in the DOM, which is not an error for us).
        try:
            driver.execute_script("return document.getElementById('onetrust-banner-sdk').remove();")
        except WebDriverException:
            pass

        for elem in driver.find_elements(By.CLASS_NAME, "review_list_new_item_block"):
            html = elem.get_attribute('outerHTML')
            review_infos.append(parse_review(html))

        print(f"Done page {i+1} of {page_count}")

        # Nothing to navigate to after the last requested page.
        if i + 1 >= page_count:
            break

        # find_elements returns an empty list instead of raising, so a
        # missing "next" link ends the loop without losing collected data.
        next_links = driver.find_elements(By.CLASS_NAME, "pagenext")
        if not next_links:
            break
        next_links[0].click()

    return review_infos