Spaces:
Runtime error
Runtime error
max-unfinity
committed on
Commit
·
4c404f5
1
Parent(s):
ea2c254
add files
Browse files
- app.py +25 -0
- init_env.py +18 -0
- requirements.txt +3 -0
- selenium_parser.py +85 -0
app.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from init_env import init_env
|
2 |
+
from selenium_parser import load_driver
|
3 |
+
import streamlit as st
|
4 |
+
|
5 |
+
|
6 |
+
@st.cache_resource
def init():
    """Run the one-time environment setup for the app.

    Delegates to ``init_env()``. The call is wrapped in
    ``st.cache_resource`` (instead of the deprecated ``st.cache``) so the
    setup is executed once and then skipped on subsequent Streamlit script
    reruns, matching how ``get_driver`` is cached in this file.
    """
    init_env()
|
9 |
+
|
10 |
+
|
11 |
+
@st.cache_resource
def get_driver():
    """Return the browser driver built by ``load_driver()``.

    Wrapped in ``st.cache_resource`` so the same driver object is handed
    back on later calls instead of being created again.
    """
    shared_driver = load_driver()
    return shared_driver
|
14 |
+
|
15 |
+
|
16 |
+
def run():
    """Button callback: open the hotel reviews page and show a screenshot.

    Uses the module-level ``driver``; reports progress with ``st.write``
    and renders the PNG screenshot with ``st.image``.
    """
    target_url = "https://www.booking.com/hotel/th/queen-boutique.ru.html#tab-reviews"
    driver.get(target_url)
    st.write("page loaded")

    # Grab the current viewport as PNG bytes and display it in the app.
    screenshot_png = driver.get_screenshot_as_png()
    st.image(screenshot_png, caption="screenshot")
|
20 |
+
|
21 |
+
|
22 |
+
# Script entry: prepare the environment first, then obtain the driver.
init()

# Module-level driver; run() reads this name when the button is clicked.
driver = get_driver()

# Clicking the button invokes run() as its on_click callback.
st.button(label="Run", on_click=run)
|
init_env.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import tarfile
|
3 |
+
import requests
|
4 |
+
|
5 |
+
|
6 |
+
def init_env():
    """Make geckodriver available on ``PATH`` for the current process.

    Downloads the pinned geckodriver v0.34.0 linux64 archive into the
    current working directory, unpacks it there, and appends that directory
    to the ``PATH`` environment variable. The download is skipped when a
    ``geckodriver`` file is already present, and ``PATH`` is only extended
    if the directory is not listed yet, so repeated calls are cheap.

    Raises:
        requests.RequestException: if the download fails, times out, or
            the server answers with an HTTP error status.
        tarfile.TarError: if the downloaded archive cannot be read.
    """
    archive_name = 'geckodriver-v0.34.0-linux64.tar.gz'
    archive_url = 'https://github.com/mozilla/geckodriver/releases/download/v0.34.0/' + archive_name
    workdir = os.getcwd()

    # NOTE(review): assumes the archive unpacks to a file named "geckodriver"
    # in the working directory - confirm. If it does not, this check is simply
    # never true and the archive is fetched on every call, as before.
    if not os.path.isfile(os.path.join(workdir, 'geckodriver')):
        # download geckodriver
        print("Downloading geckodriver...")
        r = requests.get(archive_url, timeout=60)
        # Fail here on an HTTP error instead of writing an error page to disk
        # and failing later with an obscure tarfile error.
        r.raise_for_status()
        with open(archive_name, 'wb') as f:
            f.write(r.content)

        # extract geckodriver; the context manager closes the archive even
        # when extraction raises
        with tarfile.open(archive_name) as tar:
            tar.extractall()

    # add geckodriver to PATH variable (once; tolerate an unset PATH)
    current_path = os.environ.get("PATH", "")
    if workdir not in current_path.split(os.pathsep):
        os.environ["PATH"] = current_path + os.pathsep + workdir if current_path else workdir
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
selenium==4.17.2
|
2 |
+
beautifulsoup4==4.11.1
|
3 |
+
streamlit # 1.30.0
|
selenium_parser.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from selenium import webdriver
|
2 |
+
from selenium.webdriver import FirefoxOptions
|
3 |
+
from selenium.webdriver.common.keys import Keys
|
4 |
+
from selenium.webdriver.common.by import By
|
5 |
+
from selenium.webdriver.support.wait import WebDriverWait
|
6 |
+
from selenium.webdriver.support import expected_conditions as EC
|
7 |
+
from bs4 import BeautifulSoup
|
8 |
+
import time
|
9 |
+
|
10 |
+
|
11 |
+
def load_driver():
    """Create and return a Firefox WebDriver started with ``--headless``."""
    print("Loading driver...")

    firefox_options = FirefoxOptions()
    firefox_options.add_argument("--headless")

    return webdriver.Firefox(options=firefox_options)
|
17 |
+
|
18 |
+
|
19 |
+
def parse_review(html):
    """Extract review texts, room name and stay date from one review block.

    Args:
        html: HTML markup of a single review element.

    Returns:
        A dict with the keys ``"positive"``, ``"negative"``, ``"room"`` and
        ``"time"``. Each value is a string, or ``None`` when the matching
        markup is not present in ``html``.

    Raises:
        ValueError: if a review row carries a label other than the two
            known ones.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Review text
    positive_review = None
    negative_review = None
    for row in soup.find_all("div", class_="c-review__row"):
        # Rows that contain a translation loader are ignored.
        if row.find("span", class_="c-review__translation-loader"):
            continue

        label_tag = row.find("span", class_="bui-u-sr-only")
        body_tag = row.find("span", class_='c-review__body')
        if label_tag is None or body_tag is None:
            # Without both a label and a body there is nothing to classify;
            # skip the row instead of failing with AttributeError.
            continue

        delimiter = label_tag.text.strip()
        review_text = body_tag.text.strip()
        # The labels are matched against the Russian page text
        # ("Liked" / "Disliked").
        if delimiter == "Понравилось":
            positive_review = review_text
        elif delimiter == "Не понравилось":
            negative_review = review_text
        else:
            raise ValueError(f"Unexpected review row label: {delimiter!r}")

    # Room name
    room_info = soup.find('div', class_='c-review-block__room-info-row')
    room_body = room_info.find('div', class_='bui-list__body') if room_info is not None else None
    room_name = room_body.get_text(strip=True) if room_body is not None else None

    # NOTE: the review's own date (span.c-review-block__date) is not extracted.

    # Number of nights + date
    stay_date_info = soup.find('ul', class_='c-review-block__stay-date')
    if stay_date_info is not None:
        date_info = stay_date_info.get_text(strip=True).replace(" ·", ", ")
    else:
        date_info = None

    return {
        "positive": positive_review,
        "negative": negative_review,
        "room": room_name,
        "time": date_info,
    }
|
54 |
+
|
55 |
+
|
56 |
+
def scrape_page(driver: webdriver.Firefox, url: str, page_count: int = 5, wait_time: int = 1):
    """Collect parsed reviews from up to ``page_count`` pages of a review list.

    Args:
        driver: Firefox WebDriver used to load and navigate the page.
        url: Address of the page to open.
        page_count: Maximum number of review pages to read.
        wait_time: Seconds to sleep on each page after the review blocks
            have been detected, before reading them.

    Returns:
        A list with one ``parse_review`` result per review element, in the
        order the pages were visited. Scraping stops early, keeping what was
        collected, when no "next page" element is found.

    Raises:
        selenium.common.exceptions.TimeoutException: if no review block
            appears within 10 seconds on a page.
    """
    # Local import keeps this block self-contained; WebDriverException is the
    # base class of Selenium's driver-side errors.
    from selenium.common.exceptions import WebDriverException

    review_infos = []

    driver.get(url)
    print("page loaded")

    for i in range(page_count):
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "review_list_new_item_block")))
        time.sleep(wait_time)

        # Remove cookie banner. Best effort: the script fails when the banner
        # is absent, which is fine, so only Selenium errors are swallowed
        # (a bare except would also hide KeyboardInterrupt/SystemExit).
        try:
            driver.execute_script("return document.getElementById('onetrust-banner-sdk').remove();")
        except WebDriverException:
            pass

        elems = driver.find_elements(By.CLASS_NAME, "review_list_new_item_block")

        for elem in elems:
            html = elem.get_attribute('outerHTML')
            review_infos.append(parse_review(html))

        print(f"Done page {i+1} of {page_count}")

        # Nothing more to load after the last requested page.
        if i + 1 >= page_count:
            break

        # find_elements returns an empty list instead of raising, so a
        # missing "next" link ends the loop without losing collected reviews.
        next_links = driver.find_elements(By.CLASS_NAME, "pagenext")
        if not next_links:
            break
        next_links[0].click()

    return review_infos
|