File size: 5,684 Bytes
ab36536 f6d39c8 ab36536 f6d39c8 99d7b92 f6d39c8 ab36536 99d7b92 f6d39c8 ab36536 f6d39c8 99d7b92 f6d39c8 ab36536 f6d39c8 ab36536 99d7b92 f6d39c8 fb24c70 f6d39c8 ab36536 26b9192 f6d39c8 99d7b92 ab36536 f6d39c8 99d7b92 ab36536 99d7b92 ab36536 99d7b92 f6d39c8 ab36536 99d7b92 f6d39c8 ab36536 99d7b92 f6d39c8 ab36536 f6d39c8 99d7b92 f6d39c8 99d7b92 ab36536 f6d39c8 ab36536 99d7b92 ab36536 fb24c70 f6d39c8 99d7b92 f6d39c8 99d7b92 ab36536 f6d39c8 99d7b92 ab36536 f6d39c8 99d7b92 f6d39c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import pandas as pd
import os
import logging
logger = logging.getLogger(__name__)
def comprehensive_scroll(driver, pause=3, max_scrolls=None):
    """Scroll to the bottom of the page until no new content is loaded.

    After every scroll the function waits ``pause`` seconds and compares the
    page's ``document.body.scrollHeight`` with the previous value; scrolling
    stops as soon as the height no longer changes.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this module).
        pause: Seconds to wait after each scroll so that lazily loaded
            content has time to appear. Defaults to 3.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) means "no limit", which can loop forever on a page
            that keeps growing; pass an integer to guarantee termination.

    Returns:
        None. Any exception raised while scrolling is logged and swallowed so
        that the caller can still work with whatever has been loaded so far.
    """
    scrolls_done = 0
    try:
        last_height = driver.execute_script("return document.body.scrollHeight")
        while max_scrolls is None or scrolls_done < max_scrolls:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            scrolls_done += 1
            time.sleep(pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Height is stable: nothing more was loaded by the last scroll.
                break
            last_height = new_height
    except Exception as e:
        # Best effort: a scrolling failure must not abort the whole scrape.
        logger.error(f"Scroll sırasında hata: {str(e)}")
def _create_chrome_driver(chrome_options):
    """Start Chrome with the first chromedriver location that works.

    The candidates are tried in order: ``/usr/local/bin/chromedriver``,
    ``/usr/bin/chromedriver`` and finally ``chromedriver`` resolved via PATH.
    A failure of the last candidate propagates to the caller.
    """
    fallback_paths = ('/usr/local/bin/chromedriver', '/usr/bin/chromedriver')
    for driver_path in fallback_paths:
        try:
            return webdriver.Chrome(service=Service(driver_path), options=chrome_options)
        except Exception as e:
            # Not a bare ``except``: KeyboardInterrupt / SystemExit must not
            # be swallowed while probing for a usable driver binary.
            logger.debug(f"ChromeDriver başlatılamadı ({driver_path}): {str(e)}")
    # Last resort: let Selenium look the binary up on PATH.
    return webdriver.Chrome(service=Service('chromedriver'), options=chrome_options)


def _text_or_na(driver, xpath):
    """Return the text of the element at ``xpath``, or "N/A" if it is missing."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return "N/A"


def scrape_reviews(url):
    """Scrape the reviews shown on ``url`` into a DataFrame.

    The page is opened in headless Chrome, the cookie banner is accepted when
    present, the page is scrolled until no new content loads, and every
    review row is read via absolute XPaths.

    Args:
        url: Address of the page whose reviews should be collected.

    Returns:
        A ``pandas.DataFrame`` with the columns "Kullanıcı_id",
        "Kullanıcı Adı", "Yorum", "Tarih" and "Yıldız Sayısı" (one row per
        review). An empty DataFrame is returned when any error occurs; the
        error is logged instead of being raised.
    """
    driver = None
    data_directory = "data"
    temp_file = os.path.join(data_directory, 'temp_comments.csv')
    try:
        # Create the data directory (no error if it already exists).
        os.makedirs(data_directory, exist_ok=True)

        # Chrome options
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument("--window-size=1920,1080")

        # ChromeDriver setup for Linux
        driver = _create_chrome_driver(chrome_options)

        logger.info(f"URL'ye erişiliyor: {url}")
        driver.get(url)

        # Accept the cookie popup if it shows up within 10 seconds.
        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
            ).click()
            logger.info("Çerez popup'ı kabul edildi")
        except TimeoutException:
            logger.warning("Çerez popup'ı bulunamadı veya tıklanamadı")

        logger.info("Sayfa kaydırılıyor...")
        comprehensive_scroll(driver)

        logger.info("Yorumlar toplanıyor...")
        # Absolute XPath matching every review row; "[i]" is appended below
        # to address a single row (XPath positions are 1-based).
        rows_xpath = '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div'
        comment_elements = driver.find_elements(By.XPATH, rows_xpath)
        total_comments = len(comment_elements)
        logger.info(f"Toplam {total_comments} yorum bulundu")

        data = []
        for i in range(1, total_comments + 1):
            row_xpath = f'{rows_xpath}[{i}]'

            username = _text_or_na(driver, f'{row_xpath}/div[1]/div[2]/div[1]')
            comment = _text_or_na(driver, f'{row_xpath}/div[2]/p')
            date = _text_or_na(driver, f'{row_xpath}/div[1]/div[2]/div[2]')

            # The rating is the number of completely filled star elements.
            # find_elements returns an empty list when nothing matches, so a
            # row without stars simply yields 0.
            star_xpath_base = f'{row_xpath}/div[1]/div[1]/div'
            full_stars = driver.find_elements(By.XPATH, f"{star_xpath_base}/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']")
            star_count = len(full_stars)

            data.append({
                "Kullanıcı_id": i,
                "Kullanıcı Adı": username,
                "Yorum": comment,
                "Tarih": date,
                "Yıldız Sayısı": star_count
            })

            if i % 10 == 0:
                logger.info(f"{i}/{total_comments} yorum toplandı")

        df = pd.DataFrame(data)

        # Save as a temporary file; it is removed again in the ``finally``
        # clause below, so it does not outlive this call.
        df.to_csv(temp_file, index=False, encoding='utf-8-sig')
        logger.info(f"Veriler {temp_file} dosyasına kaydedildi")
        return df
    except Exception as e:
        logger.error(f"Veri çekme sırasında hata: {str(e)}")
        return pd.DataFrame()
    finally:
        if driver:
            driver.quit()
            logger.info("Chrome driver kapatıldı")
        # Delete the temporary file
        if os.path.exists(temp_file):
            os.remove(temp_file)
            logger.info("Geçici dosya silindi")