KotVasily committed on
Commit
a4742d0
·
verified ·
1 Parent(s): a4e656f

Update news.py

Browse files
Files changed (1) hide show
  1. news.py +7 -4
news.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
3
  import time
4
  import pymorphy2
5
  import re
 
6
 
7
  from datetime import datetime, timedelta
8
  from transformers import pipeline
@@ -26,7 +27,8 @@ class NewsData:
26
  }
27
 
28
  self.morph = pymorphy2.MorphAnalyzer() # С помощь него мы будем приводить слова в начальную форму
29
-
 
30
  def get_data(self):
31
  # Парсим сайты за последнии 7 дней
32
  date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7*1)
@@ -36,10 +38,11 @@ class NewsData:
36
  page = 1
37
  while True:
38
  url = f"{base_url}{page}/"
39
- response = requests.get(url, headers=self.headers)
40
-
 
41
  if response.status_code == 200:
42
- soup = BeautifulSoup(response.content, "html.parser")
43
  articles = soup.find_all("a", attrs={"data-test": "article-title-link"})
44
 
45
  daily_count = 0
 
3
  import time
4
  import pymorphy2
5
  import re
6
+ import cloudscraper
7
 
8
  from datetime import datetime, timedelta
9
  from transformers import pipeline
 
27
  }
28
 
29
  self.morph = pymorphy2.MorphAnalyzer() # С помощь него мы будем приводить слова в начальную форму
30
+ self.scraper = cloudscraper.create_scraper()
31
+
32
  def get_data(self):
33
  # Парсим сайты за последнии 7 дней
34
  date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7*1)
 
38
  page = 1
39
  while True:
40
  url = f"{base_url}{page}/"
41
+ #response = requests.get(url, headers=self.headers)
42
+ responce = scraper.get(url, headers=self.headers)
43
+
44
  if response.status_code == 200:
45
+ soup = BeautifulSoup(response.text, "html.parser")
46
  articles = soup.find_all("a", attrs={"data-test": "article-title-link"})
47
 
48
  daily_count = 0