Spaces:
Sleeping
Sleeping
Update news.py
Browse files
news.py
CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
|
|
3 |
import time
|
4 |
import pymorphy2
|
5 |
import re
|
|
|
6 |
|
7 |
from datetime import datetime, timedelta
|
8 |
from transformers import pipeline
|
@@ -26,7 +27,8 @@ class NewsData:
|
|
26 |
}
|
27 |
|
28 |
self.morph = pymorphy2.MorphAnalyzer() # С помощью него мы будем приводить слова в начальную форму
|
29 |
-
|
|
|
30 |
def get_data(self):
|
31 |
# Парсим сайты за последние 7 дней
|
32 |
date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7*1)
|
@@ -36,10 +38,11 @@ class NewsData:
|
|
36 |
page = 1
|
37 |
while True:
|
38 |
url = f"{base_url}{page}/"
|
39 |
-
response = requests.get(url, headers=self.headers)
|
40 |
-
|
|
|
41 |
if response.status_code == 200:
|
42 |
-
soup = BeautifulSoup(response.text, "html.parser")
|
43 |
articles = soup.find_all("a", attrs={"data-test": "article-title-link"})
|
44 |
|
45 |
daily_count = 0
|
|
|
3 |
import time
|
4 |
import pymorphy2
|
5 |
import re
|
6 |
+
import cloudscraper
|
7 |
|
8 |
from datetime import datetime, timedelta
|
9 |
from transformers import pipeline
|
|
|
27 |
}
|
28 |
|
29 |
self.morph = pymorphy2.MorphAnalyzer() # С помощью него мы будем приводить слова в начальную форму
|
30 |
+
self.scraper = cloudscraper.create_scraper()
|
31 |
+
|
32 |
def get_data(self):
|
33 |
# Парсим сайты за последние 7 дней
|
34 |
date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7*1)
|
|
|
38 |
page = 1
|
39 |
while True:
|
40 |
url = f"{base_url}{page}/"
|
41 |
+
#response = requests.get(url, headers=self.headers)
|
42 |
+
response = self.scraper.get(url, headers=self.headers)
|
43 |
+
|
44 |
if response.status_code == 200:
|
45 |
+
soup = BeautifulSoup(response.text, "html.parser")
|
46 |
articles = soup.find_all("a", attrs={"data-test": "article-title-link"})
|
47 |
|
48 |
daily_count = 0
|