import requests
import pandas as pd
import time
import pymorphy2
import re
import cloudscraper
from datetime import datetime, timedelta
from transformers import pipeline
from bs4 import BeautifulSoup
from tqdm import tqdm  # needed for pandas progress_apply below
######
class NewsData:
    def __init__(self) -> None:
        """
        Article parser for ru.investing.com.
        """
        self.urls = [
            ("https://ru.investing.com/news/forex-news/", "forex-news"),
            ("https://ru.investing.com/news/commodities-news/", "commodities-news"),
        ]
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://www.google.com/'
        }
        self.proxies = {
            "http": "http://82.146.37.145:80",
            "https": "https://82.146.37.145:80"
        }
        self.morph = pymorphy2.MorphAnalyzer()  # used to reduce words to their normal (dictionary) form
        self.scraper = cloudscraper.create_scraper()
    def get_data(self):
        # Scrape the listing pages for news published within the last 7 days.
        date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7)
        articles_data = []
        for base_url, tag in self.urls:
            page = 1
            while True:
                url = f"{base_url}{page}/"
                # response = requests.get(url, headers=self.headers)
                response = self.scraper.get(url, headers=self.headers, proxies=self.proxies)
                print(response)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    articles = soup.find_all("a", attrs={"data-test": "article-title-link"})
                    daily_count = 0
                    for article in articles:
                        title = article.text.strip()
                        date_tag = article.find_next("time", attrs={"data-test": "article-publish-date"})
                        url = article["href"]
                        if date_tag:
                            publish_date_str = date_tag["datetime"]
                            publish_date = datetime.strptime(publish_date_str, "%Y-%m-%d %H:%M:%S")
                            articles_data.append([title, publish_date_str, tag, url])
                            daily_count += 1
                    page += 1
                    # Stop paging once the last parsed article is older than the limit.
                    if publish_date < date_limit:
                        print(f"Publish date {publish_date_str} is older than the limit. Stopping.")
                        print(f"Total number of scraped articles: {len(articles_data)}")
                        break
                else:
                    if response.status_code == 404:
                        break
                    print(f"Error requesting page {page}, status code: {response.status_code}")
                time.sleep(5)
        return self.get_proccesing_news(articles_data)
    def clean_text(self, text):
        # Keep only Cyrillic letters and whitespace, lowercase the text,
        # drop words shorter than 3 letters, and lemmatize the rest with pymorphy2.
        text = re.sub(r'[^а-яА-Я\s]', '', text)
        text = text.lower()
        text = ' '.join([word for word in text.split() if len(word) >= 3])
        text = ' '.join([self.morph.parse(word)[0].normal_form for word in text.split()])
        return text
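    # Illustrative sketch of what clean_text produces (an assumption based on
    # pymorphy2's default dictionaries, not part of the original file):
    #   clean_text("Доллар вырос на 1,5% к рублю")
    #   -> digits, punctuation and words shorter than 3 letters are dropped and the
    #      remaining words are lemmatized, giving roughly "доллар вырасти рубль"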
    def get_proccesing_news(self, articles_data):
        # Post-process the scraped articles and add sentiment analysis.
        sentiment_model = pipeline("sentiment-analysis", model="mxlcw/rubert-tiny2-russian-financial-sentiment", framework="pt")
        tqdm.pandas()  # register progress_apply on pandas objects
        news = pd.DataFrame(articles_data, columns=["title", "DATE", "tags", 'url'])
        news['title'] = news['title'].ffill().map(self.clean_text)
        news['sentiment'] = news['title'].progress_apply(lambda x: sentiment_model(x)[0]['label'])
        news = pd.get_dummies(news, columns=['sentiment'])
        news['DATE'] = pd.to_datetime(news['DATE'])
        news['day'] = news['DATE'].dt.day
        news['year'] = news['DATE'].dt.year
        news['month'] = news['DATE'].dt.month
        news['hour'] = news['DATE'].dt.hour
        return news
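
# Minimal usage sketch (an assumption, not part of the original class): run as a
# standalone script, provided the proxy, ru.investing.com and the Hugging Face
# model above are reachable. The output file name is hypothetical.
if __name__ == "__main__":
    parser = NewsData()
    news_df = parser.get_data()  # scrape, clean and sentiment-score the last 7 days of news
    print(news_df.head())
    news_df.to_csv("investing_news.csv", index=False)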