# myfirspace / news.py
import time
import re
from datetime import datetime, timedelta

import cloudscraper
import pandas as pd
import pymorphy2
from bs4 import BeautifulSoup
from tqdm import tqdm
from transformers import pipeline

tqdm.pandas()  # registers .progress_apply() on pandas objects

class NewsData:
    def __init__(self) -> None:
        """
        Scraper for news articles from ru.investing.com.
        """
        self.urls = [
            ("https://ru.investing.com/news/forex-news/", "forex-news"),
            ("https://ru.investing.com/news/commodities-news/", "commodities-news"),
        ]
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://www.google.com/'
        }
        # Hard-coded HTTP proxy used for all requests.
        self.proxies = {
            "http": "http://82.146.37.145:80",
            "https": "https://82.146.37.145:80"
        }
        # pymorphy2 reduces each word to its normal (dictionary) form.
        self.morph = pymorphy2.MorphAnalyzer()
        # cloudscraper handles Cloudflare's anti-bot challenge pages.
        self.scraper = cloudscraper.create_scraper()
    def get_data(self):
        # Scrape article listings published within the last 7 days.
        date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7)
        articles_data = []
        for base_url, tag in self.urls:
            page = 1
            reached_limit = False
            while not reached_limit:
                url = f"{base_url}{page}/"
                # cloudscraper is used instead of plain requests.get() to pass Cloudflare checks.
                response = self.scraper.get(url, headers=self.headers, proxies=self.proxies)
                print(response)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    articles = soup.find_all("a", attrs={"data-test": "article-title-link"})
                    for article in articles:
                        title = article.text.strip()
                        date_tag = article.find_next("time", attrs={"data-test": "article-publish-date"})
                        article_url = article["href"]
                        if date_tag:
                            publish_date_str = date_tag["datetime"]
                            publish_date = datetime.strptime(publish_date_str, "%Y-%m-%d %H:%M:%S")
                            articles_data.append([title, publish_date_str, tag, article_url])
                            if publish_date < date_limit:
                                print(f"Publish date {publish_date_str} is older than the limit. Stopping.")
                                print(f"Total number of downloaded articles: {len(articles_data)}")
                                reached_limit = True
                                break
                    page += 1
                else:
                    if response.status_code == 404:
                        break
                    print(f"Error requesting page {page}, response code: {response.status_code}")
                time.sleep(5)
        return self.get_proccesing_news(articles_data)
    def clean_text(self, text):
        # Keep only Cyrillic characters and whitespace, then lowercase.
        text = re.sub(r'[^а-яА-Я\s]', '', text)
        text = text.lower()
        # Drop words shorter than 3 characters, then lemmatize the remaining words.
        text = ' '.join([word for word in text.split() if len(word) >= 3])
        text = ' '.join([self.morph.parse(word)[0].normal_form for word in text.split()])
        return text
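
    # Illustrative example (added note; exact output depends on the installed
    # pymorphy2 dictionaries): clean_text("Доллар вырос на 2% к рублю!") strips
    # the digits/punctuation and short words and returns roughly
    # "доллар вырасти рубль".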
    def get_proccesing_news(self, articles_data):
        # Post-process the scraped articles and add sentiment labels.
        sentiment_model = pipeline("sentiment-analysis", model="mxlcw/rubert-tiny2-russian-financial-sentiment", framework="pt")
        news = pd.DataFrame(articles_data, columns=["title", "DATE", "tags", "url"])
        news['title'] = news['title'].ffill().map(self.clean_text)
        # The pipeline returns a list like [{'label': ..., 'score': ...}]; keep only the label.
        # .progress_apply comes from tqdm.pandas(), registered at import time.
        news['sentiment'] = news['title'].progress_apply(lambda x: sentiment_model(x)[0]['label'])
        # One-hot encode the sentiment label and expand the publish date into calendar features.
        news = pd.get_dummies(news, columns=['sentiment'])
        news['DATE'] = pd.to_datetime(news['DATE'])
        news['day'] = news['DATE'].dt.day
        news['year'] = news['DATE'].dt.year
        news['month'] = news['DATE'].dt.month
        news['hour'] = news['DATE'].dt.hour
        return news
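

# Minimal usage sketch (not part of the original file). It assumes network
# access to ru.investing.com, that the hard-coded proxy above still answers,
# and that the Hugging Face model
# "mxlcw/rubert-tiny2-russian-financial-sentiment" can be downloaded.
if __name__ == "__main__":
    news_parser = NewsData()          # hypothetical instance name
    news_df = news_parser.get_data()  # scrape, clean and score the last ~7 days of headlines
    print(news_df.head())
    print(news_df.columns.tolist())   # title, DATE, tags, url, sentiment_* dummies, day/year/month/hour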