import requests
import pandas as pd
import time
import pymorphy2
import re
import cloudscraper
from datetime import datetime, timedelta
from tqdm import tqdm
from transformers import pipeline
from bs4 import BeautifulSoup
######
class NewsData:
    def __init__(self) -> None:
        """
        Parser for news articles from ru.investing.com.
        """
        self.urls = [
            ("https://ru.investing.com/news/forex-news/", "forex-news"),
            ("https://ru.investing.com/news/commodities-news/", "commodities-news"),
        ]
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://www.google.com/'
        }
        self.proxies = {
            "http": "http://82.146.37.145:80",
            "https": "https://82.146.37.145:80"
        }
        self.morph = pymorphy2.MorphAnalyzer()  # Used to reduce words to their normal (dictionary) form
        self.scraper = cloudscraper.create_scraper()

    def get_data(self):
        # Scrape the sites for the last 7 days
        date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7)
        articles_data = []
        for base_url, tag in self.urls:
            page = 1
            limit_reached = False
            while not limit_reached:
                url = f"{base_url}{page}/"
                # response = requests.get(url, headers=self.headers)
                response = self.scraper.get(url, headers=self.headers, proxies=self.proxies)
                print(response)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    articles = soup.find_all("a", attrs={"data-test": "article-title-link"})
                    for article in articles:
                        title = article.text.strip()
                        date_tag = article.find_next("time", attrs={"data-test": "article-publish-date"})
                        article_url = article["href"]
                        if date_tag:
                            publish_date_str = date_tag["datetime"]
                            publish_date = datetime.strptime(publish_date_str, "%Y-%m-%d %H:%M:%S")
                            articles_data.append([title, publish_date_str, tag, article_url])
                            if publish_date < date_limit:
                                print(f"Publish date {publish_date_str} is older than the limit. Stopping.")
                                print(f"Total number of downloaded articles: {len(articles_data)}")
                                limit_reached = True
                                break
                    page += 1
                else:
                    if response.status_code == 404:
                        break
                    print(f"Error requesting page {page}, status code: {response.status_code}")
                time.sleep(5)
        return self.get_proccesing_news(articles_data)

    def clean_text(self, text):
        # Keep only Cyrillic characters, lowercase, drop words shorter than 3 characters, then lemmatize
        text = re.sub(r'[^а-яА-Я\s]', '', text)
        text = text.lower()
        text = ' '.join([word for word in text.split() if len(word) >= 3])
        text = ' '.join([self.morph.parse(word)[0].normal_form for word in text.split()])
        return text

    def get_proccesing_news(self, articles_data):
        # Process the collected data and add sentiment analysis
        sentiment_model = pipeline("sentiment-analysis", model="mxlcw/rubert-tiny2-russian-financial-sentiment", framework="pt")
        news = pd.DataFrame(articles_data, columns=["title", "DATE", "tags", "url"])
        news['title'] = news['title'].ffill().map(self.clean_text)
        tqdm.pandas()  # register progress_apply on pandas Series/DataFrame
        news['sentiment'] = news['title'].progress_apply(lambda x: sentiment_model(x)[0]['label'])
        news = pd.get_dummies(news, columns=['sentiment'])
        news['DATE'] = pd.to_datetime(news['DATE'])
        news['day'] = news['DATE'].dt.day
        news['year'] = news['DATE'].dt.year
        news['month'] = news['DATE'].dt.month
        news['hour'] = news['DATE'].dt.hour
        return news
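
######
# A minimal usage sketch (not part of the original module): NewsData.get_data() scrapes
# the last seven days of articles from both configured sections and returns a pandas
# DataFrame with the cleaned titles, sentiment dummy columns, and date features.
# It assumes network access, a working proxy, and that the Hugging Face model can be
# downloaded at run time.
if __name__ == "__main__":
    parser = NewsData()
    df = parser.get_data()
    print(df.head())
    print(df.columns.tolist())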