import requests
import pandas as pd
import time
import pymorphy2
import re
import cloudscraper

from datetime import datetime, timedelta
from transformers import pipeline
from bs4 import BeautifulSoup
from tqdm import tqdm

tqdm.pandas()  # register DataFrame.progress_apply used below

######

class NewsData:
    def __init__(self) -> None:
        """
        Парсер статей с ru.investing.com.
        """
        self.urls = [
            ("https://ru.investing.com/news/forex-news/", "forex-news"),
            ("https://ru.investing.com/news/commodities-news/", "commodities-news"),
        ]
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://www.google.com/'
        }

        self.proxies = {
            "http": "http://82.146.37.145:80",
            "https": "https://82.146.37.145:80"
        }


        self.morph = pymorphy2.MorphAnalyzer()  # used to reduce words to their normal (dictionary) form
        self.scraper = cloudscraper.create_scraper()
        
    def get_data(self):
        # Scrape articles published within the last 7 days
        date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7)
        articles_data = []

        for base_url, tag in self.urls:
            page = 1
            while True:
                url = f"{base_url}{page}/"
                # response = requests.get(url, headers=self.headers)
                response = self.scraper.get(url, headers=self.headers, proxies=self.proxies)
                print(response)

                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    articles = soup.find_all("a", attrs={"data-test": "article-title-link"})

                    publish_date = None

                    for article in articles:
                        title = article.text.strip()
                        date_tag = article.find_next("time", attrs={"data-test": "article-publish-date"})
                        article_url = article["href"]

                        if date_tag:
                            publish_date_str = date_tag["datetime"]
                            publish_date = datetime.strptime(publish_date_str, "%Y-%m-%d %H:%M:%S")

                            articles_data.append([title, publish_date_str, tag, article_url])

                    page += 1

                    # Stop once the last dated article on the page is older than the limit
                    if publish_date is not None and publish_date < date_limit:
                        print(f"Publish date {publish_date_str} is older than the limit. Stopping.")
                        print(f"Total number of articles downloaded: {len(articles_data)}")
                        break
                else:
                    if response.status_code == 404:
                        break

                    print(f"Error requesting page {page}, status code: {response.status_code}")
                    time.sleep(5)

        return self.get_processing_news(articles_data)
    
    def clean_text(self, text):
        text = re.sub(r'[^а-яА-Я\s]', '', text)  # keep only Cyrillic letters and whitespace
        text = text.lower()
        text = ' '.join([word for word in text.split() if len(word) >= 3])  # drop very short words
        text = ' '.join([self.morph.parse(word)[0].normal_form for word in text.split()])  # lemmatize
        return text
    
    def get_processing_news(self, articles_data):
        # Process the collected data and add sentiment labels.
        sentiment_model = pipeline("sentiment-analysis", model="mxlcw/rubert-tiny2-russian-financial-sentiment", framework="pt")

        news = pd.DataFrame(articles_data, columns=["title", "DATE", "tags", "url"])

        news['title'] = news['title'].ffill().map(self.clean_text)
        news['sentiment'] = news['title'].progress_apply(lambda x: sentiment_model(x)[0]['label'])
        news = pd.get_dummies(news, columns=['sentiment'])

        news['DATE'] = pd.to_datetime(news['DATE'])
        news['day'] = news['DATE'].dt.day
        news['year'] = news['DATE'].dt.year
        news['month'] = news['DATE'].dt.month
        news['hour'] = news['DATE'].dt.hour
        return news
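

# A minimal usage sketch, not part of the original module: it assumes the
# proxy, the investing.com pages, and the Hugging Face model above are all
# reachable from the current environment, and the output path is hypothetical.
if __name__ == "__main__":
    collector = NewsData()
    news_df = collector.get_data()  # scrape the last 7 days and run sentiment analysis
    print(news_df.head())
    news_df.to_csv("investing_news.csv", index=False)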