import requests
import pandas as pd
import time
import pymorphy2
import re
import cloudscraper
from datetime import datetime, timedelta
from transformers import pipeline
from bs4 import BeautifulSoup
from tqdm import tqdm  # needed for pandas progress_apply below
######
class NewsData:
    def __init__(self) -> None:
        """
        Article parser for ru.investing.com.
        """
        self.urls = [
            ("https://ru.investing.com/news/forex-news/", "forex-news"),
            ("https://ru.investing.com/news/commodities-news/", "commodities-news"),
        ]
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://www.google.com/'
        }
        self.proxies = {
            "http": "http://82.146.37.145:80",
            "https": "https://82.146.37.145:80"
        }
        self.morph = pymorphy2.MorphAnalyzer()  # used to reduce words to their normal (dictionary) form
        self.scraper = cloudscraper.create_scraper()
    def get_data(self):
        # Scrape the listing pages for news published within the last 7 days.
        date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7)
        articles_data = []
        for base_url, tag in self.urls:
            page = 1
            while True:
                url = f"{base_url}{page}/"
                # response = requests.get(url, headers=self.headers)
                response = self.scraper.get(url, headers=self.headers, proxies=self.proxies)
                print(response)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, "html.parser")
                    articles = soup.find_all("a", attrs={"data-test": "article-title-link"})
                    daily_count = 0
                    for article in articles:
                        title = article.text.strip()
                        date_tag = article.find_next("time", attrs={"data-test": "article-publish-date"})
                        url = article["href"]
                        if date_tag:
                            publish_date_str = date_tag["datetime"]
                            publish_date = datetime.strptime(publish_date_str, "%Y-%m-%d %H:%M:%S")
                            articles_data.append([title, publish_date_str, tag, url])
                            daily_count += 1
                    page += 1
                    # Stop paging once the last parsed article is older than the limit.
                    if publish_date < date_limit:
                        print(f"Publish date {publish_date_str} is older than the limit. Stopping.")
                        print(f"Total number of scraped articles: {len(articles_data)}")
                        break
                else:
                    if response.status_code == 404:
                        break
                    print(f"Error requesting page {page}, status code: {response.status_code}")
                time.sleep(5)
        return self.get_proccesing_news(articles_data)
    def clean_text(self, text):
        # Keep only Cyrillic letters and whitespace, lowercase the text,
        # drop words shorter than 3 letters, and lemmatize the rest with pymorphy2.
        text = re.sub(r'[^а-яА-Я\s]', '', text)
        text = text.lower()
        text = ' '.join([word for word in text.split() if len(word) >= 3])
        text = ' '.join([self.morph.parse(word)[0].normal_form for word in text.split()])
        return text
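    # Illustrative sketch of what clean_text produces (an assumption based on
    # pymorphy2's default dictionaries, not part of the original file):
    #   clean_text("Доллар вырос на 1,5% к рублю")
    #   -> digits, punctuation and words shorter than 3 letters are dropped and the
    #      remaining words are lemmatized, giving roughly "доллар вырасти рубль"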
    def get_proccesing_news(self, articles_data):
        # Post-process the scraped articles and add sentiment analysis.
        sentiment_model = pipeline("sentiment-analysis", model="mxlcw/rubert-tiny2-russian-financial-sentiment", framework="pt")
        tqdm.pandas()  # register progress_apply on pandas objects
        news = pd.DataFrame(articles_data, columns=["title", "DATE", "tags", 'url'])
        news['title'] = news['title'].ffill().map(self.clean_text)
        news['sentiment'] = news['title'].progress_apply(lambda x: sentiment_model(x)[0]['label'])
        news = pd.get_dummies(news, columns=['sentiment'])
        news['DATE'] = pd.to_datetime(news['DATE'])
        news['day'] = news['DATE'].dt.day
        news['year'] = news['DATE'].dt.year
        news['month'] = news['DATE'].dt.month
        news['hour'] = news['DATE'].dt.hour
        return news
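
# Minimal usage sketch (an assumption, not part of the original class): run as a
# standalone script, provided the proxy, ru.investing.com and the Hugging Face
# model above are reachable. The output file name is hypothetical.
if __name__ == "__main__":
    parser = NewsData()
    news_df = parser.get_data()  # scrape, clean and sentiment-score the last 7 days of news
    print(news_df.head())
    news_df.to_csv("investing_news.csv", index=False)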