kristada673 committed
Commit 9014990 · 1 Parent(s): 2f8bae4

Upload 23 files

finnlp/data_sources/news/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (174 Bytes).
 
finnlp/data_sources/news/__pycache__/_base.cpython-310.pyc ADDED
Binary file (1.36 kB).
 
finnlp/data_sources/news/__pycache__/finnhub_date_range.cpython-310.pyc ADDED
Binary file (5.66 kB).
 
finnlp/data_sources/news/_base.py ADDED
@@ -0,0 +1,22 @@
+from finnlp.data_sources._base import FinNLP_Downloader
+
+class News_Downloader(FinNLP_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        pass
+
+    def download_date_range(self, start_date, end_date, stock=None):
+        pass
+
+    def download_streaming(self, stock=None):
+        pass
+
+    def clean_data(self):
+        pass
+
+    def _gather_one_part(self, date, stock=None, delay=0.1):
+        pass
+
+    def _gather_content(self):
+        pass
finnlp/data_sources/news/akshare_cctv.py ADDED
@@ -0,0 +1,29 @@
+import pandas as pd
+import akshare as ak
+from tqdm.notebook import tqdm
+from finnlp.data_sources.news._base import News_Downloader
+
+
+class Akshare_cctv(News_Downloader):
+
+    def __init__(self, args={}):
+        pass
+
+    def download_news(self, start_date, end_date, stock="all"):
+        self.date_list = pd.date_range(start_date, end_date)
+        res = pd.DataFrame()
+        for date in tqdm(self.date_list):
+            tmp = self.gather_one_day_news(date)
+            res = pd.concat([res, tmp])
+        self.dataframe = res
+
+    def clean_data(self):
+        pass
+
+    def gather_one_day_news(self, date, stock="all", delay=0.1):
+        date = self.transfer_standard_date_to_nonstandard(date)
+        res = ak.news_cctv(date=date)
+        return res
+
+    def transfer_standard_date_to_nonstandard(self, date):
+        return date.strftime("%Y%m%d")
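A minimal usage sketch for Akshare_cctv (not part of this commit; it assumes akshare is installed and the CCTV feed is reachable):

    from finnlp.data_sources.news.akshare_cctv import Akshare_cctv

    downloader = Akshare_cctv()
    downloader.download_news(start_date="2023-01-01", end_date="2023-01-03")
    print(downloader.dataframe.head())  # one row per CCTV news item returned by ak.news_cctv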
finnlp/data_sources/news/alliancenews_streaming.py ADDED
File without changes
finnlp/data_sources/news/cnbc_streaming.py ADDED
File without changes
finnlp/data_sources/news/eastmoney_streaming.py ADDED
@@ -0,0 +1,69 @@
+import requests
+from lxml import etree
+from tqdm import tqdm
+import pandas as pd
+from finnlp.data_sources.news._base import News_Downloader
+
+
+class Eastmoney_Streaming(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_streaming_stock(self, stock="600519", rounds=3):
+        print("Getting pages: ", end="")
+        if rounds > 0:
+            for r in range(rounds):
+                br = self._gather_pages(stock, r)
+                if br == "break":
+                    break
+        else:
+            r = 1
+            error_count = 0
+            while 1:
+                br = self._gather_pages(stock, r)
+                if br == "break":
+                    break
+                elif br == "Error":
+                    error_count += 1
+                    if error_count > 10:
+                        print("Connection Error")
+                        break
+                r += 1
+        print(f"Got {r+1} pages in total.")
+        self.dataframe = self.dataframe.reset_index(drop=True)
+
+    def _gather_pages(self, stock, page):
+        print(page, end=" ")
+        url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+        }
+
+        requests.DEFAULT_RETRIES = 5  # increase the number of connection retries
+        s = requests.session()
+        s.keep_alive = False  # close redundant connections
+
+        response = self._request_get(url, headers=headers)
+        if response is None or response.status_code != 200:
+            return "Error"
+
+        # gather the content of the page
+        page = etree.HTML(response.text)
+        trs = page.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr')
+        have_one = False
+        for item in trs:
+            have_one = True
+            read_amount = item.xpath("./td[1]//text()")[0]
+            comments = item.xpath("./td[2]//text()")[0]
+            title = item.xpath("./td[3]/div/a//text()")[0]
+            content_link = item.xpath("./td[3]/div/a/@href")[0]
+            author = item.xpath("./td[4]//text()")[0]
+            time = item.xpath("./td[5]//text()")[0]
+            tmp = pd.DataFrame([read_amount, comments, title, content_link, author, time]).T
+            columns = ["read amount", "comments", "title", "content link", "author", "create time"]
+            tmp.columns = columns
+            self.dataframe = pd.concat([self.dataframe, tmp])
+            # print(title)
+        if have_one == False:
+            return "break"
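A minimal usage sketch for Eastmoney_Streaming (not part of this commit; the Guba page layout can change, so the XPath above is best-effort):

    from finnlp.data_sources.news.eastmoney_streaming import Eastmoney_Streaming

    downloader = Eastmoney_Streaming()
    downloader.download_streaming_stock(stock="600519", rounds=3)  # first three forum pages for stock 600519
    print(downloader.dataframe[["title", "create time"]].head())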
finnlp/data_sources/news/finnhub_date_range.py ADDED
@@ -0,0 +1,222 @@
+import warnings
+warnings.filterwarnings("ignore")
+
+from finnlp.data_sources.news._base import News_Downloader
+
+from tqdm import tqdm
+from lxml import etree
+import pandas as pd
+import requests
+import finnhub
+import time
+import json
+
+class Finnhub_Date_Range(News_Downloader):
+    def __init__(self, args={}):
+        super().__init__(args)
+        assert "token" in args.keys(), "Please input your finnhub token. Available at https://finnhub.io/dashboard"
+        self.finnhub_client = finnhub.Client(api_key=args["token"])
+
+    def download_date_range_stock(self, start_date, end_date, stock="AAPL"):
+        self.date_list = pd.date_range(start_date, end_date)
+        self.dataframe = pd.DataFrame()
+
+        days_each_time = 4
+        date_list = self.date_list
+        # calculate the total number of batches
+        if len(date_list) % days_each_time == 0:
+            total = len(date_list) // days_each_time
+        else:
+            total = len(date_list) // days_each_time + 1
+
+        with tqdm(total=total, desc="Downloading Titles") as bar:
+            while len(date_list):
+                tmp_date_list = date_list[:days_each_time]
+                date_list = date_list[days_each_time:]
+                tmp_start_date = tmp_date_list[0].strftime("%Y-%m-%d")
+                tmp_end_date = tmp_date_list[-1].strftime("%Y-%m-%d")
+                res = self._gather_one_part(tmp_start_date, tmp_end_date, stock=stock)
+                self.dataframe = pd.concat([self.dataframe, res])
+                bar.update(1)
+
+        # res = self.finnhub_client.company_news(stock, _from=start_date, to=end_date)
+        self.dataframe.datetime = pd.to_datetime(self.dataframe.datetime, unit="s")
+        self.dataframe = self.dataframe.reset_index(drop=True)
+
+    def _gather_one_part(self, start_date, end_date, stock="AAPL", delay=1):
+        res = self.finnhub_client.company_news(stock, _from=start_date, to=end_date)
+        time.sleep(delay)
+        return pd.DataFrame(res)
+
+    def gather_content(self, delay=0.01):
+        pbar = tqdm(total=self.dataframe.shape[0], desc="Gathering news contents")
+        self.dataframe["content"] = self.dataframe.apply(lambda x: self._gather_content_apply(x, pbar, delay), axis=1)
+
+    def _gather_content_apply(self, x, pbar, delay=0.01):
+        time.sleep(delay)
+        url = x.url
+        source = x.source
+        response = self._request_get(url=url)
+        # response = self._request_get(url=url, headers=headers)
+        pbar.update(1)
+        if response is None:
+            return "Connection Error"
+        else:
+            page = etree.HTML(response.text)
+
+        try:
+            # Yahoo Finance
+            if source == "Yahoo":
+                page = page.xpath("/html/body/div[3]/div[1]/div/main/div[1]/div/div/div/div/article/div/div/div/div/div/div[2]/div[4]")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # Reuters
+            elif source == "Reuters":
+                page = page.xpath("/html/body/div[1]/div[3]/div/main/article/div[1]/div[2]/div/div/div[2]")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # SeekingAlpha
+            elif source == "SeekingAlpha":
+                page = page.xpath("/html/body/div[2]/div/div[1]/main/div/div[2]/div/article/div/div/div[2]/div/section[1]/div/div/div")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # PennyStocks
+            elif source == "PennyStocks":
+                page = page.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/article/div[2]/div[2]/div")
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # MarketWatch
+            elif source == "MarketWatch":
+                page = page.xpath('//*[@id="js-article__body"]')
+                content = page[0].xpath(".//text()")
+                content = "".join(content)
+                # collapse repeated whitespace
+                while "  " in content:
+                    content = content.replace("  ", " ")
+                while "\n \n" in content:
+                    content = content.replace("\n \n", " ")
+                while "\n " in content:
+                    content = content.replace("\n ", " ")
+                return content
+
+            # Seeking Alpha
+            elif source == "Seeking Alpha":
+                # first get the Seeking Alpha URL
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url=url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                content = page[0].xpath(".//text()")
+                content = "\n".join(content)
+                return content
+
+            # Alliance News
+            elif source == "Alliance News":
+                page = page.xpath('//*[@id="comtext"]')
+                content = page[0].xpath(".//text()")
+                content = [c for c in content if not str(c).startswith("\r\n")]
+                content = "\n".join(content)
+                return content
+
+            # Thefly.com
+            elif source == "Thefly.com":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url=url_new, verify=False)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                page = page.xpath('/html/body/div[2]/div/div/div/div/div[2]/div[2]//text()')
+                # content = page[0].xpath(".//text()")
+                # content = [c for c in content if not str(c).startswith("\r\n")]
+                content = "\n".join(page)
+                content = content.replace("\r\n", "")
+
+                return content
+
+            # TalkMarkets
+            elif source == "TalkMarkets":
+                return "Not supported yet"
+
+            # CNBC
+            elif source == "CNBC":
+                page = page.xpath('/html/body/div[3]/div/div[1]/div[3]/div/div/div/div[3]/div[1]/div[2]/div[3]//text()')
+                content = "\n".join(page)
+
+                return content
+
+            # GuruFocus
+            elif source == "GuruFocus":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url=url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                page = page.xpath('/html/body/div[1]/div/section/section/main/section/main/div[1]/div/div/div[1]/div[2]/div//text()')
+                page_new = []
+                for c in page:
+                    # collapse newlines and repeated spaces
+                    while "\n" in c:
+                        c = c.replace("\n", "")
+                    while "  " in c:
+                        c = c.replace("  ", " ")
+                    page_new.append(c)
+
+                content = "\n".join(page_new)
+
+                return content
+
+            # InvestorPlace
+            elif source == "InvestorPlace":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url=url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                page = page.xpath('//script[@type="application/ld+json"]')[1]
+                content = page.xpath(".//text()")
+                content = json.loads(content[0])
+                content = content["articleBody"]
+
+                return content
+
+            # TipRanks
+            elif source == "TipRanks":
+                page = page.xpath('/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href')
+                url_new = page[0]
+                response = self._request_get(url=url_new)
+                if response is None:
+                    return "Connection Error"
+                else:
+                    page = etree.HTML(response.text)
+
+                # /html/body/div[1]/div[2]/div[5]/div[2]/div[2]/div/div[6]/div/article/p[1]/p
+                page = page.xpath('/html/body/div[1]/div[1]/div[4]/div[2]/div[2]/div[1]/div[6]//text()')
+                # content = page[0].xpath('.//text()')
+                page = [p.replace("\n", "") for p in page]
+                content = "".join(page)
+                return content
+
+            else:
+                return "Not supported yet"
+
+        except:
+            return "Error"
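A minimal usage sketch for Finnhub_Date_Range (not part of this commit; YOUR_FINNHUB_TOKEN is a placeholder for a real API key):

    from finnlp.data_sources.news.finnhub_date_range import Finnhub_Date_Range

    downloader = Finnhub_Date_Range(args={"token": "YOUR_FINNHUB_TOKEN"})
    downloader.download_date_range_stock(start_date="2023-01-01", end_date="2023-01-07", stock="AAPL")
    downloader.gather_content()  # optional: scrape full article text for the supported sources
    print(downloader.dataframe[["datetime", "source", "url"]].head())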
finnlp/data_sources/news/fmp_streaming.py ADDED
@@ -0,0 +1,24 @@
+import json
+import requests
+import pandas as pd
+from tqdm.notebook import tqdm
+
+df = pd.read_csv("NAS.csv", index_col=0)
+stock_list = df.index.to_list()
+
+api_key = YOUR_API_KEY  # You may find your api key here: https://site.financialmodelingprep.com/developer/docs/api-keys
+
+all = pd.DataFrame()
+for stock in tqdm(stock_list):
+    for page in tqdm(range(500)):
+        url = f"https://financialmodelingprep.com/api/v3/stock_news?tickers={stock}&page={page+1}&apikey={api_key}"
+        res = requests.get(url)
+        res = json.loads(res.text)
+        if len(res) == 0:
+            break
+        else:
+            res = pd.DataFrame(res)
+            all = pd.concat([all, res])
+
+all = all.reset_index(drop=True)
+all.to_csv("dataset_more.csv")
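Note that fmp_streaming.py is a standalone script rather than a News_Downloader subclass: it expects a NAS.csv whose first column holds ticker symbols, and a real key in place of the YOUR_API_KEY placeholder. A minimal sketch (an assumption, not part of the commit) for producing such a file:

    import pandas as pd

    # hypothetical ticker list; the real NAS.csv would list the NASDAQ symbols of interest
    pd.DataFrame({"ticker": ["AAPL", "MSFT", "NVDA"]}).set_index("ticker").to_csv("NAS.csv")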
finnlp/data_sources/news/gurufocus_streaming.py ADDED
File without changes
finnlp/data_sources/news/investorplace_streaming.py ADDED
File without changes
finnlp/data_sources/news/marketwatch_streaming.py ADDED
File without changes
finnlp/data_sources/news/pennystocks_streaming.py ADDED
@@ -0,0 +1,81 @@
+import requests
+from lxml import etree
+from tqdm import tqdm
+import pandas as pd
+import json
+import time
+from finnlp.data_sources.news._base import News_Downloader
+
+# TODO:
+# 1. More Pages
+# 2. Contents
+
+class PennyStocks_Streaming(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_streaming_search(self, keyword="apple", rounds=3, delay=2):
+        # establish session
+        self._connect_session()
+
+        # download first page
+        self._download_first_page(keyword, delay=delay)
+
+        # download the following pages
+        # self._download_other_pages(keyword)
+        print("Only support the first page now!")
+
+    def _connect_session(self):
+        # since the server checks cookies, we first request the main site
+        # without cookies, then run the search for the stock information we want
+        self.session = requests.session()
+        first_url = "https://pennystocks.com/"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+        }
+        print("Requesting https://pennystocks.com ...", end=" ")
+        res = self.session.get(headers=headers, url=first_url)
+        if res.status_code != 200:
+            raise ConnectionError("Can't request https://pennystocks.com. Please check your connection or report this issue on Github")
+
+        print("succeeded!")
+
+    def _download_first_page(self, keyword="apple", max_retry=5, delay=2):
+        url = f"https://pennystocks.com/?s={keyword}"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+        }
+        res = self.session.get(url=url, headers=headers)
+        res = etree.HTML(res.text)
+        articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article")
+        # not sure why but this really works
+
+        while max_retry and len(articles) == 0:
+            time.sleep(delay)
+            print("Gathering again ..", end=' ')
+            res = requests.get(url=url, headers=headers, cookies=self.session.cookies)
+            res = etree.HTML(res.text)
+            articles = res.xpath("/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article")
+            max_retry -= 1
+            print(f"Remaining Retry: {max_retry}")
+
+        for a in articles:
+            title = a.xpath("./header/h2/a//text()")[0]
+            # renamed from `time` so the loop variable does not shadow the time module
+            publish_time = a.xpath("./div[3]/div/div/ul/li[1]/text()")[0]
+            brief = a.xpath("./div[3]/div/div/text()")[0]
+            reading_time = a.xpath("./div[3]/div/div/ul/li[2]/text()")[0]
+            columns = ["title", "time", "brief", "reading_time"]
+            tmp = pd.DataFrame([[title, publish_time, brief, reading_time]], columns=columns)
+            self.dataframe = pd.concat([self.dataframe, tmp])
+
+    def _download_other_pages(self, keyword="apple"):
+        pass
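A minimal usage sketch for PennyStocks_Streaming (not part of this commit; as noted in the TODO, only the first result page is gathered):

    from finnlp.data_sources.news.pennystocks_streaming import PennyStocks_Streaming

    downloader = PennyStocks_Streaming()
    downloader.download_streaming_search(keyword="apple")
    print(downloader.dataframe[["title", "time"]].head())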
finnlp/data_sources/news/reuters_streaming.py ADDED
@@ -0,0 +1,55 @@
+import requests
+from lxml import etree
+from tqdm import tqdm
+import pandas as pd
+import json
+import time
+from finnlp.data_sources.news._base import News_Downloader
+
+# TODO:
+# 1. Contents
+
+
+class Reuters_Streaming(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_streaming_search(self, keyword="apple", rounds=3, delay=0.5):
+        news_per_page = 20
+        url = "https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2"
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+            "Referer": "https://www.reuters.com/site-search/?query=AAPL&sort=newest&offset=0"
+        }
+
+        print("Getting pages: ", end="")
+        for i in range(rounds):
+            offset = i * news_per_page
+            params = {
+                "query": f'{{"keyword":"{keyword}","offset":{offset},"orderby":"display_date:desc","size":20,"website":"reuters"}}',
+                "d": "144",
+                "_website": "reuters",
+            }
+            response = self._request_get(url, headers=headers, params=params)
+
+            # check connection error
+            if response is None or response.status_code != 200:
+                return "Error"
+
+            # parse the response
+            response = json.loads(response.text)
+
+            # check whether content was returned
+            if response["statusCode"] != 200:
+                print("Early Stopping")
+                break
+
+            # make pandas DataFrame
+            tmp = pd.DataFrame(response["result"]["articles"])
+            self.dataframe = pd.concat([self.dataframe, tmp])
+
+            # finish
+            print(i+1, end=" ")
+            time.sleep(delay)
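A minimal usage sketch for Reuters_Streaming (not part of this commit; each round requests one 20-article page from the Reuters search endpoint):

    from finnlp.data_sources.news.reuters_streaming import Reuters_Streaming

    downloader = Reuters_Streaming()
    downloader.download_streaming_search(keyword="apple", rounds=3)
    print(downloader.dataframe.shape)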
finnlp/data_sources/news/seekingalpha_date_range.py ADDED
@@ -0,0 +1,57 @@
+import warnings
+warnings.filterwarnings("ignore")
+
+import json
+import requests
+import pandas as pd
+from lxml import etree
+from tqdm import tqdm
+from datetime import datetime
+
+from finnlp.data_sources.news._base import News_Downloader
+
+class SeekingAlpha_Date_Range(News_Downloader):
+    def __init__(self, args={}):
+        super().__init__(args)
+
+    def download_date_range_stock(self, start_date, end_date, stock="AAPL", proxies=None):
+        self.dataframe = pd.DataFrame()
+        start_timestamp = int(datetime.strptime(start_date + '-13', "%Y-%m-%d-%H").timestamp())
+        end_timestamp = int(datetime.strptime(end_date + '-13', "%Y-%m-%d-%H").timestamp())
+        # Downloading First Page
+        data, totalpages = self._gather_by_page(start_timestamp, end_timestamp, stock, 1, proxies)
+        self.dataframe = pd.concat([self.dataframe, data])
+
+        # Downloading Other Pages
+        with tqdm(total=totalpages, desc="Downloading Titles") as bar:
+            bar.update(1)
+            for page in range(2, totalpages + 1):
+                data, _ = self._gather_by_page(start_timestamp, end_timestamp, stock, page, proxies)
+                self.dataframe = pd.concat([self.dataframe, data])
+                bar.update(1)
+        self.dataframe = self.dataframe.reset_index(drop=True)
+
+    def _gather_by_page(self, start_timestamp, end_timestamp, stock, page=1, proxies=None):
+        url = f"https://seekingalpha.com/api/v3/symbols/{stock}/news?filter[since]={start_timestamp}&filter[until]={end_timestamp}&id={stock}&include=author%2CprimaryTickers%2CsecondaryTickers%2Csentiments&isMounting=true&page[size]=40&page[number]={page}"
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0',
+            'Referer': 'https://seekingalpha.com/symbol/aapl/news?from=2009-12-31T16%3A00%3A00.000Z&to=2022-01-01T15%3A59%3A59.999Z'
+        }
+        response = requests.get(url, headers=headers, proxies=proxies)
+        if response.status_code != 200:
+            print(f"stock: {stock}, page: {page} went wrong!")
+            return pd.DataFrame(), 1
+        else:
+            res = json.loads(response.text)
+            data = pd.DataFrame(res["data"])
+            # make new features
+            new_columns = ["publishOn", "isLockedPro", "commentCount", "gettyImageUrl", "videoPreviewUrl", "themes", "title", "isPaywalled"]
+            data[new_columns] = data.apply(lambda x: list(x.attributes.values()), axis=1, result_type="expand")
+            new_columns = ["author", "sentiments", "primaryTickers", "secondaryTickers", "otherTags"]
+            data[new_columns] = data.apply(lambda x: list(x.relationships.values()), axis=1, result_type="expand")
+
+            # total pages
+            totalpages = res["meta"]["page"]["totalPages"]
+            return data, totalpages
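A minimal usage sketch for SeekingAlpha_Date_Range (not part of this commit; a proxies dict can be passed through to requests if needed):

    from finnlp.data_sources.news.seekingalpha_date_range import SeekingAlpha_Date_Range

    downloader = SeekingAlpha_Date_Range()
    downloader.download_date_range_stock(start_date="2023-01-01", end_date="2023-01-31", stock="AAPL")
    print(downloader.dataframe[["title", "publishOn"]].head())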
finnlp/data_sources/news/sina_finance_date_range.py ADDED
@@ -0,0 +1,86 @@
+import json
+import pytz
+import time
+import requests
+import pandas as pd
+import numpy as np
+from lxml import etree
+from tqdm import tqdm
+from finnlp.data_sources.news._base import News_Downloader
+
+class Sina_Finance_Date_Range(News_Downloader):
+
+    def __init__(self, args={}):
+        super().__init__(args)
+        self.dataframe = pd.DataFrame()
+
+    def download_date_range_all(self, start_date, end_date):
+        self.date_list = pd.date_range(start_date, end_date)
+        for date in tqdm(self.date_list, desc="Downloading Titles..."):
+            tmp = self._gather_one_day(date)
+            self.dataframe = pd.concat([self.dataframe, tmp])
+        self.dataframe = self.dataframe.reset_index(drop=True)
+
+    def _gather_one_day(self, date, delay=0.1):
+        end_timestamp = pd.to_datetime(f"{date} 16:00:00").timestamp()
+        start_timestamp = end_timestamp - 60 * 60 * 24
+
+        res = pd.DataFrame()
+        for page in range(100):
+            url = f"https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2516&etime={start_timestamp}&stime={end_timestamp}&ctime={end_timestamp}&date={date}&k=&num=50&page={page}"
+            response = self._request_get(url=url)
+            if response is not None:
+                response.encoding = 'unicode'
+                text = response.text
+                text = json.loads(text, strict=True)
+                text = text["result"]
+                text = text["data"]
+                if len(text) == 0:
+                    break
+
+                for i in text:
+                    for ii in i.keys():
+                        i[ii] = [i[ii]]
+                    tmp = pd.DataFrame(i)
+                    res = pd.concat([res, tmp])
+            time.sleep(delay)
+
+        if res.shape[0] != 0:
+            res.ctime = pd.to_datetime(res.ctime, unit="s", utc=True)
+            res.mtime = pd.to_datetime(res.mtime, unit="s", utc=True)
+            res.intime = pd.to_datetime(res.intime, unit="s", utc=True)
+
+            tz = pytz.timezone("Asia/Shanghai")
+            res.ctime = [t.astimezone(tz) for t in res.ctime]
+            res.mtime = [t.astimezone(tz) for t in res.mtime]
+            res.intime = [t.astimezone(tz) for t in res.intime]
+
+        return res
+
+    def gather_content(self, delay=0.01):
+        pbar = tqdm(total=self.dataframe.shape[0], desc="Gathering news contents")
+        self.dataframe["content"] = self.dataframe.apply(lambda x: self._gather_content_apply(x, pbar, delay), axis=1)
+
+    def _gather_content_apply(self, x, pbar, delay=0.01):
+        url = x.url
+        response = self._request_get(url=url)
+
+        if response is not None:
+            # process
+            response.encoding = 'unicode'
+            text = response.text
+            page = etree.HTML(text)
+            page = page.xpath("//*[@id='artibody']/p")
+            page = [p.xpath(".//text()") for p in page]
+            page = [''.join(p) for p in page]
+            content = "\n".join(page)
+            content = content.replace("\u3000", "")
+        else:
+            content = np.nan
+
+        # update
+        pbar.update(1)
+        time.sleep(delay)
+
+        return content
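A minimal usage sketch for Sina_Finance_Date_Range (not part of this commit; timestamps are converted to Asia/Shanghai as in the code above):

    from finnlp.data_sources.news.sina_finance_date_range import Sina_Finance_Date_Range

    downloader = Sina_Finance_Date_Range()
    downloader.download_date_range_all(start_date="2023-01-01", end_date="2023-01-02")
    downloader.gather_content()  # optional: fetch article bodies
    print(downloader.dataframe[["ctime", "url", "content"]].head())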
finnlp/data_sources/news/talkmarkets_streaming.py ADDED
File without changes
finnlp/data_sources/news/thefly_streaming.py ADDED
File without changes
finnlp/data_sources/news/tipranks_streaming.py ADDED
File without changes
finnlp/data_sources/news/tushare_major_news.py ADDED
@@ -0,0 +1,32 @@
+import tushare as ts
+import pandas as pd
+from tqdm.notebook import tqdm
+from finnlp.data_sources.news._base import News_Downloader
+import time
+
+class Tushare_Major_News(News_Downloader):
+
+    def __init__(self, args={}):
+        token = args["token"] if "token" in args.keys() else "27080ec403c0218f96f388bca1b1d85329d563c91a43672239619ef5"
+        ts.set_token(token)
+        self.pro = ts.pro_api()
+
+    def download_news(self, start_date, end_date, stock="all"):
+        self.date_list = pd.date_range(start_date, end_date)
+        res = pd.DataFrame()
+        for date in tqdm(self.date_list):
+            tmp = self.gather_one_day_news(date)
+            res = pd.concat([res, tmp])
+        self.dataframe = res
+
+    def gather_one_day_news(self, date, stock="all", delay=0.1):
+        date = self.transfer_standard_date_to_nonstandard(date)
+        res = self.pro.major_news(start_date=date, end_date=date)
+        time.sleep(delay)
+        return res
+
+    def clean_data(self):
+        pass
+
+    def transfer_standard_date_to_nonstandard(self, date):
+        return date.strftime("%Y-%m-%d 00:00:00")
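A minimal usage sketch for Tushare_Major_News (not part of this commit; YOUR_TUSHARE_TOKEN is a placeholder, and the major_news endpoint may require sufficient tushare account credits):

    from finnlp.data_sources.news.tushare_major_news import Tushare_Major_News

    downloader = Tushare_Major_News(args={"token": "YOUR_TUSHARE_TOKEN"})
    downloader.download_news(start_date="2023-01-01", end_date="2023-01-02")
    print(downloader.dataframe.head())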
finnlp/data_sources/news/yahoo_streaming.py ADDED
File without changes
finnlp/data_sources/news/yicai_streaming.py ADDED
File without changes