import logging
import os
import re
from urllib.parse import urlparse

import pandas as pd

logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)


class UTILS:
    """Small collection of text, DataFrame and URL helper methods."""

    def __init__(self):
        pass

    def split_text(self, text):
        """Split *text* on commas and strip whitespace around each piece.

        Args:
            text: The comma-separated string to split.

        Returns:
            A list of the stripped pieces, in their original order. Empty
            pieces (e.g. from consecutive commas) are kept as empty strings.
        """
        return [piece.strip() for piece in text.split(',')]

    def replace_newlines_and_spaces(self, text):
        """Collapse every run of whitespace in *text* into a single space.

        Newlines, tabs and repeated spaces are all matched by ``\\s+``, so a
        separate newline replacement is unnecessary. Leading and trailing
        whitespace is collapsed to one space, not removed.

        Args:
            text: The string to normalise.

        Returns:
            The normalised string.
        """
        return re.sub(r'\s+', ' ', text)

    def clean_df(self, df, dropna=True, fillna=False):
        """Return a cleaned copy of *df* without modifying the input.

        Args:
            df: The DataFrame to clean.
            dropna: If true, drop every row that contains a missing value.
            fillna: If true, replace missing values with ``''``. This is
                applied before ``dropna``, so when both flags are set no
                rows are dropped for missing values.

        Returns:
            A new DataFrame with duplicate rows removed and the index reset
            to a fresh 0..n-1 range.
        """
        # Work on new objects rather than ``inplace=True`` so the caller's
        # DataFrame is never mutated as a side effect of cleaning.
        cleaned = df
        if fillna:
            cleaned = cleaned.fillna('')
        if dropna:
            cleaned = cleaned.dropna()
        return cleaned.drop_duplicates().reset_index(drop=True)

    def validate_url_format(self, urls, url_type='urls'):
        """Filter *urls* down to those that are syntactically valid.

        A URL is considered valid when it parses with both a scheme and a
        network location. When ``url_type`` is ``'online_pdf'`` the URL must
        additionally end with ``.pdf``.

        Args:
            urls: Iterable of URL strings to check.
            url_type: Kind of URL being validated; only ``'online_pdf'``
                triggers the extra extension check.

        Returns:
            A list of the URLs that passed validation, in input order.
        """
        valid_urls = []
        for url in urls:
            try:
                parsed = urlparse(url)
            except ValueError:
                # urlparse raises on malformed input (e.g. an unbalanced
                # IPv6 bracket); treat that as invalid rather than aborting.
                logger.debug("Skipping unparsable URL: %r", url)
                continue

            if not (parsed.scheme and parsed.netloc):
                continue
            # Online PDF urls should end with .pdf extension
            if url_type == 'online_pdf' and not url.endswith('.pdf'):
                continue
            valid_urls.append(url)

        logger.info("Valid URLs are: %s", valid_urls)
        return valid_urls