import os
import re
import pandas as pd
from urllib.parse import urlparse
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)
class UTILS:
    def __init__(self):
        pass

    def split_text(
        self,
        text
    ):
        # Split a comma-separated string into a list of trimmed values
        text = text.split(',')
        text = [t.strip() for t in text]
        return text
    def replace_newlines_and_spaces(
        self,
        text
    ):
        # Replace all newline characters with spaces
        text = text.replace("\n", " ")
        # Collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text)
        return text
    def clean_df(
        self,
        df,
        dropna=True,
        fillna=False
    ):
        # Basic DataFrame cleanup: handle missing values, then drop duplicate rows
        if fillna:
            df.fillna('', inplace=True)
        if dropna:
            df.dropna(inplace=True)
        df = df.drop_duplicates().reset_index(drop=True)
        return df
    def validate_url_format(
        self,
        urls,
        url_type='urls'
    ):
        # Keep only well-formed URLs; for online PDFs, additionally require a .pdf extension
        valid_urls = []
        for url in urls:
            result = urlparse(url)
            # A valid URL must have both a scheme and a network location
            if all([result.scheme, result.netloc]):
                # Online PDF URLs should end with the .pdf extension
                if url_type == 'online_pdf' and not url.endswith('.pdf'):
                    continue
                valid_urls.append(url)
        logger.info(f'Valid URLs are: {valid_urls}')
        return valid_urls
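

# Minimal usage sketch (illustrative only, not part of the original module):
# it exercises each UTILS helper with small hypothetical inputs
# (sample_df and sample_urls below are made-up example data).
if __name__ == "__main__":
    utils = UTILS()

    # Comma splitting and whitespace normalization
    print(utils.split_text("wheat, rice , maize"))        # ['wheat', 'rice', 'maize']
    print(utils.replace_newlines_and_spaces("a\nb   c"))  # 'a b c'

    # DataFrame cleanup: drops the NaN row and the duplicate row
    sample_df = pd.DataFrame({"crop": ["wheat", "wheat", None]})
    print(utils.clean_df(sample_df))

    # URL validation: only the well-formed .pdf URL survives
    sample_urls = ["https://example.com/doc.pdf", "not-a-url"]
    print(utils.validate_url_format(sample_urls, url_type='online_pdf'))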