import pandas as pd
from cleantext import clean
from langchain_community.document_loaders import WebBaseLoader
from mlrun.execution import MLClientCtx


def handler(
    context: MLClientCtx, data_set: str, num_samples: int = 10, random_state: int = 42
):
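    """
    Build a news-article dataset for a vector DB: read a semicolon-separated
    CSV, keep the 200 most recent articles, sample `num_samples` per topic,
    scrape each article's page, clean the text, and log the result as an
    MLRun dataset artifact.
    """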
    # Download raw data
    df = pd.read_csv(data_set, sep=";")

    # Get the latest 200 articles by date
    df["published_date"] = pd.to_datetime(df["published_date"])
    latest_200 = df.sort_values(by="published_date").tail(200)
    topics = latest_200["topic"].unique()

    # Sample num_samples articles per topic (health, technology, entertainment, etc.)
    dfs_per_topic = [
        latest_200[latest_200["topic"] == t].sample(
            n=num_samples, random_state=random_state
        )
        for t in topics
    ]
    merged_df = pd.concat(dfs_per_topic).reset_index(drop=True)

    # Scrape article content, rate-limited to 2 requests per second
    urls = merged_df["link"].tolist()
    loader = WebBaseLoader(web_paths=urls, continue_on_failure=True)
    loader.requests_per_second = 2
    docs = loader.aload()

    # Add cleaned article content and description
    merged_df["description"] = [d.metadata.get("description", None) for d in docs]
    merged_df["page_content"] = [clean(d.page_content, lower=False) for d in docs]

    # Log dataset
    context.log_dataset("vector-db-dataset", df=merged_df, format="csv")
    context.logger.info("Dataset downloaded and logged")
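

# Usage sketch (hypothetical, not part of the original file): one way this
# handler might be run as an MLRun job. The project name, file name, and
# DATA_URL are assumptions; the input CSV is expected to be semicolon-separated
# with published_date, topic, and link columns.
#
#   import mlrun
#
#   project = mlrun.get_or_create_project("news-articles", context="./")
#   data_prep = project.set_function(
#       "data_prep.py", name="data-prep", kind="job",
#       image="mlrun/mlrun", handler="handler",
#   )
#   run = data_prep.run(params={"data_set": DATA_URL, "num_samples": 10}, local=True)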