Spaces:
No application file
No application file
""" | |
Instalar os pacotes, assim: | |
$ poetry install | |
""" | |
import time | |
import pandas as pd | |
from tqdm import tqdm | |
from trafilatura.sitemaps import sitemap_search | |
from trafilatura import fetch_url, extract, extract_metadata | |
def get_urls_from_sitemap(resource_url: str) -> list: | |
""" | |
Função que cria um DataFrame Pandas de URLs e artigos. | |
""" | |
urls = sitemap_search(resource_url) | |
return urls | |
def extract_article(url: str) -> dict: | |
""" | |
Estrae un articolo da una URL con Trafilatura | |
""" | |
downloaded = fetch_url(url) | |
article = extract(downloaded, favor_precision=True, only_with_metadata=True) | |
metadata = extract_metadata(downloaded) | |
return article, metadata | |
def create_dataset(list_of_websites: list) -> pd.DataFrame: | |
""" | |
Funzione che crea un DataFrame Pandas di URL e articoli. | |
""" | |
data = [] | |
for website in tqdm(list_of_websites, desc="Websites"): | |
urls = get_urls_from_sitemap(website) | |
for url in tqdm(urls, desc="URLs"): | |
article, metadata = extract_article(url) | |
d = { | |
"url": url, | |
"article": article, | |
"title": metadata.title, | |
"description": metadata.description, | |
"author": metadata.author, | |
"date": metadata.date, | |
} | |
data.append(d) | |
time.sleep(0.5) | |
df = pd.DataFrame(data) | |
df = df.drop_duplicates() | |
df = df.dropna() | |
return df | |
if __name__ == "__main__": | |
list_of_websites = [ | |
"https://www.diariodiunanalista.it/", | |
] | |
df = create_dataset(list_of_websites) | |
df.to_csv("./data/articles.csv", index=False) | |