# EddyGiusepe's picture
# Using LangChain for RAG with websites
# 6ac8795
"""
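Build a CSV dataset of web articles: URLs are discovered through each
website's sitemap, and the text and metadata are extracted with Trafilatura.

Install the dependencies with:
$ poetry install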
"""
import time
import pandas as pd
from tqdm import tqdm
from trafilatura.sitemaps import sitemap_search
from trafilatura import fetch_url, extract, extract_metadata
def get_urls_from_sitemap(resource_url: str) -> list:
    """
    Return the URLs that ``sitemap_search`` finds for ``resource_url``.

    Args:
        resource_url: Address handed to Trafilatura's ``sitemap_search``
            (the script entry point passes a website homepage).

    Returns:
        The result of ``sitemap_search(resource_url)``, unchanged.
    """
    return sitemap_search(resource_url)
def extract_article(url: str) -> tuple:
    """
    Download a page and extract its article text and metadata with Trafilatura.

    Args:
        url: Address of the page to download.

    Returns:
        A 2-tuple ``(article, metadata)``: the result of ``extract`` (called
        with ``favor_precision=True`` and ``only_with_metadata=True``) and the
        result of ``extract_metadata``. Both elements are ``None`` when the
        download fails; ``article`` may also be ``None`` when ``extract``
        finds nothing to return.
    """
    downloaded = fetch_url(url)
    if downloaded is None:
        # fetch_url reports a failed download as None; there is nothing to
        # parse, so signal the failure to the caller instead of passing None
        # on to the extractors.
        return None, None

    article = extract(downloaded, favor_precision=True, only_with_metadata=True)
    metadata = extract_metadata(downloaded)
    return article, metadata
def create_dataset(list_of_websites: list) -> pd.DataFrame:
    """
    Build a pandas DataFrame of URLs, article texts and article metadata.

    For every website, the URLs returned by ``get_urls_from_sitemap`` are
    downloaded one at a time through ``extract_article``, pausing 0.5 s after
    each request.

    Args:
        list_of_websites: Website addresses whose sitemaps should be crawled.

    Returns:
        A DataFrame with the columns ``url``, ``article``, ``title``,
        ``description``, ``author`` and ``date``. Duplicate rows and rows
        containing any missing value are removed.
    """
    columns = ["url", "article", "title", "description", "author", "date"]
    data = []
    for website in tqdm(list_of_websites, desc="Websites"):
        urls = get_urls_from_sitemap(website)
        for url in tqdm(urls, desc="URLs"):
            article, metadata = extract_article(url)
            # Throttle after every request, including the ones that failed.
            time.sleep(0.5)

            if metadata is None:
                # No metadata could be extracted (e.g. failed download).
                # Accessing metadata.title would raise AttributeError and
                # abort the whole crawl; such a row would be removed by
                # dropna() below anyway, so skip it.
                continue

            data.append(
                {
                    "url": url,
                    "article": article,
                    "title": metadata.title,
                    "description": metadata.description,
                    "author": metadata.author,
                    "date": metadata.date,
                }
            )

    # Passing the columns explicitly keeps the schema stable even when no
    # article was collected.
    df = pd.DataFrame(data, columns=columns)
    df = df.drop_duplicates()
    df = df.dropna()
    return df
if __name__ == "__main__":
    # Local import: only the script entry point needs it.
    from pathlib import Path

    # Websites whose sitemaps are crawled to build the dataset.
    list_of_websites = [
        "https://www.diariodiunanalista.it/",
    ]
    df = create_dataset(list_of_websites)

    output_path = Path("./data/articles.csv")
    # to_csv does not create missing directories; make sure ./data exists so
    # the crawl result is not lost to an OSError at the very last step.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)