EddyGiusepe committed on
Commit
6ac8795
·
1 Parent(s): 28c581e

Usando LangChain para RAG e com Websites

Browse files
Files changed (4) hide show
  1. .gitignore +3 -0
  2. poetry.lock +0 -0
  3. pyproject.toml +26 -0
  4. src/modules/scraper.py +66 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # EddyGiusepe
2
+ venv_chatbotRAG/
3
+ .env
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "ddua-embeddings"
3
+ version = "0.1.0"
4
+ description = "This project is a chatbot application that uses LangChain, OpenAI and FAISS to talk to the blog Diario Di Un Analista.it"
5
+ authors = ["Andrea D'Agostino <[email protected]>"]
6
+ readme = "README.md"
7
+ packages = [ { include = "src", from = "." } ]
8
+
9
+ [tool.poetry.dependencies]
10
+ python = ">=3.10,<4.0"
11
+ streamlit = "^1.28.2"
12
+ langchain = "^0.0.339"
13
+ openai = "^1.3.5"
14
+ trafilatura = "^1.6.2"
15
+ python-dotenv = "^1.0.0"
16
+ pandas = "^2.1.3"
17
+ ruff = "^0.1.6"
18
+ tiktoken = "^0.5.1"
19
+ instructorembedding = "^1.0.1"
20
+ faiss-cpu = "^1.7.4"
21
+ watchdog = "^3.0.0"
22
+
23
+
24
+ [build-system]
25
+ requires = ["poetry-core"]
26
+ build-backend = "poetry.core.masonry.api"
src/modules/scraper.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+
3
+ Instalar os pacotes, assim:
4
+
5
+ $ poetry install
6
+ """
7
+ import time
8
+ import pandas as pd
9
+ from tqdm import tqdm
10
+ from trafilatura.sitemaps import sitemap_search
11
+ from trafilatura import fetch_url, extract, extract_metadata
12
+
13
+
14
def get_urls_from_sitemap(resource_url: str) -> list:
    """
    Return the URLs that Trafilatura's ``sitemap_search`` finds for a site.

    Args:
        resource_url: Address of the website whose sitemap is searched.

    Returns:
        Whatever ``sitemap_search`` yields for ``resource_url``, passed
        through unchanged.
    """
    return sitemap_search(resource_url)
20
+
21
+
22
def extract_article(url: str) -> tuple:
    """
    Download a page and extract its article text and metadata with Trafilatura.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(article, metadata)`` pair. ``article`` is the text returned by
        ``trafilatura.extract`` and ``metadata`` is the object returned by
        ``trafilatura.extract_metadata``. Either element may be ``None``;
        both are ``None`` when the download itself failed.
    """
    downloaded = fetch_url(url)
    # A failed download yields None; there is nothing to parse, so report
    # the failure explicitly instead of pushing None through the extractors.
    if downloaded is None:
        return None, None

    article = extract(downloaded, favor_precision=True, only_with_metadata=True)
    metadata = extract_metadata(downloaded)
    return article, metadata
30
+
31
+
32
def create_dataset(list_of_websites: list, delay_seconds: float = 0.5) -> pd.DataFrame:
    """
    Build a pandas DataFrame of URLs and articles scraped from websites.

    For each website, its sitemap URLs are collected with
    ``get_urls_from_sitemap`` and each URL is processed with
    ``extract_article``. URLs for which no metadata could be extracted
    (for example because the download failed) are skipped instead of
    aborting the whole crawl.

    Args:
        list_of_websites: Website addresses whose sitemaps are crawled.
        delay_seconds: Pause between consecutive page requests, in seconds.

    Returns:
        A DataFrame with the columns ``url``, ``article``, ``title``,
        ``description``, ``author`` and ``date``, with duplicate rows and
        rows containing any missing value removed.
    """
    columns = ["url", "article", "title", "description", "author", "date"]
    data = []
    for website in tqdm(list_of_websites, desc="Websites"):
        urls = get_urls_from_sitemap(website)
        for url in tqdm(urls, desc="URLs"):
            article, metadata = extract_article(url)
            # metadata is None when the page could not be downloaded or
            # parsed; reading attributes from it would raise AttributeError.
            if metadata is not None:
                data.append(
                    {
                        "url": url,
                        "article": article,
                        "title": metadata.title,
                        "description": metadata.description,
                        "author": metadata.author,
                        "date": metadata.date,
                    }
                )
            # Throttle after every request, successful or not.
            time.sleep(delay_seconds)

    # Explicit columns keep the schema stable even when nothing was scraped.
    df = pd.DataFrame(data, columns=columns)
    df = df.drop_duplicates()
    df = df.dropna()

    return df
57
+
58
+
59
if __name__ == "__main__":
    from pathlib import Path

    # Websites whose sitemaps are crawled to build the dataset.
    list_of_websites = [
        "https://www.diariodiunanalista.it/",
    ]

    df = create_dataset(list_of_websites)

    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing; otherwise the whole crawl is lost.
    output_path = Path("./data/articles.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)