EddyGiusepe committed on
Commit
b4e87ef
1 Parent(s): 5a7b133

Duas funções: uma cria o DB e a outra exclui o DB

Browse files
Files changed (1) hide show
  1. functions.py +67 -0
functions.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Data Scientist.: Dr. Eddy Giusepe Chirinos Isidro
4
+
5
+ Execução deste script
6
+ =====================
7
+ $ python functions.py
8
+ """
9
+ from langchain_community.document_loaders import PyPDFLoader
10
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
11
+ from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings,
12
+ )
13
+ from langchain_chroma import Chroma
14
+
15
+ import warnings
16
+ warnings.filterwarnings('ignore')
17
+
18
+ import shutil
19
+ import os
20
+
21
+
22
+ # Criando o Database:
23
def create_db(
    pdf_path: str = "files/CHATGPT Granularity_clustering.pdf",
    persist_directory: str = "./chroma_db",
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
    model_name: str = "all-MiniLM-L6-v2",
    device: str = "cpu",
) -> "Chroma":
    """Build a persisted Chroma vector store from a PDF file.

    The PDF is loaded page by page, split into overlapping text chunks,
    embedded with a sentence-transformer model and written to a Chroma
    database under ``persist_directory``. Called with no arguments, it
    uses the same file, directory, chunking and model settings that were
    previously hard-coded.

    Args:
        pdf_path: Path of the PDF to index.
        persist_directory: Directory where Chroma persists the database.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by consecutive chunks.
        model_name: Sentence-transformer model used for the embeddings.
        device: Device passed to the embedding model (e.g. ``"cpu"``).

    Returns:
        The ``Chroma`` store created by ``Chroma.from_documents``.
    """
    pages = PyPDFLoader(pdf_path).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_documents(pages)

    # Explicit, deterministic IDs: "1", "2", ... one per chunk, in order.
    ids = [str(position) for position in range(1, len(chunks) + 1)]

    # Open-source embedding function.
    embedding_function = SentenceTransformerEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
    )

    # Create the Chroma database with the explicit IDs and hand it back so
    # callers can query it without reopening the persisted directory.
    return Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory=persist_directory,
        ids=ids,
    )
52
+
53
+
54
+ # Excluindo o banco de dados (Database):
55
def delete_persisted_db(persist_directory: str = "chroma_db") -> None:
    """Delete the persisted database directory and everything inside it.

    The removal is attempted directly instead of first listing the
    current directory, so the directory cannot disappear between a check
    and the delete.

    Args:
        persist_directory: Directory to remove. Defaults to ``chroma_db``
            relative to the current working directory.

    Raises:
        FileNotFoundError: If ``persist_directory`` does not exist.
    """
    try:
        shutil.rmtree(persist_directory)
    except FileNotFoundError as err:
        raise FileNotFoundError("Banco de dados não encontrado.") from err
    print("Banco de dados excluído e seu conteúdo.")
61
+
62
+
63
+
64
# Smoke test: build the database when this file is run as a script.
if __name__ == "__main__":
    create_db()
67
+