MaxGit32 committed on
Commit
dc5c877
·
1 Parent(s): 787205f

Upload llm-version2.py

Browse files
Files changed (1) hide show
  1. llm-version2.py +98 -0
llm-version2.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
3
+ from langchain.vectorstores import FAISS
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.document_loaders import DirectoryLoader, PyPDFLoader
6
+ import os
7
+ from PyPDF2 import PdfReader
8
+ from langchain.chains import RetrievalQAWithSourcesChain
9
+ from langchain.memory import ConversationBufferMemory
10
+ from langchain.chains import ConversationalRetrievalChain
11
+ #from htmlTemplates import css, bot_template, user_template
12
+ from langchain.llms import HuggingFaceHub
13
+ from dotenv import load_dotenv
14
+ ###########
15
+ #pip install faiss-cpu
16
+ #pip install langchain
17
+ #pip install pypdf
18
+ #pip install tiktoken
19
+ #pip install InstructorEmbedding
20
+ ###############
21
+
22
+ # PDF in String umwandeln
23
def get_pdf_text(folder_path):
    """Concatenate the extracted text of every PDF located directly in a folder.

    Only regular files whose name ends in ".pdf" (case-insensitive) are read;
    sub-directories are not descended into.

    Args:
        folder_path: Path of the directory to scan.

    Returns:
        The text of all pages of all matching PDFs, joined without any
        separator. An empty string when the folder contains no PDF files.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (e.g. it does not exist).
    """
    page_texts = []
    # os.listdir() yields names in arbitrary order; sorting makes the
    # concatenated text (and therefore the derived chunks) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)

        # Skip anything that is not a regular file with a ".pdf" extension.
        if not (os.path.isfile(filepath) and filename.lower().endswith(".pdf")):
            continue

        for page in PdfReader(filepath).pages:
            # A page without extractable text contributes nothing instead of
            # breaking the concatenation.
            page_texts.append(page.extract_text() or "")

    # Single join instead of repeated string "+=" inside the loops.
    return "".join(page_texts)
37
+
38
+ #Chunks erstellen
39
def get_text_chunks(text):
    """Split a text into chunks using a newline-based character splitter.

    The splitter is configured with a newline separator, ``chunk_size=1000``,
    ``chunk_overlap=200`` and ``len`` as the length function.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    # Configuration of the splitter, kept in one place.
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
49
+
50
+ # nur zum Anlegen des lokalen Verzeichnisses "Store" und speichern der Vektor-Datenbank
51
def create_vectorstore_and_store(
    text_chunks,
    save_directory="Store",
    model_name="hkunlp/instructor-xl",
):
    """Embed text chunks into a FAISS index and save it to a local directory.

    Args:
        text_chunks: The texts to embed and index.
        save_directory: Directory the index is written to. Defaults to
            "Store", the location this function previously hard-coded.
        model_name: Name of the instructor embedding model. Defaults to the
            previously hard-coded "hkunlp/instructor-xl".

    Returns:
        None. The function is used for its side effect of writing the index.
    """
    embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)

    # Build the FAISS index from the chunks.
    vectorstore_db = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

    # TODO: after storing, the PDF directory should be cleared or its files
    # moved away; the original note gives the next upload as the reason
    # (presumably to avoid indexing the same files again - confirm).

    # Persist the vector database locally.
    vectorstore_db.save_local(save_directory)
    print(vectorstore_db)
    return None
64
+
65
+ ########
66
+
67
def get_vectorstore(save_directory="Store", model_name="hkunlp/instructor-xl"):
    """Load the locally saved FAISS vector database.

    Args:
        save_directory: Directory the index is read from. Defaults to
            "Store", the location this function previously hard-coded.
        model_name: Name of the instructor embedding model used for queries.
            Defaults to the previously hard-coded "hkunlp/instructor-xl".
            NOTE(review): this should presumably match the model the index
            was built with - confirm against the ingestion call.

    Returns:
        The object returned by ``FAISS.load_local``.
    """
    embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
    # NOTE(review): the FAISS store is reportedly persisted with pickle, and
    # newer langchain releases require an explicit
    # allow_dangerous_deserialization=True here. Only load stores from a
    # trusted location and confirm the flag against the installed version.
    return FAISS.load_local(save_directory, embeddings)
73
+
74
+
75
def main():
    """Run a sample retrieval against the locally stored vector database.

    Loads environment variables, opens the saved FAISS store as a retriever,
    queries it with a fixed German sample question and prints the content of
    the first hit (or a short notice when nothing is returned).
    """
    load_dotenv()

    # Ingestion path - enable when new PDFs arrive. The PDF parsing and
    # chunking used to run on every start even though their result was
    # discarded, so they now live with the (still disabled) store call.
    # text_chunks = get_text_chunks(get_pdf_text('./PDFs'))
    # create_vectorstore_and_store(text_chunks)

    # TODO: wire up the conversation chain once get_conversation_chain exists.
    # conversation = get_conversation_chain(get_vectorstore())

    retriever = get_vectorstore().as_retriever()
    retrieved_docs = retriever.invoke("Was macht man im Katastrophenfall?")

    # Guard against an empty result instead of failing with an IndexError.
    if not retrieved_docs:
        print("Keine passenden Dokumente gefunden.")
        return
    print(retrieved_docs[0].page_content)

    # Quick check whether the vector database is populated:
    # print(get_vectorstore().similarity_search_with_score("stelle"))
93
+
94
+
95
+
96
+
97
# Run the retrieval demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()