Irakoze committed on
Commit
779eb61
1 Parent(s): 57c1b1a

Upload admin.py

Browse files

added admin file to embed data

Files changed (1) hide show
  1. admin.py +132 -0
admin.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ import logging
4
+ from dotenv import load_dotenv
5
+ from langchain.prompts import PromptTemplate
6
+ from langchain_community.vectorstores import Qdrant
7
+ from langchain.chains import RetrievalQA
8
+ from langchain_groq import ChatGroq
9
+ from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
10
+ from langchain_experimental.text_splitter import SemanticChunker
11
+ from qdrant_client import QdrantClient
12
+ from qdrant_client.http import models as rest
13
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
14
+ import gradio as gr
15
+
16
# Set up logging: errors are appended to app.log with timestamps.
logging.basicConfig(filename='app.log', level=logging.ERROR, format='%(asctime)s %(levelname)s %(message)s')
load_dotenv()

# Environment variables (loaded from .env via python-dotenv).
api_key = os.getenv('API_KEY1')           # Qdrant API key
GROQ_API_KEY = os.getenv("GROQ_API_KEY")  # picked up by ChatGroq from the environment
qdurl = os.getenv("QDURL")                # Qdrant server URL

# Initialize Qdrant client and verify connectivity up front.
try:
    client = QdrantClient(
        url=qdurl,
        port=6333,
        verify=False,  # NOTE(review): TLS verification disabled — confirm this is intentional
        api_key=api_key,
    )
    collections = client.get_collections()
except Exception as e:
    # Bug fix: the original did print("An error occurred: %s", e), which
    # passed the %-placeholder as a separate argument and printed the
    # literal "%s". Log it properly (logging is already configured above)
    # and show a formatted message.
    logging.error("Qdrant connection failed: %s", e)
    print(f"An error occurred: {e}")
    if "server engine not running" in str(e).lower():
        print("The database engine is not running. Please check the server status.")
    # Bug fix: originally only the "engine not running" case exited, so any
    # other connection failure fell through and crashed later with a
    # NameError on `client`. Exit on every connection failure instead.
    exit()

print("Database loaded")

# Initialize embeddings and the vector store backed by the "RR2" collection.
hf = FastEmbedEmbeddings(model_name="nomic-ai/nomic-embed-text-v1.5-Q")

db = Qdrant(
    client=client,
    embeddings=hf,
    collection_name="RR2"
)

load_vector_store = db
# Retrieve the top-3 most similar chunks per query.
retriever = load_vector_store.as_retriever(search_kwargs={"k": 3})
llm = ChatGroq(temperature=0, model_name="llama3-8b-8192")
54
+
55
# Collection Management Functions
async def create_collection(url, port, collection_name, vector_size):
    """(Re)create a Qdrant collection configured for cosine-distance vectors.

    Failures are reported as a human-readable status string (shown in the
    Gradio Result textbox) instead of raising.
    """
    try:
        # A dedicated client for the admin operation, pointed at the
        # URL/port supplied through the UI.
        admin_client = QdrantClient(
            url=url,
            port=int(port),
            api_key=api_key,
            verify=False,
        )
        vector_params = rest.VectorParams(
            size=int(vector_size),
            distance=rest.Distance.COSINE,
        )
        admin_client.recreate_collection(
            collection_name=collection_name,
            vectors_config=vector_params,
        )
    except Exception as exc:
        return f"Failed to create collection: {str(exc)}"
    return "Collection created successfully."
74
+
75
# Data Processing Function
async def data_ingest_function(data_path, url, collection_name):
    """Load PDFs from *data_path*, chunk them semantically, and index them
    into the named Qdrant collection at *url*.

    The blocking work (PDF parsing, embedding, upserting) runs in the
    default executor so the event loop stays responsive. Returns a status
    string for the Gradio UI.
    """
    # Fix: asyncio.get_event_loop() is deprecated inside a coroutine;
    # get_running_loop() is the correct, guaranteed-present loop here.
    loop = asyncio.get_running_loop()
    try:
        # Fix: build the embedding model once. The original constructed a
        # second identical FastEmbedEmbeddings inside load_documents while
        # indexing silently used the module-level one.
        embeddings = FastEmbedEmbeddings(model_name="nomic-ai/nomic-embed-text-v1.5-Q")

        def load_documents():
            # Blocking: reads every PDF in the directory and splits it with
            # a semantic chunker (interquartile breakpoint strategy).
            loader = PyPDFDirectoryLoader(data_path)
            splitter = SemanticChunker(embeddings, breakpoint_threshold_type="interquartile")
            return loader.load_and_split(text_splitter=splitter)

        texts = await loop.run_in_executor(None, load_documents)
        print(f"Processed {len(texts)} text chunks")

        def index_documents():
            # Blocking: embeds the chunks and upserts them into Qdrant.
            return Qdrant.from_documents(
                texts,
                embeddings,
                url=url,
                api_key=api_key,
                collection_name=collection_name,
            )

        await loop.run_in_executor(None, index_documents)
        return "Data processing and indexing completed successfully."
    except Exception as e:
        return f"Failed to process data: {str(e)}"
103
+
104
# Gradio Admin Interface
# One Blocks app exposing two admin tasks: (re)creating a Qdrant collection
# and ingesting a folder of PDFs into a collection. Both buttons report
# their outcome into a "Result" textbox.
with gr.Blocks(theme="soft", title="Admin LLM System", head="Admin for LARGE LANGUAGE MODEL SYSTEM") as admin:
    with gr.Tab("Collection Management"):
        # Collection-creation controls.
        with gr.Row():
            url_input = gr.Textbox(label="Qdrant URL", value="")
            port_input = gr.Number(label="Port", value=6333)
            collection_name_input = gr.Textbox(label="Collection Name", value="RR2")
            # 768 presumably matches the embedding model's output size — TODO confirm
            vector_size_input = gr.Number(label="Vector Size", value=768)
        create_collection_btn = gr.Button("Create Collection")
        create_collection_btn.click(
            create_collection,
            inputs=[url_input, port_input, collection_name_input, vector_size_input],
            outputs=gr.Textbox(label="Result")
        )

        # PDF-ingestion controls. NOTE(review): these share the
        # "Collection Management" tab rather than having their own — confirm
        # this layout is intended.
        with gr.Row():
            data_path_input = gr.Textbox(label="Data Folder Path")
            url_processing_input = gr.Textbox(label="Qdrant URL for Processing", value="")
            collection_name_processing_input = gr.Textbox(label="Collection Name for Processing", value="RR2")
        start_processing_btn = gr.Button("Start Processing")
        start_processing_btn.click(
            data_ingest_function,
            inputs=[data_path_input, url_processing_input, collection_name_processing_input],
            outputs=gr.Textbox(label="Result")
        )
129
+
130
# Launch Interface
if __name__ == "__main__":
    # Bind on all interfaces (0.0.0.0:7860) so the admin UI is reachable
    # from outside the host/container; public Gradio sharing stays off.
    admin.launch(server_name="0.0.0.0", server_port=7860, share=False)