bupa1018 commited on
Commit
6df5c93
·
1 Parent(s): c22947c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +256 -0
app.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Gradio app that mirrors GitLab repositories into a Hugging Face Space.

Downloads repository archives from GitLab, uploads them into a Hugging Face
Space, and (once the commented-out pipeline is re-enabled) indexes their
contents for retrieval-augmented question answering.

BUG FIX: the file previously began with a stray non-Python line
("gooodo shiet"), which made the whole module a SyntaxError.
"""
2
+
3
+
4
+
5
+ import os
6
+ import json
7
+ import gradio as gr
8
+ import zipfile
9
+ import tempfile
10
+ import requests
11
+ import urllib.parse
12
+ import io
13
+
14
+ from huggingface_hub import HfApi, login
15
+ #from PyPDF2 import PdfReader
16
+ #from langchain_huggingface import HuggingFaceEmbeddings
17
+ #from langchain_community.vectorstores import Chroma
18
+ #from langchain.text_splitter import RecursiveCharacterTextSplitter
19
+ #from langchain_groq import ChatGroq
20
+ from dotenv import load_dotenv
21
+ #from langchain.docstore.document import Document
22
+
23
# Load environment variables from .env file so the API keys read below resolve.
load_dotenv()

# Load configuration from JSON file; every key accessed below is required,
# so a missing key fails fast with a KeyError at startup.
with open('config.json') as config_file:
    config = json.load(config_file)


# Pipeline settings pulled from config.json.
PERSIST_DIRECTORY = config["persist_directory"]        # Chroma persistence dir
CHUNK_SIZE = config["chunk_size"]                      # text-splitter chunk size
CHUNK_OVERLAP = config["chunk_overlap"]                # overlap between chunks
EMBEDDING_MODEL_NAME = config["embedding_model"]       # HF embedding model id
LLM_MODEL_NAME = config["llm_model"]                   # Groq chat model name
LLM_TEMPERATURE = config["llm_temperature"]            # LLM sampling temperature
GITLAB_API_URL = config["gitlab_api_url"]              # base URL of the GitLab API
HF_SPACE_NAME = config["hf_space_name"]                # target HF Space repo id
REPOSITORY_DIRECTORY = config["repository_directory"]  # in-Space folder for archives

# Secrets come from the environment; os.environ[...] raises KeyError if unset.
# NOTE(review): the env var is spelled "HF_Token" (mixed case) — confirm this
# matches the deployment's secret name.
GROQ_API_KEY = os.environ["GROQ_API_KEY"]
HF_TOKEN = os.environ["HF_Token"]



# Authenticate against the Hugging Face Hub and create an API client used
# by upload_gitRepository() below.
login(HF_TOKEN)
api = HfApi()
48
+
49
def load_project_ids(json_file):
    """Read *json_file* and return the list stored under 'project_ids'."""
    with open(json_file, 'r') as handle:
        payload = json.load(handle)
    return payload['project_ids']
53
+
54
def upload_gitRepository():
    """Download each configured GitLab repository as a ZIP archive and upload
    it into the Hugging Face Space under REPOSITORY_DIRECTORY.

    Project ids are read from ``repository_ids.json``. Archives already
    present in the Space (by their original file name) are skipped.
    """
    project_ids = load_project_ids('repository_ids.json')

    for project_id in project_ids:
        # Project ids may be "namespace/name" paths; GitLab requires them
        # URL-encoded when used in the API path.
        encoded_project_id = urllib.parse.quote_plus(project_id)
        # Define the URL to download the repository archive
        archive_url = f"{GITLAB_API_URL}/projects/{encoded_project_id}/repository/archive.zip"

        # Download the repository archive. Fail loudly on HTTP errors instead
        # of silently uploading an error page as a ZIP.
        response = requests.get(archive_url)
        response.raise_for_status()
        archive_bytes = io.BytesIO(response.content)

        # Retrieve the original file name from the response headers
        content_disposition = response.headers.get('content-disposition')
        if content_disposition:
            filename = content_disposition.split('filename=')[-1].strip('\"')
        else:
            filename = 'archive.zip'  # Fallback to a default name if not found

        # Check if the file already exists in the repository.
        # BUG FIX: the target path previously used a literal placeholder
        # instead of the downloaded file's name, so the existence check could
        # never match and every archive was re-uploaded.
        existing_files = api.list_repo_files(repo_id=HF_SPACE_NAME, repo_type='space')
        target_path = f"{REPOSITORY_DIRECTORY}/{filename}"

        if target_path in existing_files:
            print(f"File '{target_path}' already exists in the repository. Skipping upload...")
            continue

        # Upload the ZIP file to the new folder in the Hugging Face space repository
        api.upload_file(
            path_or_fileobj=archive_bytes,
            path_in_repo=target_path,
            repo_id=HF_SPACE_NAME,
            repo_type='space'
        )

        # BUG FIX: the original print referenced an undefined name DIRECTORY
        # (NameError); use the configured REPOSITORY_DIRECTORY instead.
        print(f"Repository '{project_id}' downloaded and uploaded to Hugging Face space '{HF_SPACE_NAME}' in folder '{REPOSITORY_DIRECTORY}' with original file name '{filename}'")
90
+
91
def process_directory(directory):
    """Collect readable text from every supported file under *directory*.

    ZIP archives found directly inside *directory* are extracted into a
    temporary location and processed recursively. Returns a pair
    ``(texts, references)`` of parallel lists: the extracted text and the
    path each text came from.
    """
    collected_texts = []
    collected_refs = []

    if not os.path.exists(directory):
        raise ValueError(f"Directory {directory} does not exist.")

    # First pass: expand ZIP archives sitting at the top level and recurse
    # into their extracted contents.
    archives = [entry for entry in os.listdir(directory) if entry.endswith('.zip')]

    if not archives:
        print("No zip files found in the directory.")
    else:
        for archive_name in archives:
            archive_path = os.path.join(directory, archive_name)

            # Extract each archive into its own throwaway directory.
            with tempfile.TemporaryDirectory() as scratch_dir:
                with zipfile.ZipFile(archive_path, 'r') as archive:
                    archive.extractall(scratch_dir)

                inner_texts, inner_refs = process_directory(scratch_dir)
                collected_texts.extend(inner_texts)
                collected_refs.extend(inner_refs)

    # Second pass: walk the tree and pull text out of each supported file.
    for root, _, filenames in os.walk(directory):
        for name in filenames:
            path = os.path.join(root, name)
            extension = os.path.splitext(path)[1]

            if os.path.getsize(path) == 0:
                print(f"Skipping an empty file: {path}")
                continue

            with open(path, 'rb') as handle:
                if extension in ['.rst', '.md', '.txt', '.html', '.json', '.yaml']:
                    content = handle.read().decode('utf-8')
                elif extension == '.pdf':
                    # NOTE(review): PdfReader's import is commented out at the
                    # top of the file, so this branch raises NameError if a
                    # PDF is encountered — confirm before enabling PDFs.
                    reader = PdfReader(handle)
                    content = ""
                    for page in reader.pages:
                        content += page.extract_text()
                elif extension in ['.svg']:
                    content = f"SVG file content from {path}"
                elif extension in ['.png', '.ico']:
                    content = f"Image metadata from {path}"
                else:
                    continue

            collected_texts.append(content)
            collected_refs.append(path)

    return collected_texts, collected_refs
146
+
147
+
148
+
149
+
150
+ # Split text into chunks
151
# Split text into chunks
def split_into_chunks(texts, references, chunk_size, chunk_overlap):
    """Split each text into overlapping chunks, tagging every chunk with the
    source path it came from (stored under metadata['source'])."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = []

    for content, source in zip(texts, references):
        for piece in splitter.split_text(content):
            documents.append(Document(page_content=piece, metadata={"source": source}))

    return documents
159
+
160
+ # Setup Chroma
161
# Setup Chroma
def setup_chroma(chunks, model_name="sentence-transformers/all-mpnet-base-v2", persist_directory="chroma_data"):
    """Build a persistent Chroma vector store over *chunks* using a
    HuggingFace sentence-embedding model."""
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    return Chroma.from_documents(chunks, embedding=embedder, persist_directory=persist_directory)
165
+
166
+ # Setup LLM
167
# Setup LLM
def setup_llm(model_name, temperature, api_key):
    """Create the Groq chat model used to answer queries."""
    return ChatGroq(model=model_name, temperature=temperature, api_key=api_key)
170
+
171
def query_chroma(vectorstore, query, k):
    """Return the top-*k* most similar chunks for *query* as
    (page_content, source) pairs, logging each hit to stdout."""
    hits = vectorstore.similarity_search(query, k=k)
    chunks_with_references = [(hit.page_content, hit.metadata["source"]) for hit in hits]
    # Print the chosen chunks and their sources to the console
    print("\nChosen chunks and their sources for the query:")
    for chunk, source in chunks_with_references:
        print(f"Source: {source}\nChunk: {chunk}\n")
    print("-" * 50)
    return chunks_with_references
180
+
181
def rag_workflow(query):
    """Run the full RAG loop for *query*: retrieve the top chunks, build a
    context-restricted prompt, and ask the LLM.

    Relies on the module-level ``vectorstore`` and ``llm`` set by
    ``initialize()``. Returns ``(answer_text, references_text)``.
    """
    retrieved = query_chroma(vectorstore, query, k=5)
    context = "\n\n".join([passage for passage, _ in retrieved])
    references = "\n".join([f"[{i+1}] {ref}" for i, (_, ref) in enumerate(retrieved)])
    print(f"Context for the query:\n{context}\n")
    print(f"References for the query:\n{references}\n")
    prompt = f"You are an intelligent AI assistant who is very good in giving answers for anything asked or instructed by the user. Provide a clear and concise answer based only on the pieces of retrieved context. You must follow this very strictly, do not use anything else other than the retrieved context. If no related Information is found from the context, reply that you do not know. \n\nContext:\n{context}\n\nQuery: {query}"

    answer = llm.invoke(prompt)
    return answer.content, references
192
+
193
+
194
def initialize():
    """Fetch the configured repositories, extract their text, and (once the
    commented-out steps are re-enabled) build the retrieval pipeline.

    Intended to set the module-level ``vectorstore``, ``chunks`` and ``llm``
    globals used by the chat handlers.
    """
    global vectorstore, chunks, llm

    upload_gitRepository()
    texts, refs = process_directory(REPOSITORY_DIRECTORY)

    # Dump every file's extracted content to the console for inspection.
    for index, content in enumerate(texts):
        print(f"Content of file {index+1}:\n")
        print(content)
        print("\n" + "-"*40 + "\n")

    # RAG pipeline currently disabled; re-enable together with the
    # commented-out imports at the top of the file.
    #chunks = split_into_chunks(all_texts, file_references, CHUNK_SIZE, CHUNK_OVERLAP)
    #vectorstore = setup_chroma(chunks, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY)
    #llm = setup_llm(LLM_MODEL_NAME, LLM_TEMPERATURE, GROQ_API_KEY)
208
+
209
+ # Gradio utils
210
# Gradio utils
def check_input_text(text):
    """Validate the chat input: warn in the UI and raise on empty input,
    otherwise return True."""
    if text:
        return True
    gr.Warning("Please input a question.")
    raise TypeError
215
+
216
def add_text(history, text):
    """Append the user's message (with no bot reply yet) to the chat history
    and clear the input textbox."""
    updated = history + [(text, None)]
    yield updated, ""
219
+
220
def bot_kadi(history):
    """Answer the latest user message in *history* via the RAG workflow and
    yield the updated history plus HTML-formatted source references.

    NOTE(review): query_chroma is called a second time here for the same
    query (rag_workflow already retrieved the chunks), doubling retrieval
    work — consider returning the docs from rag_workflow instead.
    """
    user_query = history[-1][0]
    response, references = rag_workflow(user_query)
    # Replace the pending (query, None) entry with the real answer.
    history[-1] = (user_query, response)

    # Format references for display with text passages
    formatted_references = ""
    docs = query_chroma(vectorstore, user_query, k=5)
    for i, (doc, ref) in enumerate(docs):
        # Each reference renders as a collapsible card; the inline onclick
        # toggles the hidden source-text block. {{ and }} are f-string
        # escapes for literal braces in the JavaScript.
        # NOTE(review): the button label {{'&#9654; show source text'}}
        # renders literally as {'&#9654; show source text'} — confirm
        # whether the braces are intended to appear in the UI.
        formatted_references += f"""
        <div style="border: 1px solid #ddd; padding: 10px; margin-bottom: 10px; border-radius: 5px;">
            <h3 style="margin-top: 0;">Reference {i+1}</h3>
            <p><strong>Source:</strong> {ref}</p>
            <button onclick="var elem = document.getElementById('text-{i}'); var button = this; if (elem.style.display === 'block') {{ elem.style.display = 'none'; button.innerHTML = '&#9654; show source text'; }} else {{ elem.style.display = 'block'; button.innerHTML = '&#9660; hide source text'; }}">{{'&#9654; show source text'}}</button>
            <div id="text-{i}" style="display: none;">
                <p><strong>Text:</strong> {doc}</p>
            </div>
        </div>
        """

    yield history, formatted_references
241
+
242
def main():
    """Build and launch the Gradio demo UI."""

    def greet(name, intensity):
        # Repeat the exclamation mark *intensity* times after the greeting.
        return "Hello, " + name + "!" * int(intensity)

    demo = gr.Interface(
        fn=greet,
        inputs=["text", "slider"],
        outputs=["text"],
    )

    # BUG FIX: `demo.launch` was referenced without being called, so the
    # server never started; it must be invoked.
    demo.launch()
254
+
255
# Run setup and start the UI only when executed as a script, so importing
# this module (e.g. from tests or tooling) has no side effects.
if __name__ == "__main__":
    initialize()
    main()