bupa1018 committed on
Commit
151e771
·
1 Parent(s): 835b679

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -30
app.py CHANGED
@@ -90,7 +90,28 @@ def download_gitlab_repo():
90
  )
91
  print("Upload complete")
92
 
93
- def process_directory(directory):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  all_texts = []
95
  file_references = []
96
 
@@ -105,7 +126,7 @@ def process_directory(directory):
105
  return all_texts, file_references
106
  else:
107
  zip_file_path = os.path.join(directory, zip_files[0])
108
-
109
  # Create a temporary directory for the zip file
110
  with tempfile.TemporaryDirectory() as tmpdirname:
111
  # Unzip the file into the temporary directory
@@ -113,36 +134,39 @@ def process_directory(directory):
113
  zip_ref.extractall(tmpdirname)
114
  print(f"Extracted {zip_file_path} to {tmpdirname}")
115
 
116
- # Process the files in the temporary directory
117
- for root, _, files in os.walk(tmpdirname):
118
- for file in files:
119
- print(f"Any files??: {file}")
120
- file_path = os.path.join(root, file)
121
- file_ext = os.path.splitext(file_path)[1]
122
-
123
- if os.path.getsize(file_path) == 0:
124
- print(f"Skipping an empty file: {file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  continue
126
 
127
- with open(file_path, 'rb') as f:
128
- if file_ext in ['.rst', '.md', '.txt', '.html', '.json', '.yaml', '.py']:
129
- text = f.read().decode('utf-8')
130
- print(f"Extracted text from {file_path}:\n{text[:200]}...\n")
131
- elif file_ext in ['.svg']:
132
- text = f"SVG file content from {file_path}"
133
- elif file_ext in ['.png', '.ico']:
134
- text = f"Image metadata from {file_path}"
135
- else:
136
- continue
137
-
138
- all_texts.append(text)
139
- file_references.append(file_path)
140
-
141
- print(f"All extracted texts:\n{all_texts}")
142
- return all_texts, file_references
143
-
144
-
145
 
 
146
 
147
  # Split text into chunks
148
  def split_into_chunks(texts, references, chunk_size, chunk_overlap):
@@ -202,7 +226,9 @@ def initialize():
202
  global vectorstore, chunks, llm
203
 
204
  download_gitlab_repo()
205
- all_texts, file_references = process_directory(REPOSITORY_DIRECTORY)
 
 
206
  chunks = split_into_chunks(all_texts, file_references, CHUNK_SIZE, CHUNK_OVERLAP)
207
  vectorstore = setup_chroma(chunks, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY)
208
  llm = setup_llm(LLM_MODEL_NAME, LLM_TEMPERATURE, GROQ_API_KEY)
 
90
  )
91
  print("Upload complete")
92
 
93
def get_all_files_in_folder(temp_dir, partial_paths):
    """Collect the paths of all files beneath the given sub-directories.

    Each entry of ``partial_paths`` is joined onto ``temp_dir`` and the
    resulting directory is walked recursively with ``os.walk``.

    Args:
        temp_dir: Base directory the partial paths are relative to.
        partial_paths: Iterable of directory paths relative to ``temp_dir``.
            ``None`` or an empty iterable produces an empty result.

    Returns:
        A list of file paths built as ``os.path.join(root, file)``. Each
        file appears once even when the partial paths overlap, and the
        traversal order is sorted so repeated runs give the same list.
    """
    all_files = []
    seen = set()

    for partial_path in partial_paths or ():
        target_dir = os.path.join(temp_dir, partial_path)
        print(target_dir)

        if not os.path.isdir(target_dir):
            # os.walk yields nothing for a missing directory, so a
            # misspelled path would otherwise be ignored without a trace.
            print(f"Directory not found, skipping: {target_dir}")
            continue

        for root, dirs, files in os.walk(target_dir):
            # Sorting in place makes the walk order independent of the
            # file system's directory listing order.
            dirs.sort()
            files.sort()
            print(f"Files in current directory ({root}): {files}")
            for file in files:
                full_path = os.path.join(root, file)
                # Normalise only for the duplicate check; the returned
                # path keeps the form produced by os.path.join.
                key = os.path.normpath(full_path)
                if key in seen:
                    continue
                seen.add(key)
                print(f"Processing file: {file}")
                all_files.append(full_path)

    return all_files
107
+
108
+
109
def get_file(temp_dir, file_path):
    """Return ``file_path`` joined onto ``temp_dir`` via ``os.path.join``."""
    return os.path.join(temp_dir, file_path)
112
+
113
+
114
+ def process_directory(directory, partial_paths=None, file_paths=None):
115
  all_texts = []
116
  file_references = []
117
 
 
126
  return all_texts, file_references
127
  else:
128
  zip_file_path = os.path.join(directory, zip_files[0])
129
+
130
  # Create a temporary directory for the zip file
131
  with tempfile.TemporaryDirectory() as tmpdirname:
132
  # Unzip the file into the temporary directory
 
134
  zip_ref.extractall(tmpdirname)
135
  print(f"Extracted {zip_file_path} to {tmpdirname}")
136
 
137
+ if partial_paths:
138
+ files = get_all_files_in_folder(tmpdirname, partial_paths)
139
+ else:
140
+ files = []
141
+ for root, _, files_list in os.walk(tmpdirname):
142
+ for file in files_list:
143
+ files.append(os.path.join(root, file))
144
+
145
+ if file_paths:
146
+ files += [get_file(tmpdirname, file_path) for file_path in file_paths]
147
+
148
+ for file_path in files:
149
+ file_ext = os.path.splitext(file_path)[1]
150
+
151
+ if os.path.getsize(file_path) == 0:
152
+ print(f"Skipping an empty file: {file_path}")
153
+ continue
154
+
155
+ with open(file_path, 'rb') as f:
156
+ if file_ext in ['.rst', '.md', '.txt', '.html', '.json', '.yaml', '.py']:
157
+ text = f.read().decode('utf-8')
158
+ print(f"Extracted text from {file_path}:\n{text[:200]}...\n")
159
+ elif file_ext in ['.svg']:
160
+ text = f"SVG file content from {file_path}"
161
+ elif file_ext in ['.png', '.ico']:
162
+ text = f"Image metadata from {file_path}"
163
+ else:
164
  continue
165
 
166
+ all_texts.append(text)
167
+ file_references.append(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
+ return all_texts, file_references
170
 
171
  # Split text into chunks
172
  def split_into_chunks(texts, references, chunk_size, chunk_overlap):
 
226
  global vectorstore, chunks, llm
227
 
228
  download_gitlab_repo()
229
+ partial_paths = ['kadi-apy-master/source/docs/setup/', 'kadi-apy-master/docs/source/usage/', 'kadi-apy-master/kadi_apy/lib/']
230
+ file_paths = ['kadi-apy-master/docs/source/usage/lib.rst']
231
+ all_texts, file_references = process_directory(REPOSITORY_DIRECTORY, partial_paths, file_paths)
232
  chunks = split_into_chunks(all_texts, file_references, CHUNK_SIZE, CHUNK_OVERLAP)
233
  vectorstore = setup_chroma(chunks, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY)
234
  llm = setup_llm(LLM_MODEL_NAME, LLM_TEMPERATURE, GROQ_API_KEY)