Spaces:

bupa1018
/

KadiAPY_Coding_Assistant

Sleeping

App Files Community

bupa1018 committed on Feb 27

Commit

151e771

1 Parent(s): 835b679

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -30

app.py CHANGED Viewed

@@ -90,7 +90,28 @@ def download_gitlab_repo():
         )
         print("Upload complete")
-def process_directory(directory):
     all_texts = []
     file_references = []
@@ -105,7 +126,7 @@ def process_directory(directory):
         return all_texts, file_references
     else:
         zip_file_path = os.path.join(directory, zip_files[0])
         # Create a temporary directory for the zip file
         with tempfile.TemporaryDirectory() as tmpdirname:
             # Unzip the file into the temporary directory
@@ -113,36 +134,39 @@ def process_directory(directory):
                 zip_ref.extractall(tmpdirname)
             print(f"Extracted {zip_file_path} to {tmpdirname}")
-            # Process the files in the temporary directory
-            for root, _, files in os.walk(tmpdirname):
-                for file in files:
-                    print(f"Any files??: {file}")
-                    file_path = os.path.join(root, file)
-                    file_ext = os.path.splitext(file_path)[1]
-                    if os.path.getsize(file_path) == 0:
-                        print(f"Skipping an empty file: {file_path}")
                         continue
-                    with open(file_path, 'rb') as f:
-                        if file_ext in ['.rst', '.md', '.txt', '.html', '.json', '.yaml', '.py']:
-                            text = f.read().decode('utf-8')
-                            print(f"Extracted text from {file_path}:\n{text[:200]}...\n")
-                        elif file_ext in ['.svg']:
-                            text = f"SVG file content from {file_path}"
-                        elif file_ext in ['.png', '.ico']:
-                            text = f"Image metadata from {file_path}"
-                        else:
-                            continue
-                        all_texts.append(text)
-                        file_references.append(file_path)
-    print(f"All extracted texts:\n{all_texts}")
-    return all_texts, file_references
 # Split text into chunks
 def split_into_chunks(texts, references, chunk_size, chunk_overlap):
@@ -202,7 +226,9 @@ def initialize():
     global vectorstore, chunks, llm
     download_gitlab_repo()
-    all_texts, file_references = process_directory(REPOSITORY_DIRECTORY)
     chunks = split_into_chunks(all_texts, file_references, CHUNK_SIZE, CHUNK_OVERLAP)
     vectorstore = setup_chroma(chunks, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY)
     llm = setup_llm(LLM_MODEL_NAME, LLM_TEMPERATURE, GROQ_API_KEY)

         )
         print("Upload complete")
+def get_all_files_in_folder(temp_dir, partial_paths):  # Return the paths of all files found by walking temp_dir/<partial_path> for each entry in partial_paths.
+    all_files = []  # accumulated file paths, in walk order across all partial paths
+    for partial_path in partial_paths:
+        target_dir = os.path.join(temp_dir, partial_path)  # directory to scan for this entry
+        print(target_dir)  # debug output: the directory about to be walked
+        for root, _, files in os.walk(target_dir):  # os.walk descends into subdirectories, so nested files are included
+            print(f"Files in current directory ({root}): {files}")
+            for file in files:
+                print(f"Processing file: {file}")
+                all_files.append(os.path.join(root, file))  # store root joined with the file name, not the bare name
+    return all_files
+def get_file(temp_dir, file_path):  # Return file_path joined onto temp_dir; the resulting path is not checked for existence here.
+    full_path = os.path.join(temp_dir, file_path)
+    return full_path
+def process_directory(directory, partial_paths=None, file_paths=None):
     all_texts = []
     file_references = []
         return all_texts, file_references
     else:
         zip_file_path = os.path.join(directory, zip_files[0])
         # Create a temporary directory for the zip file
         with tempfile.TemporaryDirectory() as tmpdirname:
             # Unzip the file into the temporary directory
                 zip_ref.extractall(tmpdirname)
             print(f"Extracted {zip_file_path} to {tmpdirname}")
+            if partial_paths:
+                files = get_all_files_in_folder(tmpdirname, partial_paths)
+            else:
+                files = []
+                for root, _, files_list in os.walk(tmpdirname):
+                    for file in files_list:
+                        files.append(os.path.join(root, file))
+            if file_paths:
+                files += [get_file(tmpdirname, file_path) for file_path in file_paths]
+            for file_path in files:
+                file_ext = os.path.splitext(file_path)[1]
+                if os.path.getsize(file_path) == 0:
+                    print(f"Skipping an empty file: {file_path}")
+                    continue
+                with open(file_path, 'rb') as f:
+                    if file_ext in ['.rst', '.md', '.txt', '.html', '.json', '.yaml', '.py']:
+                        text = f.read().decode('utf-8')
+                        print(f"Extracted text from {file_path}:\n{text[:200]}...\n")
+                    elif file_ext in ['.svg']:
+                        text = f"SVG file content from {file_path}"
+                    elif file_ext in ['.png', '.ico']:
+                        text = f"Image metadata from {file_path}"
+                    else:
                         continue
+                    all_texts.append(text)
+                    file_references.append(file_path)
+    return all_texts, file_references
 # Split text into chunks
 def split_into_chunks(texts, references, chunk_size, chunk_overlap):
     global vectorstore, chunks, llm
     download_gitlab_repo()
+    partial_paths = ['kadi-apy-master/source/docs/setup/', 'kadi-apy-master/docs/source/usage/', 'kadi-apy-master/kadi_apy/lib/']
+    file_paths = ['kadi-apy-master/docs/source/usage/lib.rst']
+    all_texts, file_references = process_directory(REPOSITORY_DIRECTORY, partial_paths, file_paths)
     chunks = split_into_chunks(all_texts, file_references, CHUNK_SIZE, CHUNK_OVERLAP)
     vectorstore = setup_chroma(chunks, EMBEDDING_MODEL_NAME, PERSIST_DIRECTORY)
     llm = setup_llm(LLM_MODEL_NAME, LLM_TEMPERATURE, GROQ_API_KEY)