Spaces:

aiswaryasankar
/

entelligence.ai

Sleeping

App Files Files Community

Aiswarya Sankar committed on Sep 27, 2023

Commit

785e02e

1 Parent(s): 383468c

Update app

Browse files

Files changed (1) hide show

app.py +53 -53

app.py CHANGED Viewed

@@ -113,54 +113,62 @@ def index_repo(textbox: str, dropdown: str) -> Response:
     root_dir = './' + pathName
     activeloop_username = "aiswaryas"
-    dataset_path = f"hub://{activeloop_username}/" + "dummy"
     invalid_dataset_path = True
-    # try:
-    #     try:
-    #         db = DeepLake(dataset_path=dataset_path,
-    #                 embedding_function=embeddings,
-    #                 token=os.environ['ACTIVELOOP_TOKEN'],
-    #                 read_only=True,
-    #                 num_workers=12,
-    #                 runtime = {"tensor_db": True}
-    #             )
-    #     except Exception as e:
-    #         print("Failed to read: " + str(e))
-    #         if "scheduled for deletion" in str(e):
-    #             dataset_path = f"hub://{activeloop_username}/" + pathName + str(random.randint(1,100))
-    #             invalid_dataset_path = True
-    #     if invalid_dataset_path or db is None or len(db.vectorstore.dataset) == 0:
-    #         print("Dataset doesn't exist, fetching data")
     try:
-        docs = []
-        for dirpath, dirnames, filenames in os.walk(root_dir):
-            for file in filenames:
-                print(file)
-                try:
-                    loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
-                    docs.extend(loader.load_and_split())
-                except Exception as e:
-                    print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
-                    pass
-        activeloop_username = "aiswaryas"
-        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-        texts = text_splitter.split_documents(docs)
-        db = DeepLake(dataset_path=dataset_path,
-                embedding_function=embeddings,
-                token=os.environ['ACTIVELOOP_TOKEN'],
-                read_only=False,
-                num_workers=12,
-                runtime = {"tensor_db": True}
-        )
-        # Do this in chunks to avoid hitting the ratelimit immediately
-        for i in range(0, len(texts), 500):
-            print("Adding documents " + str(i))
-            db.add_documents(texts[i:i+500])
-            time.sleep(.5)
     except Exception as e:
         return Response(
@@ -170,14 +178,6 @@ def index_repo(textbox: str, dropdown: str) -> Response:
             stdout="",
         )
-    # except Exception as e:
-    #     return Response(
-    #         result= "Failed to index github repo",
-    #         repo="",
-    #         error=str(e),
-    #         stdout="",
-    #     )
     vector_db_url.value = dataset_path
     return {

     root_dir = './' + pathName
     activeloop_username = "aiswaryas"
+    dataset_path = f"hub://{activeloop_username}/" + pathName
     invalid_dataset_path = True
     try:
+        try:
+            db = DeepLake(dataset_path=dataset_path,
+                    embedding_function=embeddings,
+                    token=os.environ['ACTIVELOOP_TOKEN'],
+                    read_only=True,
+                    num_workers=12,
+                    runtime = {"tensor_db": True}
+                )
+        except Exception as e:
+            print("Failed to read: " + str(e))
+            if "scheduled for deletion" in str(e):
+                dataset_path = f"hub://{activeloop_username}/" + pathName + str(random.randint(1,100))
+                invalid_dataset_path = True
+        if invalid_dataset_path or db is None or len(db.vectorstore.dataset) == 0:
+            print("Dataset doesn't exist, fetching data")
+            try:
+                docs = []
+                for dirpath, dirnames, filenames in os.walk(root_dir):
+                    for file in filenames:
+                        print(file)
+                        try:
+                            loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
+                            docs.extend(loader.load_and_split())
+                        except Exception as e:
+                            print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
+                            pass
+                activeloop_username = "aiswaryas"
+                text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+                texts = text_splitter.split_documents(docs)
+                db = DeepLake(dataset_path=dataset_path,
+                        embedding_function=embeddings,
+                        token=os.environ['ACTIVELOOP_TOKEN'],
+                        read_only=False,
+                        num_workers=12,
+                        runtime = {"tensor_db": True}
+                )
+                # Do this in chunks to avoid hitting the ratelimit immediately
+                for i in range(0, len(texts), 500):
+                    print("Adding documents " + str(i))
+                    db.add_documents(texts[i:i+500])
+                    time.sleep(.5)
+            except Exception as e:
+                return Response(
+                    result= "Failed to index github repo",
+                    repo="",
+                    error=str(e),
+                    stdout="",
+                )
     except Exception as e:
         return Response(
             stdout="",
         )
     vector_db_url.value = dataset_path
     return {