Aiswarya Sankar committed on
Commit 785e02e · 1 Parent(s): 383468c

Update app

Files changed (1)
  1. app.py +53 -53
app.py CHANGED
@@ -113,54 +113,62 @@ def index_repo(textbox: str, dropdown: str) -> Response:
     root_dir = './' + pathName
 
     activeloop_username = "aiswaryas"
-    dataset_path = f"hub://{activeloop_username}/" + "dummy"
+    dataset_path = f"hub://{activeloop_username}/" + pathName
     invalid_dataset_path = True
 
-    # try:
-    #     try:
-    #         db = DeepLake(dataset_path=dataset_path,
-    #             embedding_function=embeddings,
-    #             token=os.environ['ACTIVELOOP_TOKEN'],
-    #             read_only=True,
-    #             num_workers=12,
-    #             runtime = {"tensor_db": True}
-    #         )
-    #     except Exception as e:
-    #         print("Failed to read: " + str(e))
-    #         if "scheduled for deletion" in str(e):
-    #             dataset_path = f"hub://{activeloop_username}/" + pathName + str(random.randint(1,100))
-    #             invalid_dataset_path = True
-
-    # if invalid_dataset_path or db is None or len(db.vectorstore.dataset) == 0:
-    #     print("Dataset doesn't exist, fetching data")
     try:
-        docs = []
-        for dirpath, dirnames, filenames in os.walk(root_dir):
-            for file in filenames:
-                print(file)
-                try:
-                    loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
-                    docs.extend(loader.load_and_split())
-                except Exception as e:
-                    print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
-                    pass
-
-        activeloop_username = "aiswaryas"
-        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-        texts = text_splitter.split_documents(docs)
-
-        db = DeepLake(dataset_path=dataset_path,
-            embedding_function=embeddings,
-            token=os.environ['ACTIVELOOP_TOKEN'],
-            read_only=False,
-            num_workers=12,
-            runtime = {"tensor_db": True}
-        )
-        # Do this in chunks to avoid hitting the ratelimit immediately
-        for i in range(0, len(texts), 500):
-            print("Adding documents " + str(i))
-            db.add_documents(texts[i:i+500])
-            time.sleep(.5)
+        try:
+            db = DeepLake(dataset_path=dataset_path,
+                embedding_function=embeddings,
+                token=os.environ['ACTIVELOOP_TOKEN'],
+                read_only=True,
+                num_workers=12,
+                runtime = {"tensor_db": True}
+            )
+        except Exception as e:
+            print("Failed to read: " + str(e))
+            if "scheduled for deletion" in str(e):
+                dataset_path = f"hub://{activeloop_username}/" + pathName + str(random.randint(1,100))
+                invalid_dataset_path = True
+
+        if invalid_dataset_path or db is None or len(db.vectorstore.dataset) == 0:
+            print("Dataset doesn't exist, fetching data")
+            try:
+                docs = []
+                for dirpath, dirnames, filenames in os.walk(root_dir):
+                    for file in filenames:
+                        print(file)
+                        try:
+                            loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
+                            docs.extend(loader.load_and_split())
+                        except Exception as e:
+                            print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
+                            pass
+
+                activeloop_username = "aiswaryas"
+                text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+                texts = text_splitter.split_documents(docs)
+
+                db = DeepLake(dataset_path=dataset_path,
+                    embedding_function=embeddings,
+                    token=os.environ['ACTIVELOOP_TOKEN'],
+                    read_only=False,
+                    num_workers=12,
+                    runtime = {"tensor_db": True}
+                )
+                # Do this in chunks to avoid hitting the ratelimit immediately
+                for i in range(0, len(texts), 500):
+                    print("Adding documents " + str(i))
+                    db.add_documents(texts[i:i+500])
+                    time.sleep(.5)
+
+            except Exception as e:
+                return Response(
+                    result= "Failed to index github repo",
+                    repo="",
+                    error=str(e),
+                    stdout="",
+                )
 
     except Exception as e:
         return Response(
@@ -170,14 +178,6 @@ def index_repo(textbox: str, dropdown: str) -> Response:
             stdout="",
         )
 
-    # except Exception as e:
-    #     return Response(
-    #         result= "Failed to index github repo",
-    #         repo="",
-    #         error=str(e),
-    #         stdout="",
-    #     )
-
     vector_db_url.value = dataset_path
 
     return {
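For context, the code enabled by this commit writes each indexed repository to hub://aiswaryas/<pathName> and only re-indexes when the read-only open fails or the dataset comes back empty. Below is a minimal sketch (not part of the commit) of how such a dataset could be queried afterwards; it assumes the classic LangChain DeepLake wrapper and an OpenAIEmbeddings instance, since the diff does not show how app.py builds its embeddings object, and the dataset path is a placeholder.

    # Minimal sketch, not part of the commit: query a dataset produced by index_repo.
    # Assumptions: classic LangChain import paths, OpenAIEmbeddings (app.py's `embeddings`
    # object is not shown in this diff), and a placeholder dataset path.
    import os

    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import DeepLake

    embeddings = OpenAIEmbeddings()  # assumption; app.py may configure embeddings differently

    # Open the same dataset that index_repo populated, but read-only.
    db = DeepLake(
        dataset_path="hub://aiswaryas/<pathName>",  # placeholder for the repo-derived path
        embedding_function=embeddings,
        token=os.environ['ACTIVELOOP_TOKEN'],
        read_only=True,
    )

    # Fetch the chunks most similar to a question about the indexed repo.
    for doc in db.similarity_search("Where is the request routing implemented?", k=4):
        print(doc.metadata.get("source"), doc.page_content[:80])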