jaywadekar committed
Commit 6c93f12 · 1 Parent(s): 215eca5

added support for docs

Files changed (2):
  1. rag.py +198 -123
  2. urls.txt +7 -7
rag.py CHANGED
@@ -16,6 +16,7 @@ import json
 import base64
 from bs4 import BeautifulSoup
 import re
+from urllib.parse import urljoin, urlparse
 
 def github_to_raw(url):
     """Convert GitHub URL to raw content URL"""
@@ -144,144 +145,218 @@ class GitHubLoader(WebBaseLoader):
         text = re.sub(r'Skip to content|Sign in|Search or jump to|Footer navigation|Terms|Privacy|Security|Status|Docs', '', text)
         return text.strip()
 
-    def _scrape(self, url: str, *args, **kwargs) -> str:
-        """Scrape data from URL and clean it.
-
-        Args:
-            url: The URL to scrape
-            *args: Additional positional arguments
-            **kwargs: Additional keyword arguments including bs_kwargs
-
-        Returns:
-            str: The cleaned content
-        """
-        response = requests.get(url)
-        response.raise_for_status()
-
-        # For directory listings (tree URLs), use the API
-        if '/tree/' in url:
-            # Parse URL components
-            parts = url.replace("https://github.com/", "").split("/")
-            owner = parts[0]
-            repo = parts[1]
-            branch = parts[3]  # usually 'main' or 'master'
-            path = "/".join(parts[4:]) if len(parts) > 4 else ""
-
-            # Construct API URL
-            api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
-            api_response = requests.get(api_url)
-            api_response.raise_for_status()
-
-            # Parse directory listing
-            contents = api_response.json()
-            if isinstance(contents, list):
-                # Format directory contents
-                files = [f"{item['name']} ({item['type']})" for item in contents]
-                return "Directory contents:\n" + "\n".join(files)
-            else:
-                return f"Error: Unexpected API response for {url}"
-
-        # For regular files, parse HTML
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # For README and markdown files
-        readme_content = soup.find('article', class_='markdown-body')
-        if readme_content:
-            return self.clean_text(readme_content.get_text())
-
-        # For code files
-        code_content = soup.find('table', class_='highlight')
-        if code_content:
-            return self.clean_text(code_content.get_text())
-
-        # For other content, get main content
-        main_content = soup.find('main')
-        if main_content:
-            return self.clean_text(main_content.get_text())
-
-        # Final fallback
-        return self.clean_text(soup.get_text())
+    def lazy_load(self) -> list[Document]:
+        """Override lazy_load instead of _scrape to handle both BeautifulSoup and string returns."""
+        for url in self.web_paths:
+            try:
+                response = requests.get(url)
+                response.raise_for_status()
+
+                # For directory listings (tree URLs), use the API
+                if '/tree/' in url:
+                    # Parse URL components
+                    parts = url.replace("https://github.com/", "").split("/")
+                    owner = parts[0]
+                    repo = parts[1]
+                    branch = parts[3]  # usually 'main' or 'master'
+                    path = "/".join(parts[4:]) if len(parts) > 4 else ""
+
+                    # Construct API URL
+                    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
+                    api_response = requests.get(api_url)
+                    api_response.raise_for_status()
+
+                    # Parse directory listing
+                    contents = api_response.json()
+                    if isinstance(contents, list):
+                        # Format directory contents
+                        content = "Directory contents:\n" + "\n".join([f"{item['name']} ({item['type']})" for item in contents])
+                        yield Document(
+                            page_content=self.clean_text(content),
+                            metadata={'source': url, 'type': 'github_directory'}
+                        )
+                    continue
+
+                # For regular files, parse HTML
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # For README and markdown files
+                readme_content = soup.find('article', class_='markdown-body')
+                if readme_content:
+                    yield Document(
+                        page_content=self.clean_text(readme_content.get_text()),
+                        metadata={'source': url, 'type': 'github_markdown'}
+                    )
+                    continue
+
+                # For code files
+                code_content = soup.find('table', class_='highlight')
+                if code_content:
+                    yield Document(
+                        page_content=self.clean_text(code_content.get_text()),
+                        metadata={'source': url, 'type': 'github_code'}
+                    )
+                    continue
+
+                # For other content, get main content
+                main_content = soup.find('main')
+                if main_content:
+                    yield Document(
+                        page_content=self.clean_text(main_content.get_text()),
+                        metadata={'source': url, 'type': 'github_other'}
+                    )
+                    continue
+
+                # Fallback to whole page content
+                yield Document(
+                    page_content=self.clean_text(soup.get_text()),
+                    metadata={'source': url, 'type': 'github_fallback'}
+                )
+
+            except Exception as e:
+                print(f"Error processing {url}: {str(e)}")
+                continue
+
+    def load(self) -> list[Document]:
+        """Load method that returns a list of documents."""
+        return list(self.lazy_load())
+
+class ReadTheDocsLoader(WebBaseLoader):
+    """Custom loader for ReadTheDocs pages"""
+
+    def __init__(self, base_url: str):
+        """Initialize with base URL of the documentation."""
+        super().__init__([])
+        self.base_url = base_url.rstrip('/')
+
+    def clean_text(self, text: str) -> str:
+        """Clean text content from ReadTheDocs pages."""
+        # Remove excessive whitespace and newlines
+        text = re.sub(r'\s{2,}', ' ', text)
+        text = re.sub(r'\n{3,}', '\n\n', text)
+        # Remove common ReadTheDocs boilerplate
+        text = re.sub(r'View page source|Next|Previous|©.*?\.', '', text)
+        return text.strip()
+
+    def normalize_url(self, base_url: str, href: str) -> str:
+        """Normalize relative URLs to absolute URLs."""
+        # If it's already an absolute URL, return it
+        if href.startswith(('http://', 'https://')):
+            return href
+
+        # Handle relative URLs
+        return urljoin(base_url, href)
+
+    def get_all_pages(self) -> list[str]:
+        """Get all documentation pages starting from the base URL."""
+        visited = set()
+        to_visit = {self.base_url}
+        docs_urls = set()
+
+        while to_visit:
+            url = to_visit.pop()
+            if url in visited:
+                continue
+
+            visited.add(url)
+            try:
+                response = requests.get(url)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Add current page if it's a documentation page
+                if url.startswith(self.base_url):
+                    docs_urls.add(url)
+
+                # Find all links
+                for link in soup.find_all('a'):
+                    href = link.get('href')
+                    if not href:
+                        continue
+
+                    # Skip anchor links and external links
+                    if href.startswith('#') or href.startswith(('http://', 'https://')) and not href.startswith(self.base_url):
+                        continue
+
+                    # Normalize the URL
+                    full_url = self.normalize_url(url, href)
+
+                    # Only follow links within the documentation domain
+                    if full_url.startswith(self.base_url):
+                        to_visit.add(full_url)
+
+            except Exception as e:
+                print(f"Error fetching {url}: {str(e)}")
+
+        return list(docs_urls)
+
+    def load(self) -> list[Document]:
+        """Load all documentation pages."""
+        urls = self.get_all_pages()
+        docs = []
+
+        for url in urls:
+            try:
+                response = requests.get(url)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.text, 'html.parser')
+
+                # Get main content
+                main_content = soup.find('div', {'role': 'main'})
+                if not main_content:
+                    main_content = soup.find('main')
+                if not main_content:
+                    continue
+
+                # Clean content
+                content = self.clean_text(main_content.get_text())
+                if content:
+                    docs.append(Document(
+                        page_content=content,
+                        metadata={'source': url, 'type': 'readthedocs'}
+                    ))
+
+            except Exception as e:
+                print(f"Error processing {url}: {str(e)}")
+
+        return docs
 
-# Load documentation from urls
 def load_docs():
+    """Load all documentation."""
     # Get urls
-    urlsfile = open("urls.txt")
-    urls = urlsfile.readlines()
-    urls = [url.replace("\n","") for url in urls]
-    urlsfile.close()
-
-    # Load documents from URLs
+    with open("urls.txt", "r") as f:
+        urls = [line.strip() for line in f.readlines()]
+
     docs = []
 
+    # Load GitHub content
     for url in urls:
-        url = url.strip()
-        if not url:
-            continue
-
-        # Check if URL is a Jupyter notebook
-        if url.endswith('.ipynb') and 'github.com' in url and '/blob/' in url:
-            print(f"Loading notebook: {url}")
-            notebook_docs = load_github_notebook(url)
-            docs.extend(notebook_docs)
-        # Handle Python and Markdown files using raw content
-        elif url.endswith(('.py', '.md')) and 'github.com' in url and '/blob/' in url:
-            print(f"Loading raw content: {url}")
-            try:
-                raw_url = github_to_raw(url)
-                loader = WebBaseLoader([raw_url])
-                web_docs = loader.load()
-                # Preserve original URL in metadata
-                for doc in web_docs:
-                    doc.metadata['source'] = url
-                docs.extend(web_docs)
-            except Exception as e:
-                print(f"Error loading {url}: {str(e)}")
-        # Handle directory listings
-        elif '/tree/' in url and 'github.com' in url:
-            print(f"Loading directory: {url}")
-            try:
-                # Parse URL components
-                parts = url.replace("https://github.com/", "").split("/")
-                owner = parts[0]
-                repo = parts[1]
-                branch = parts[3]  # usually 'main' or 'master'
-                path = "/".join(parts[4:]) if len(parts) > 4 else ""
-
-                # Construct API URL
-                api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
-                response = requests.get(api_url)
-                response.raise_for_status()
-
-                # Parse directory listing
-                contents = response.json()
-                if isinstance(contents, list):
-                    # Format directory contents
-                    content = "Directory contents:\n" + "\n".join([f"{item['name']} ({item['type']})" for item in contents])
-                    docs.append(Document(page_content=content, metadata={'source': url}))
-                else:
-                    print(f"Error: Unexpected API response for {url}")
-            except Exception as e:
-                print(f"Error loading directory {url}: {str(e)}")
-        else:
-            print(f"Loading web page: {url}")
-            try:
-                loader = GitHubLoader([url])  # Use custom loader
-                web_docs = loader.load()
-                docs.extend(web_docs)
-            except Exception as e:
-                print(f"Error loading {url}: {str(e)}")
-
-    # Add source URLs as document names for reference
-    for i, doc in enumerate(docs):
-        if 'source' in doc.metadata:
-            doc.metadata['name'] = doc.metadata['source']
-        else:
-            doc.metadata['name'] = f"Document {i+1}"
-
-    print(f"Loaded {len(docs)} documents:")
-    for doc in docs:
-        print(f" - {doc.metadata.get('name')}")
-
+        if "github.com" in url or "raw.githubusercontent.com" in url:
+            if "/blob/" in url and url.endswith(".ipynb"):
+                # Handle Jupyter notebooks
+                notebook_docs = load_github_notebook(url)
+                docs.extend(notebook_docs)
+            elif "raw.githubusercontent.com" in url:
+                # Handle raw GitHub content directly
+                try:
+                    response = requests.get(url)
+                    response.raise_for_status()
+                    content = response.text
+                    docs.append(Document(
+                        page_content=content,
+                        metadata={'source': url, 'type': 'github_raw'}
+                    ))
+                except Exception as e:
+                    print(f"Error loading raw content from {url}: {str(e)}")
+            else:
+                # Handle other GitHub content
+                loader = GitHubLoader([url])
+                docs.extend(loader.load())
+
+    # Load ReadTheDocs content
+    rtd_loader = ReadTheDocsLoader("https://gwfast.readthedocs.io/en/latest")
+    docs.extend(rtd_loader.load())
 
     return docs
 
 def extract_reference(url):
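A minimal usage sketch of the reworked loaders, assuming GitHubLoader and ReadTheDocsLoader can be imported from rag.py as shown above; the example tree URL is illustrative and not part of this commit:

    # Sketch only: exercises GitHubLoader.lazy_load()/load() and the new ReadTheDocsLoader crawler.
    from rag import GitHubLoader, ReadTheDocsLoader

    # GitHubLoader yields Documents tagged by page kind ('github_directory', 'github_markdown', ...)
    gh = GitHubLoader(["https://github.com/CosmoStatGW/gwfast/tree/master/gwfast"])
    for doc in gh.lazy_load():
        print(doc.metadata['source'], doc.metadata['type'])

    # ReadTheDocsLoader crawls every page under the base URL, then extracts each page's main content
    rtd = ReadTheDocsLoader("https://gwfast.readthedocs.io/en/latest")
    rtd_docs = rtd.load()
    print(f"Fetched {len(rtd_docs)} ReadTheDocs pages")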
urls.txt CHANGED
@@ -1,10 +1,10 @@
 https://github.com/CosmoStatGW/gwfast/blob/master/notebooks/gwfast_tutorial.ipynb
 https://github.com/CosmoStatGW/gwfast/blob/master/notebooks/new_features_tutorial.ipynb
 https://github.com/CosmoStatGW/gwfast/blob/master/notebooks/Stochastic_tutorial.ipynb
-https://github.com/CosmoStatGW/gwfast/blob/master/gwfast/fisherTools.py
-https://github.com/CosmoStatGW/gwfast/blob/master/gwfast/gwfastGlobals.py
-https://github.com/CosmoStatGW/gwfast/blob/master/gwfast/gwfastUtils.py
-https://github.com/CosmoStatGW/gwfast/blob/master/gwfast/network.py
-https://github.com/CosmoStatGW/gwfast/blob/master/gwfast/signal.py
-https://github.com/CosmoStatGW/gwfast/blob/master/gwfast/waveforms.py
-https://github.com/CosmoStatGW/gwfast/blob/master/gwfast/stochastic/stochasticTools.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/fisherTools.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/gwfastGlobals.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/gwfastUtils.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/network.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/signal.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/waveforms.py
+https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/stochastic/stochasticTools.py
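The source list now points at raw.githubusercontent.com instead of github.com blob pages, so load_docs() ingests the Python files as plain text (the 'github_raw' branch above) rather than scraping rendered HTML. The old and new entries follow the usual blob-to-raw mapping; a small illustrative sketch (the helper name blob_to_raw is hypothetical, and the body of the repo's github_to_raw() is not shown in this diff):

    # Hypothetical helper, for illustration only: maps a github.com blob URL
    # to the raw.githubusercontent.com form now used in urls.txt.
    def blob_to_raw(url: str) -> str:
        return (url
                .replace("https://github.com/", "https://raw.githubusercontent.com/")
                .replace("/blob/", "/refs/heads/"))

    print(blob_to_raw("https://github.com/CosmoStatGW/gwfast/blob/master/gwfast/signal.py"))
    # -> https://raw.githubusercontent.com/CosmoStatGW/gwfast/refs/heads/master/gwfast/signal.py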