# Utilities to build a RAG system to query information from the
# gwIAS search pipeline using Langchain
# Thanks to Pablo Villanueva Domingo for sharing his CAMELS template
# https://huggingface.co/spaces/PabloVD/CAMELSDocBot
from langchain import hub
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain.schema import Document
import requests
import json
import base64
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
from collections.abc import Iterator
def github_to_raw(url):
    """Convert GitHub URL to raw content URL"""
    return url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")
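
# Usage sketch (the URL below is illustrative, not one of the pipeline's actual sources):
#   >>> github_to_raw("https://github.com/owner/repo/blob/main/README.md")
#   'https://raw.githubusercontent.com/owner/repo/main/README.md'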
def load_github_notebook(url):
    """Load Jupyter notebook from GitHub URL using GitHub API"""
    try:
        # Convert GitHub blob URL to API URL
        if "github.com" in url and "/blob/" in url:
            # Extract owner, repo, branch and path from URL
            parts = url.replace("https://github.com/", "").split("/")
            owner = parts[0]
            repo = parts[1]
            branch = parts[3]  # usually 'main' or 'master'
            path = "/".join(parts[4:])
            api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
        else:
            raise ValueError("URL must be a GitHub blob URL")

        # Fetch notebook content
        response = requests.get(api_url)
        response.raise_for_status()

        content_data = response.json()
        if content_data.get('encoding') == 'base64':
            notebook_content = base64.b64decode(content_data['content']).decode('utf-8')
        else:
            notebook_content = content_data['content']

        # Parse notebook JSON
        notebook = json.loads(notebook_content)

        docs = []
        cell_count = 0

        # Process each cell
        for cell in notebook.get('cells', []):
            cell_count += 1
            cell_type = cell.get('cell_type', 'unknown')
            source = cell.get('source', [])

            # Join source lines
            if isinstance(source, list):
                content = ''.join(source)
            else:
                content = str(source)

            if content.strip():  # Only add non-empty cells
                metadata = {
                    'source': url,
                    'cell_type': cell_type,
                    'cell_number': cell_count,
                    'name': f"{url} - Cell {cell_count} ({cell_type})"
                }
                # Add cell type prefix for better context
                formatted_content = f"[{cell_type.upper()} CELL {cell_count}]\n{content}"
                docs.append(Document(page_content=formatted_content, metadata=metadata))

        return docs
    except Exception as e:
        print(f"Error loading notebook from {url}: {str(e)}")
        return []
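
# Usage sketch (illustrative notebook path, not an actual source of this Space):
#   notebook_docs = load_github_notebook("https://github.com/owner/repo/blob/main/example.ipynb")
# Each returned Document holds one notebook cell, prefixed with its type and index.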
def clean_text(text):
    """Clean text content from a webpage"""
    # Remove excessive newlines
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Remove excessive whitespace
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()
def clean_github_content(html_content):
    """Extract meaningful content from GitHub pages"""
    # Ensure we're working with a BeautifulSoup object
    if isinstance(html_content, str):
        soup = BeautifulSoup(html_content, 'html.parser')
    else:
        soup = html_content

    # Remove navigation, footer, and other boilerplate
    for element in soup.find_all(['nav', 'footer', 'header']):
        element.decompose()

    # For README and markdown files
    readme_content = soup.find('article', class_='markdown-body')
    if readme_content:
        return clean_text(readme_content.get_text())

    # For code files
    code_content = soup.find('table', class_='highlight')
    if code_content:
        return clean_text(code_content.get_text())

    # For directory listings
    file_list = soup.find('div', role='grid')
    if file_list:
        return clean_text(file_list.get_text())

    # Fallback to main content
    main_content = soup.find('main')
    if main_content:
        return clean_text(main_content.get_text())

    # If no specific content found, get text from body
    body = soup.find('body')
    if body:
        return clean_text(body.get_text())

    # Final fallback
    return clean_text(soup.get_text())
class GitHubLoader(WebBaseLoader):
    """Custom loader for GitHub pages with better content cleaning"""

    def clean_text(self, text):
        """Clean text content"""
        # Remove excessive newlines and spaces
        text = re.sub(r'\n{2,}', '\n', text)
        text = re.sub(r'\s{2,}', ' ', text)
        # Remove common GitHub boilerplate
        text = re.sub(r'Skip to content|Sign in|Search or jump to|Footer navigation|Terms|Privacy|Security|Status|Docs', '', text)
        return text.strip()

    def lazy_load(self) -> Iterator[Document]:
        """Override lazy_load instead of _scrape to handle both BeautifulSoup and string returns."""
        for url in self.web_paths:
            try:
                response = requests.get(url)
                response.raise_for_status()

                # For directory listings (tree URLs), use the API
                if '/tree/' in url:
                    # Parse URL components
                    parts = url.replace("https://github.com/", "").split("/")
                    owner = parts[0]
                    repo = parts[1]
                    branch = parts[3]  # usually 'main' or 'master'
                    path = "/".join(parts[4:]) if len(parts) > 4 else ""

                    # Construct API URL
                    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
                    api_response = requests.get(api_url)
                    api_response.raise_for_status()

                    # Parse directory listing
                    contents = api_response.json()
                    if isinstance(contents, list):
                        # Format directory contents
                        content = "Directory contents:\n" + "\n".join([f"{item['name']} ({item['type']})" for item in contents])
                        yield Document(
                            page_content=self.clean_text(content),
                            metadata={'source': url, 'type': 'github_directory'}
                        )
                    continue

                # For regular files, parse HTML
                soup = BeautifulSoup(response.text, 'html.parser')

                # For README and markdown files
                readme_content = soup.find('article', class_='markdown-body')
                if readme_content:
                    yield Document(
                        page_content=self.clean_text(readme_content.get_text()),
                        metadata={'source': url, 'type': 'github_markdown'}
                    )
                    continue

                # For code files
                code_content = soup.find('table', class_='highlight')
                if code_content:
                    yield Document(
                        page_content=self.clean_text(code_content.get_text()),
                        metadata={'source': url, 'type': 'github_code'}
                    )
                    continue

                # For other content, get main content
                main_content = soup.find('main')
                if main_content:
                    yield Document(
                        page_content=self.clean_text(main_content.get_text()),
                        metadata={'source': url, 'type': 'github_other'}
                    )
                    continue

                # Fallback to whole page content
                yield Document(
                    page_content=self.clean_text(soup.get_text()),
                    metadata={'source': url, 'type': 'github_fallback'}
                )
            except Exception as e:
                print(f"Error processing {url}: {str(e)}")
                continue

    def load(self) -> list[Document]:
        """Load method that returns a list of documents."""
        return list(self.lazy_load())
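
# Usage sketch (the repository URL is a placeholder, not a real source of this Space):
#   loader = GitHubLoader(["https://github.com/owner/repo/blob/main/README.md"])
#   github_docs = loader.load()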
class ReadTheDocsLoader(WebBaseLoader):
    """Custom loader for ReadTheDocs pages"""

    def __init__(self, base_url: str):
        """Initialize with base URL of the documentation."""
        super().__init__([])
        self.base_url = base_url.rstrip('/')

    def clean_text(self, text: str) -> str:
        """Clean text content from ReadTheDocs pages."""
        # Remove excessive whitespace and newlines
        text = re.sub(r'\s{2,}', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        # Remove common ReadTheDocs boilerplate
        text = re.sub(r'View page source|Next|Previous|©.*?\.', '', text)
        return text.strip()

    def normalize_url(self, base_url: str, href: str) -> str:
        """Normalize relative URLs to absolute URLs."""
        # If it's already an absolute URL, return it
        if href.startswith(('http://', 'https://')):
            return href
        # Handle relative URLs
        return urljoin(base_url, href)

    def get_all_pages(self) -> list[str]:
        """Get all documentation pages starting from the base URL."""
        visited = set()
        to_visit = {self.base_url}
        docs_urls = set()

        while to_visit:
            url = to_visit.pop()
            if url in visited:
                continue
            visited.add(url)

            try:
                response = requests.get(url)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Add current page if it's a documentation page
                if url.startswith(self.base_url):
                    docs_urls.add(url)

                # Find all links
                for link in soup.find_all('a'):
                    href = link.get('href')
                    if not href:
                        continue

                    # Skip anchor links and external links
                    if href.startswith('#') or (href.startswith(('http://', 'https://')) and not href.startswith(self.base_url)):
                        continue

                    # Normalize the URL
                    full_url = self.normalize_url(url, href)

                    # Only follow links within the documentation domain
                    if full_url.startswith(self.base_url):
                        to_visit.add(full_url)
            except Exception as e:
                print(f"Error fetching {url}: {str(e)}")

        return list(docs_urls)

    def load(self) -> list[Document]:
        """Load all documentation pages."""
        urls = self.get_all_pages()
        docs = []

        for url in urls:
            try:
                response = requests.get(url)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Get main content
                main_content = soup.find('div', {'role': 'main'})
                if not main_content:
                    main_content = soup.find('main')
                if not main_content:
                    continue

                # Clean content
                content = self.clean_text(main_content.get_text())
                if content:
                    docs.append(Document(
                        page_content=content,
                        metadata={'source': url, 'type': 'readthedocs'}
                    ))
            except Exception as e:
                print(f"Error processing {url}: {str(e)}")

        return docs
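
# Usage sketch: crawl a Read the Docs site from its base URL, following only
# internal links, and return one Document per page (same loader as used in
# load_docs below).
#   rtd_docs = ReadTheDocsLoader("https://gwfast.readthedocs.io/en/latest").load()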
def load_docs():
    """Load all documentation."""
    # Get urls
    with open("urls.txt", "r") as f:
        urls = [line.strip() for line in f.readlines()]

    docs = []

    # Load GitHub content
    for url in urls:
        if "github.com" in url or "raw.githubusercontent.com" in url:
            if "/blob/" in url and url.endswith(".ipynb"):
                # Handle Jupyter notebooks
                notebook_docs = load_github_notebook(url)
                docs.extend(notebook_docs)
            elif "raw.githubusercontent.com" in url:
                # Handle raw GitHub content directly
                try:
                    response = requests.get(url)
                    response.raise_for_status()
                    content = response.text
                    docs.append(Document(
                        page_content=content,
                        metadata={'source': url, 'type': 'github_raw'}
                    ))
                except Exception as e:
                    print(f"Error loading raw content from {url}: {str(e)}")
            else:
                # Handle other GitHub content
                loader = GitHubLoader([url])
                docs.extend(loader.load())

    # Load ReadTheDocs content
    rtd_loader = ReadTheDocsLoader("https://gwfast.readthedocs.io/en/latest")
    docs.extend(rtd_loader.load())

    return docs
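
# load_docs() expects a plain-text "urls.txt" next to this module with one URL
# per line. The entries below are placeholders showing the kinds of URL the
# loader handles (notebook blob, raw file, repo tree page), not the real list:
#
#   https://github.com/owner/repo/blob/main/tutorial.ipynb
#   https://raw.githubusercontent.com/owner/repo/refs/heads/master/utils.py
#   https://github.com/owner/repo/tree/main/pipeline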
def extract_reference(url):
    """Extract a reference keyword from the GitHub URL"""
    if "blob/main" in url:
        return url.split("blob/main/")[-1]
    elif "tree/main" in url:
        return url.split("tree/main/")[-1] or "root"
    elif "blob/master" in url:
        return url.split("blob/master/")[-1]
    elif "tree/master" in url:
        return url.split("tree/master/")[-1] or "root"
    elif "refs/heads/master" in url:
        return url.split("refs/heads/master/")[-1]
    return url
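
# Examples (illustrative URLs only):
#   >>> extract_reference("https://github.com/owner/repo/blob/main/src/search.py")
#   'src/search.py'
#   >>> extract_reference("https://github.com/owner/repo/tree/main/")
#   'root'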
# Join content pages for processing
def format_docs(docs):
    formatted_docs = []
    for doc in docs:
        source = doc.metadata.get('source', 'Unknown source')
        reference = f"[{extract_reference(source)}]"
        content = doc.page_content
        formatted_docs.append(f"{content}\n\nReference: {reference}")
    return "\n\n---\n\n".join(formatted_docs)
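
# The context string passed to the prompt therefore looks like (sketch):
#
#   <chunk text>
#
#   Reference: [src/search.py]
#
#   ---
#
#   <next chunk text>
#   ...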
# Create a RAG chain
def RAG(llm, docs, embeddings):

    # Split text
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Create vector store
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

    # Retrieve and generate using the relevant snippets of the documents
    retriever = vectorstore.as_retriever()

    # Prompt basis example for RAG systems
    prompt = hub.pull("rlm/rag-prompt")

    # Adding custom instructions to the prompt
    template = prompt.messages[0].prompt.template
    template_parts = template.split("\nQuestion: {question}")
    combined_template = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer the question. "
        "If you don't know the answer, just say that you don't know. "
        "Try to keep the answer concise if possible. "
        "Write the names of the relevant functions from the retrieved code and include code snippets to aid the user's understanding. "
        "Include the references used in square brackets at the end of your answer."
        # The split consumes the separator, so put the question placeholder back
        # before the remaining context/answer section of the original template
        + "\nQuestion: {question}" + template_parts[1]
    )
    prompt.messages[0].prompt.template = combined_template

    # Create the chain
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    return rag_chain
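
# Usage sketch. The concrete model and embedding classes are not fixed by this
# module; any LangChain-compatible chat model and embeddings object will do
# (llm and embeddings below are supplied by the calling app, the question is a
# placeholder):
#
#   docs = load_docs()
#   chain = RAG(llm, docs, embeddings)
#   answer = chain.invoke("How does the gwIAS pipeline rank candidate triggers?")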