# Utilities to build a RAG system to query information from the
# gwIAS search pipeline using LangChain

# Thanks to Pablo Villanueva Domingo for sharing his CAMELS template
# https://huggingface.co/spaces/PabloVD/CAMELSDocBot

from langchain import hub
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain.schema import Document
import requests
import json
import base64
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
from typing import Iterator

def github_to_raw(url):
    """Convert GitHub URL to raw content URL"""
    return url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

def load_github_notebook(url):
    """Load Jupyter notebook from GitHub URL using GitHub API"""
    try:
        # Convert GitHub blob URL to API URL
        if "github.com" in url and "/blob/" in url:
            # Extract owner, repo, branch and path from URL
            parts = url.replace("https://github.com/", "").split("/")
            owner = parts[0]
            repo = parts[1]
            branch = parts[3]  # usually 'main' or 'master'
            path = "/".join(parts[4:])
            
            api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
        else:
            raise ValueError("URL must be a GitHub blob URL")
        
        # Fetch notebook content
        response = requests.get(api_url)
        response.raise_for_status()
        
        content_data = response.json()
        if content_data.get('encoding') == 'base64':
            notebook_content = base64.b64decode(content_data['content']).decode('utf-8')
        else:
            notebook_content = content_data['content']
        
        # Parse notebook JSON
        notebook = json.loads(notebook_content)
        
        docs = []
        cell_count = 0
        
        # Process each cell
        for cell in notebook.get('cells', []):
            cell_count += 1
            cell_type = cell.get('cell_type', 'unknown')
            source = cell.get('source', [])
            
            # Join source lines
            if isinstance(source, list):
                content = ''.join(source)
            else:
                content = str(source)
            
            if content.strip():  # Only add non-empty cells
                metadata = {
                    'source': url,
                    'cell_type': cell_type,
                    'cell_number': cell_count,
                    'name': f"{url} - Cell {cell_count} ({cell_type})"
                }
                
                # Add cell type prefix for better context
                formatted_content = f"[{cell_type.upper()} CELL {cell_count}]\n{content}"
                
                docs.append(Document(page_content=formatted_content, metadata=metadata))
        
        return docs
        
    except Exception as e:
        print(f"Error loading notebook from {url}: {str(e)}")
        return []

def clean_text(text):
    """Clean text content from a webpage"""
    # Remove excessive newlines
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Remove excessive whitespace
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

def clean_github_content(html_content):
    """Extract meaningful content from GitHub pages"""
    # Ensure we're working with a BeautifulSoup object
    if isinstance(html_content, str):
        soup = BeautifulSoup(html_content, 'html.parser')
    else:
        soup = html_content
    
    # Remove navigation, footer, and other boilerplate
    for element in soup.find_all(['nav', 'footer', 'header']):
        element.decompose()
        
    # For README and code files
    readme_content = soup.find('article', class_='markdown-body')
    if readme_content:
        return clean_text(readme_content.get_text())
    
    # For code files
    code_content = soup.find('table', class_='highlight')
    if code_content:
        return clean_text(code_content.get_text())
        
    # For directory listings
    file_list = soup.find('div', role='grid')
    if file_list:
        return clean_text(file_list.get_text())
        
    # Fallback to main content
    main_content = soup.find('main')
    if main_content:
        return clean_text(main_content.get_text())
    
    # If no specific content found, get text from body
    body = soup.find('body')
    if body:
        return clean_text(body.get_text())
        
    # Final fallback
    return clean_text(soup.get_text())

class GitHubLoader(WebBaseLoader):
    """Custom loader for GitHub pages with better content cleaning"""
    
    def clean_text(self, text):
        """Clean text content"""
        # Remove excessive newlines and spaces
        text = re.sub(r'\n{2,}', '\n', text)
        text = re.sub(r'\s{2,}', ' ', text)
        # Remove common GitHub boilerplate
        text = re.sub(r'Skip to content|Sign in|Search or jump to|Footer navigation|Terms|Privacy|Security|Status|Docs', '', text)
        return text.strip()

    def lazy_load(self) -> Iterator[Document]:
        """Override lazy_load instead of _scrape to handle both BeautifulSoup and string returns."""
        for url in self.web_paths:
            try:
                response = requests.get(url)
                response.raise_for_status()
                
                # For directory listings (tree URLs), use the API
                if '/tree/' in url:
                    # Parse URL components
                    parts = url.replace("https://github.com/", "").split("/")
                    owner = parts[0]
                    repo = parts[1]
                    branch = parts[3]  # usually 'main' or 'master'
                    path = "/".join(parts[4:]) if len(parts) > 4 else ""
                    
                    # Construct API URL
                    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
                    api_response = requests.get(api_url)
                    api_response.raise_for_status()
                    
                    # Parse directory listing
                    contents = api_response.json()
                    if isinstance(contents, list):
                        # Format directory contents
                        content = "Directory contents:\n" + "\n".join([f"{item['name']} ({item['type']})" for item in contents])
                        yield Document(
                            page_content=self.clean_text(content),
                            metadata={'source': url, 'type': 'github_directory'}
                        )
                        continue
                
                # For regular files, parse HTML
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # For README and markdown files
                readme_content = soup.find('article', class_='markdown-body')
                if readme_content:
                    yield Document(
                        page_content=self.clean_text(readme_content.get_text()),
                        metadata={'source': url, 'type': 'github_markdown'}
                    )
                    continue
                
                # For code files
                code_content = soup.find('table', class_='highlight')
                if code_content:
                    yield Document(
                        page_content=self.clean_text(code_content.get_text()),
                        metadata={'source': url, 'type': 'github_code'}
                    )
                    continue
                    
                # For other content, get main content
                main_content = soup.find('main')
                if main_content:
                    yield Document(
                        page_content=self.clean_text(main_content.get_text()),
                        metadata={'source': url, 'type': 'github_other'}
                    )
                    continue
                
                # Fallback to whole page content
                yield Document(
                    page_content=self.clean_text(soup.get_text()),
                    metadata={'source': url, 'type': 'github_fallback'}
                )
                    
            except Exception as e:
                print(f"Error processing {url}: {str(e)}")
                continue

    def load(self) -> list[Document]:
        """Load method that returns a list of documents."""
        return list(self.lazy_load())

class ReadTheDocsLoader(WebBaseLoader):
    """Custom loader for ReadTheDocs pages"""
    
    def __init__(self, base_url: str):
        """Initialize with base URL of the documentation."""
        super().__init__([])
        self.base_url = base_url.rstrip('/')
        
    def clean_text(self, text: str) -> str:
        """Clean text content from ReadTheDocs pages."""
        # Remove excessive whitespace and newlines
        text = re.sub(r'\s{2,}', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        # Remove common ReadTheDocs boilerplate
        text = re.sub(r'View page source|Next|Previous|©.*?\.', '', text)
        return text.strip()
        
    def normalize_url(self, base_url: str, href: str) -> str:
        """Normalize relative URLs to absolute URLs."""
        # If it's already an absolute URL, return it
        if href.startswith(('http://', 'https://')):
            return href
            
        # Handle relative URLs
        return urljoin(base_url, href)
        
    def get_all_pages(self) -> list[str]:
        """Get all documentation pages starting from the base URL."""
        visited = set()
        to_visit = {self.base_url}
        docs_urls = set()
        
        while to_visit:
            url = to_visit.pop()
            if url in visited:
                continue
                
            visited.add(url)
            try:
                response = requests.get(url)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # Add current page if it's a documentation page
                if url.startswith(self.base_url):
                    docs_urls.add(url)
                
                # Find all links
                for link in soup.find_all('a'):
                    href = link.get('href')
                    if not href:
                        continue
                        
                    # Skip anchor links and external links
                    if href.startswith('#') or (href.startswith(('http://', 'https://')) and not href.startswith(self.base_url)):
                        continue
                        
                    # Normalize the URL
                    full_url = self.normalize_url(url, href)
                    
                    # Only follow links within the documentation domain
                    if full_url.startswith(self.base_url):
                        to_visit.add(full_url)
                        
            except Exception as e:
                print(f"Error fetching {url}: {str(e)}")
                
        return list(docs_urls)
        
    def load(self) -> list[Document]:
        """Load all documentation pages."""
        urls = self.get_all_pages()
        docs = []
        
        for url in urls:
            try:
                response = requests.get(url)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # Get main content
                main_content = soup.find('div', {'role': 'main'})
                if not main_content:
                    main_content = soup.find('main')
                if not main_content:
                    continue
                    
                # Clean content
                content = self.clean_text(main_content.get_text())
                if content:
                    docs.append(Document(
                        page_content=content,
                        metadata={'source': url, 'type': 'readthedocs'}
                    ))
                    
            except Exception as e:
                print(f"Error processing {url}: {str(e)}")
                
        return docs

def load_docs():
    """Load all documentation."""
    # Get urls
    with open("urls.txt", "r") as f:
        urls = [line.strip() for line in f.readlines()]
    
    docs = []
    
    # Load GitHub content
    for url in urls:
        if "github.com" in url or "raw.githubusercontent.com" in url:
            if "/blob/" in url and url.endswith(".ipynb"):
                # Handle Jupyter notebooks
                notebook_docs = load_github_notebook(url)
                docs.extend(notebook_docs)
            elif "raw.githubusercontent.com" in url:
                # Handle raw GitHub content directly
                try:
                    response = requests.get(url)
                    response.raise_for_status()
                    content = response.text
                    docs.append(Document(
                        page_content=content,
                        metadata={'source': url, 'type': 'github_raw'}
                    ))
                except Exception as e:
                    print(f"Error loading raw content from {url}: {str(e)}")
            else:
                # Handle other GitHub content
                loader = GitHubLoader([url])
                docs.extend(loader.load())
                
    # Load ReadTheDocs content
    rtd_loader = ReadTheDocsLoader("https://gwfast.readthedocs.io/en/latest")
    docs.extend(rtd_loader.load())
    
    return docs

def extract_reference(url):
    """Extract a reference keyword from the GitHub URL"""
    if "blob/main" in url:
        return url.split("blob/main/")[-1]
    elif "tree/main" in url:
        return url.split("tree/main/")[-1] or "root"
    elif "blob/master" in url:
        return url.split("blob/master/")[-1]
    elif "tree/master" in url:
        return url.split("tree/master/")[-1] or "root"
    elif "refs/heads/master" in url:
        return url.split("refs/heads/master/")[-1]
    return url

# Join content pages for processing
def format_docs(docs):
    formatted_docs = []
    for doc in docs:
        source = doc.metadata.get('source', 'Unknown source')
        reference = f"[{extract_reference(source)}]"
        content = doc.page_content
        formatted_docs.append(f"{content}\n\nReference: {reference}")
    return "\n\n---\n\n".join(formatted_docs)

# Create a RAG chain
def RAG(llm, docs, embeddings):

    # Split text
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Create vector store
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

    # Retrieve and generate using the relevant snippets of the documents
    retriever = vectorstore.as_retriever()

    # Prompt basis example for RAG systems
    prompt = hub.pull("rlm/rag-prompt")
    # Adding custom instructions to the prompt
    template = prompt.messages[0].prompt.template
    template_parts = template.split("\nQuestion: {question}")
    combined_template = "You are an assistant for question-answering tasks. "\
        + "Use the following pieces of retrieved context to answer the question. "\
        + "If you don't know the answer, just say that you don't know. "\
        + "Try to keep the answer concise if possible. "\
        + "Write the names of the relevant functions from the retrived code and include code snippets to aid the user's understanding. "\
        + "Include the references used in square brackets at the end of your answer."\
        + template_parts[1]
    prompt.messages[0].prompt.template = combined_template

    # Create the chain
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    return rag_chain
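

# Example usage (illustrative sketch, not part of the original module): wires the
# loaders and the RAG chain together, assuming OpenAI-backed chat and embedding
# models via langchain_openai. Swap in whatever LLM and embeddings the deployment
# actually uses; the model name below is a placeholder.
if __name__ == "__main__":
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # placeholder model name
    embeddings = OpenAIEmbeddings()

    docs = load_docs()                      # crawl urls.txt entries + ReadTheDocs pages
    rag_chain = RAG(llm, docs, embeddings)  # split, embed, retrieve, prompt, generate

    print(rag_chain.invoke("How do I run the gwIAS search pipeline?"))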