"""NLP-based retrieval-augmented generation helpers built on spaCy embeddings."""
import spacy
from itertools import groupby
from operator import itemgetter
from langsmith import traceable
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
def get_nlp_model():
    """
    Load the spaCy "en_core_web_md" pipeline, downloading it on first use.

    Returns:
        The loaded spaCy Language object.
    """
    # Fast path: model already installed, just load it.
    if spacy.util.is_package("en_core_web_md"):
        return spacy.load("en_core_web_md")
    # First run on this machine: fetch the model package, then load it.
    print("Downloading en_core_web_md model...")
    spacy.cli.download("en_core_web_md")
    print("Model downloaded successfully!")
    return spacy.load("en_core_web_md")
def recursive_split_documents(contents, max_chunk_size=1000, overlap=100):
    """
    Split documents into smaller chunks using a recursive character text splitter.

    Args:
        contents (list): List of content dicts with 'page_content', 'title', and 'link'.
        max_chunk_size (int): Maximum size of each chunk in characters.
        overlap (int): Number of overlapping characters between consecutive chunks.

    Returns:
        list: List of dicts with 'text' and 'metadata' ({'title', 'source'}) keys.
    """
    from langchain_core.documents.base import Document
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    documents = []
    for content in contents:
        try:
            page_content = content['page_content']
            if page_content:
                metadata = {'title': content['title'], 'source': content['link']}
                documents.append(Document(page_content=page_content, metadata=metadata))
        except Exception as e:
            # Use .get() here: the original indexed content['link'] inside the
            # handler, so a dict missing 'link' raised a *second* KeyError
            # instead of logging the first one.
            print(f"Error processing content for {content.get('link', '<unknown>')}: {e}")

    # Initialize recursive text splitter and split the collected documents.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=max_chunk_size, chunk_overlap=overlap)
    split_documents = text_splitter.split_documents(documents)

    # Normalize the langchain Documents back into plain dicts.
    return [
        {
            'text': doc.page_content,
            'metadata': {
                'title': doc.metadata.get('title', ''),
                'source': doc.metadata.get('source', ''),
            },
        }
        for doc in split_documents
    ]
def semantic_search(query, chunks, nlp, similarity_threshold=0.5, top_n=10):
    """
    Perform semantic search to find relevant chunks based on similarity to the query.

    Args:
        query (str): The search query.
        chunks (list): List of chunk dicts with 'text' (and optionally a
            precomputed 'vector') keys.
        nlp: The spaCy NLP model (used for the query and for missing chunk vectors).
        similarity_threshold (float): Minimum cosine similarity to keep a chunk.
        top_n (int): Maximum number of relevant chunks to return.

    Returns:
        list: (chunk, similarity) tuples sorted by descending similarity.
    """
    # Empty corpus: nothing can match, and this also guards the chunks[0]
    # probe below from raising IndexError.
    if not chunks:
        return []

    # Precompute query vector and its norm once.
    query_vector = nlp(query).vector
    query_norm = np.linalg.norm(query_vector) + 1e-8  # epsilon avoids division by zero

    # Compute chunk vectors lazily if the caller did not precompute them.
    if 'vector' not in chunks[0]:
        texts = [chunk['text'] for chunk in chunks]
        batch_size = 1000  # adjust based on available memory
        # Only 'tok2vec' is needed for vectors; disabling the rest speeds up nlp.pipe().
        with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != 'tok2vec']):
            docs = nlp.pipe(texts, batch_size=batch_size)
            # Consume the lazy pipe inside the context so the disabled pipes stay off.
            for chunk, doc in zip(chunks, docs):
                chunk['vector'] = doc.vector

    # Vectorized cosine similarity between the query and every chunk.
    chunk_vectors = np.array([chunk['vector'] for chunk in chunks])
    chunk_norms = np.linalg.norm(chunk_vectors, axis=1) + 1e-8  # epsilon avoids division by zero
    similarities = np.dot(chunk_vectors, query_vector) / (chunk_norms * query_norm)

    # Keep only chunks above the threshold, best matches first.
    relevant_chunks = [
        (chunk, sim) for chunk, sim in zip(chunks, similarities) if sim > similarity_threshold
    ]
    relevant_chunks.sort(key=lambda x: x[1], reverse=True)
    return relevant_chunks[:top_n]
@traceable(run_type="llm", name="nlp_rag")
def query_rag(chat_llm, query, relevant_results, callbacks=None):
    """
    Generate a response using retrieval-augmented generation (RAG) based on relevant results.

    Args:
        chat_llm: The chat language model to use (must expose .invoke()).
        query (str): The user's query.
        relevant_results (list): List of (chunk, similarity) tuples from semantic search.
        callbacks (list | None): Optional callbacks; defaults to an empty list.
            NOTE(review): not referenced in the body — presumably reserved for
            the LLM invocation; confirm against callers.

    Returns:
        str: The generated response text.
    """
    # None sentinel instead of a mutable [] default: a shared list default
    # would be reused (and could be mutated) across calls.
    if callbacks is None:
        callbacks = []
    prompt = build_rag_prompt(query, relevant_results)
    response = chat_llm.invoke(prompt).content
    return response
def build_rag_prompt(query, relevant_results):
    """Render the RAG prompt for *query* from the retrieved chunks."""
    import web_rag as wr

    context = format_docs(relevant_results)
    template = wr.get_rag_prompt_template()
    return template.format(query=query, context=context)
def format_docs(relevant_results):
    """
    Convert relevant search results into a JSON-formatted string.

    Args:
        relevant_results (list): List of (chunk, similarity) tuples, where each
            chunk is a dict with 'text' and a 'metadata' dict.

    Returns:
        str: JSON array (indent=2) of {"content", "link", "title"} objects.
    """
    import json

    # Similarity scores are dropped; only the chunk payload is serialized.
    # (Also removes the stray trailing '|' artifact that made the original
    # return statement a syntax error.)
    formatted_chunks = [
        {
            "content": chunk['text'],
            "link": chunk['metadata'].get('source', ''),
            "title": chunk['metadata'].get('title', ''),
        }
        for chunk, _ in relevant_results
    ]
    return json.dumps(formatted_chunks, indent=2)