# build_index.py
import os

import requests
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
    UnstructuredHTMLLoader,
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

DOCS_PATH = "docs"
INDEX_PATH = "faiss_index"


def fetch_html_with_timeout(url: str, timeout=5) -> list[Document]:
    """
    Download the page content with a timeout, then parse it using
    UnstructuredHTMLLoader. Returns a list of Documents (one or more,
    depending on how the page is parsed; they can be split further later).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # raise HTTPError if not 200
    except Exception as e:
        print(f"[Timeout/Fetch Error] Skipping {url}: {e}")
        return []

    # Write the HTML to a temporary file so we can load it with
    # UnstructuredHTMLLoader (unstructured could work in memory, but a
    # temp file keeps this simple).
    temp_filename = "temp_html_file.html"
    with open(temp_filename, "w", encoding="utf-8") as f:
        f.write(response.text)

    loader = UnstructuredHTMLLoader(temp_filename)
    docs = loader.load()  # returns a list of Document objects

    # Clean up the temp file and record the original URL as the source.
    os.remove(temp_filename)
    for doc in docs:
        doc.metadata["source"] = url
    return docs


def load_web_docs(urls: list[str], timeout=5) -> list[Document]:
    all_docs = []
    for url in urls:
        print(f"Fetching: {url}")
        docs_from_url = fetch_html_with_timeout(url, timeout=timeout)
        all_docs.extend(docs_from_url)
    return all_docs


def load_documents(docs_path=DOCS_PATH):
    all_docs = []
    for file_name in os.listdir(docs_path):
        file_path = os.path.join(docs_path, file_name)
        print(f"Processing file: {file_name}")  # Debug log

        # 1) Text files
        if file_name.lower().endswith(".txt"):
            print("   -> Loading as .txt")
            loader = TextLoader(file_path, encoding="utf-8")
            loaded_docs = loader.load()
            all_docs.extend(loaded_docs)
            print(f"   -> Loaded {len(loaded_docs)} docs from {file_name}")

        # 2) PDFs
        elif file_name.lower().endswith(".pdf"):
            print("   -> Loading as .pdf")
            loader = PyPDFLoader(file_path)
            pdf_docs = loader.load_and_split()
            all_docs.extend(pdf_docs)
            print(f"   -> Loaded {len(pdf_docs)} docs from {file_name}")

        # 3) URL lists (one URL per line in a .urls file)
        elif file_name.lower().endswith(".urls"):
            print("   -> Loading as .urls")
            with open(file_path, "r", encoding="utf-8") as f:
                urls = [line.strip() for line in f if line.strip()]
            print(f"   -> Found {len(urls)} URLs in {file_name}")
            if urls:
                web_docs = load_web_docs(urls, timeout=5)
                print(f"   -> Loaded {len(web_docs)} web docs from URLs")
                all_docs.extend(web_docs)

        else:
            print("   -> Skipped: unrecognized file type.")

    return all_docs


def build_faiss_index():
    documents = load_documents()

    # Chunk the documents so each embedded passage stays small.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splitted_docs = text_splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cuda"},
    )

    vectorstore = FAISS.from_documents(splitted_docs, embeddings)

    os.makedirs(INDEX_PATH, exist_ok=True)
    vectorstore.save_local(INDEX_PATH)
    print(f"Vector index saved to {INDEX_PATH}")


if __name__ == "__main__":
    build_faiss_index()
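
# --- Usage sketch (query side, not executed by this script) -----------------
# A minimal sketch of how the saved index could be loaded back for retrieval,
# assuming the same embedding model and that the index directory is the
# INDEX_PATH used above. Note that recent langchain_community versions may
# require allow_dangerous_deserialization=True when loading a local FAISS
# index, since the docstore is pickled.
#
#   from langchain_community.embeddings import HuggingFaceEmbeddings
#   from langchain_community.vectorstores import FAISS
#
#   embeddings = HuggingFaceEmbeddings(
#       model_name="sentence-transformers/all-MiniLM-L6-v2"
#   )
#   vectorstore = FAISS.load_local(
#       "faiss_index", embeddings, allow_dangerous_deserialization=True
#   )
#   results = vectorstore.similarity_search("example query", k=3)
#   for doc in results:
#       print(doc.metadata.get("source"), doc.page_content[:100])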