# build_index.py
import os

import requests
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
    UnstructuredHTMLLoader,
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

DOCS_PATH = "docs"
INDEX_PATH = "faiss_index"


def fetch_html_with_timeout(url: str, timeout=5) -> list[Document]:
    """
    Download the page content with a timeout, then parse it using
    UnstructuredHTMLLoader. Returns a list of Documents (one or more,
    depending on how the page is parsed; they can be split further later).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # raise HTTPError if not 200
    except Exception as e:
        print(f"[Timeout/Fetch Error] Skipping {url}: {e}")
        return []

    # Write the HTML to a temporary file so we can load it with
    # UnstructuredHTMLLoader (unstructured could work in memory, but a
    # temp file keeps this simple).
    temp_filename = "temp_html_file.html"
    with open(temp_filename, "w", encoding="utf-8") as f:
        f.write(response.text)

    loader = UnstructuredHTMLLoader(temp_filename)
    docs = loader.load()  # returns a list of Document objects

    # Clean up the temp file and record the original URL as the source.
    os.remove(temp_filename)
    for doc in docs:
        doc.metadata["source"] = url
    return docs


def load_web_docs(urls: list[str], timeout=5) -> list[Document]:
    all_docs = []
    for url in urls:
        print(f"Fetching: {url}")
        docs_from_url = fetch_html_with_timeout(url, timeout=timeout)
        all_docs.extend(docs_from_url)
    return all_docs


def load_documents(docs_path=DOCS_PATH):
    all_docs = []
    for file_name in os.listdir(docs_path):
        file_path = os.path.join(docs_path, file_name)
        print(f"Processing file: {file_name}")  # Debug log

        # 1) Text files
        if file_name.lower().endswith(".txt"):
            print("   -> Loading as .txt")
            loader = TextLoader(file_path, encoding="utf-8")
            loaded_docs = loader.load()
            all_docs.extend(loaded_docs)
            print(f"   -> Loaded {len(loaded_docs)} docs from {file_name}")

        # 2) PDFs
        elif file_name.lower().endswith(".pdf"):
            print("   -> Loading as .pdf")
            loader = PyPDFLoader(file_path)
            pdf_docs = loader.load_and_split()
            all_docs.extend(pdf_docs)
            print(f"   -> Loaded {len(pdf_docs)} docs from {file_name}")

        # 3) URL lists (one URL per line in a .urls file)
        elif file_name.lower().endswith(".urls"):
            print("   -> Loading as .urls")
            with open(file_path, "r", encoding="utf-8") as f:
                urls = [line.strip() for line in f if line.strip()]
            print(f"   -> Found {len(urls)} URLs in {file_name}")
            if urls:
                web_docs = load_web_docs(urls, timeout=5)
                print(f"   -> Loaded {len(web_docs)} web docs from URLs")
                all_docs.extend(web_docs)

        else:
            print("   -> Skipped: unrecognized file type.")

    return all_docs


def build_faiss_index():
    documents = load_documents()

    # Chunk the documents so each embedded passage stays small.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splitted_docs = text_splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cuda"},
    )

    vectorstore = FAISS.from_documents(splitted_docs, embeddings)

    os.makedirs(INDEX_PATH, exist_ok=True)
    vectorstore.save_local(INDEX_PATH)
    print(f"Vector index saved to {INDEX_PATH}")


if __name__ == "__main__":
    build_faiss_index()
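
# --- Usage sketch (query side, not executed by this script) -----------------
# A minimal sketch of how the saved index could be loaded back for retrieval,
# assuming the same embedding model and that the index directory is the
# INDEX_PATH used above. Note that recent langchain_community versions may
# require allow_dangerous_deserialization=True when loading a local FAISS
# index, since the docstore is pickled.
#
#   from langchain_community.embeddings import HuggingFaceEmbeddings
#   from langchain_community.vectorstores import FAISS
#
#   embeddings = HuggingFaceEmbeddings(
#       model_name="sentence-transformers/all-MiniLM-L6-v2"
#   )
#   vectorstore = FAISS.load_local(
#       "faiss_index", embeddings, allow_dangerous_deserialization=True
#   )
#   results = vectorstore.similarity_search("example query", k=3)
#   for doc in results:
#       print(doc.metadata.get("source"), doc.page_content[:100])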