from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from datasets import load_dataset
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss
import os
import json
from huggingface_hub import login
from huggingface_hub.utils import configure_http_backend
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import requests

app = FastAPI()

# Point the Hugging Face cache at a writable directory (e.g. inside a container)
os.environ["HF_HOME"] = "/app/.cache"


# Configure the requests session used by huggingface_hub with retries
def custom_http_backend():
    session = requests.Session()
    retries = Retry(
        total=3,                                      # retry up to 3 times
        backoff_factor=1,                             # wait 1, 2, 4 seconds between retries
        status_forcelist=[429, 500, 502, 503, 504],   # retry on these HTTP status codes
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("https://", adapter)
    # Note: requests does not read a `timeout` attribute automatically; huggingface_hub
    # applies its own per-request timeouts, so this assignment is informational only.
    session.timeout = 60
    return session


# Register the custom HTTP backend with huggingface_hub
configure_http_backend(backend_factory=custom_http_backend)

# Log in to Hugging Face (required for gated models such as Llama 3.2)
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable not set")
login(hf_token)

# Load the dataset and build a small on-disk knowledge base
ds = load_dataset("codeparrot/apps", "all", split="train")
os.makedirs("knowledge_base", exist_ok=True)
for i, example in enumerate(ds.select(range(50))):  # limited to 50 examples to keep memory low
    # In codeparrot/apps the "solutions" field is a JSON-encoded list of strings,
    # so parse it instead of indexing into the raw string.
    raw_solutions = example["solutions"]
    try:
        solutions = json.loads(raw_solutions) if isinstance(raw_solutions, str) else raw_solutions
    except json.JSONDecodeError:
        solutions = []
    solution = solutions[0] if solutions else "No solution available"
    with open(f"knowledge_base/doc_{i}.txt", "w", encoding="utf-8") as f:
        f.write(f"### Problem\n{example['question']}\n\n### Solution\n{solution}")
documents = SimpleDirectoryReader("knowledge_base").load_data()

# Set up RAG: embedding model + FAISS vector store
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
Settings.embed_model = embed_model
d = 384  # embedding dimension of all-MiniLM-L6-v2
faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index)
# The FAISS store must be attached through a StorageContext; otherwise
# VectorStoreIndex.from_documents() falls back to the default in-memory store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

# Load the Llama model (no quantization, CPU only)
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "cpu"  # force CPU usage
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",            # explicitly map to CPU
    torch_dtype=torch.float32,   # float32 for CPU compatibility
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


@app.get("/solve")
async def solve_problem(problem: str, top_k: int = 1):
    # Retrieve the top_k most similar problems from the knowledge base
    retriever = index.as_retriever(similarity_top_k=top_k)
    retrieved_nodes = retriever.retrieve(problem)
    # Use all retrieved nodes as context, not just the first one
    context = (
        "\n\n".join(node.text for node in retrieved_nodes)
        if retrieved_nodes
        else "No relevant context found."
    )
    prompt = (
        f"Given the following competitive programming problem:\n\n{problem}\n\n"
        f"Relevant context:\n{context}\n\nGenerate a solution in Python:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )
    solution = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"solution": solution, "context": context}
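# Example usage (a minimal sketch; assumes this file is saved as main.py and that
# uvicorn is installed -- adjust the module name, host, and port to your setup):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   curl -G "http://localhost:8000/solve" \
#        --data-urlencode "problem=Given an array of integers, return the maximum subarray sum." \
#        --data-urlencode "top_k=2"
#
# The endpoint returns a JSON object with the generated "solution" and the retrieved "context".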