from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from datasets import load_dataset
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss
import json
import os
from huggingface_hub import login, snapshot_download
from huggingface_hub.utils import configure_http_backend
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import requests

app = FastAPI()
# Set HF_HOME to a writable directory
os.environ["HF_HOME"] = "/app/.cache"
# Configure requests with retries and a longer timeout
def custom_http_backend():
    """Build a requests Session with retries for Hugging Face Hub downloads."""
    session = requests.Session()
    retries = Retry(
        total=3,  # Retry up to 3 times
        backoff_factor=1,  # Wait 1, 2, 4 seconds between retries
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("https://", adapter)
    session.timeout = 60  # Advisory only: requests ignores Session.timeout; huggingface_hub sets its own per-request timeouts
    return session
# Set the custom HTTP backend for huggingface_hub
configure_http_backend(backend_factory=custom_http_backend)
# Log in to Hugging Face
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable not set")
login(hf_token)
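# Note: meta-llama/Llama-3.2-1B-Instruct is a gated repository, so the token above
# must belong to an account that has accepted the Llama 3.2 license.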
# Load Dataset and Prepare Knowledge Base
ds = load_dataset("codeparrot/apps", "all", split="train")
os.makedirs("knowledge_base", exist_ok=True)
for i, example in enumerate(ds.select(range(50))):  # Limit to 50 problems to keep memory usage low
    # In codeparrot/apps the "solutions" field is a JSON-encoded list of strings
    try:
        solutions = json.loads(example["solutions"]) if example["solutions"] else []
    except (json.JSONDecodeError, TypeError):
        solutions = []
    solution = solutions[0] if solutions else "No solution available"
    with open(f"knowledge_base/doc_{i}.txt", "w", encoding="utf-8") as f:
        f.write(f"### Problem\n{example['question']}\n\n### Solution\n{solution}")
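# Each .txt file written above becomes one Document for the index built below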
documents = SimpleDirectoryReader("knowledge_base").load_data()
# Setup RAG
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
Settings.embed_model = embed_model
d = 384  # Embedding dimension of all-MiniLM-L6-v2
faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index)
# Attach the FAISS store through a StorageContext; passing vector_store= directly is ignored
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
# Load LLaMA Model (without quantization, on CPU)
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "cpu" # Force CPU usage
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",  # Explicitly map to CPU
    torch_dtype=torch.float32,  # Use float32 for CPU compatibility
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
@app.get("/solve")
async def solve_problem(problem: str, top_k: int = 1):
    retriever = index.as_retriever(similarity_top_k=top_k)
    retrieved_nodes = retriever.retrieve(problem)
    context = retrieved_nodes[0].text if retrieved_nodes else "No relevant context found."
    prompt = (
        f"Given the following competitive programming problem:\n\n{problem}\n\n"
        f"Relevant context:\n{context}\n\nGenerate a solution in Python:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    # Decode only the newly generated tokens so the prompt is not echoed back
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    solution = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return {"solution": solution, "context": context}
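# Example usage (assumes this file is main.py and the typical HF Spaces port 7860):
#   uvicorn main:app --host 0.0.0.0 --port 7860
#   curl "http://localhost:7860/solve?problem=Print%20the%20sum%20of%20two%20integers&top_k=1"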