import os

# Set HF_HOME to a writable directory *before* importing the Hugging Face
# libraries, so their cache paths pick it up.
os.environ["HF_HOME"] = "/app/.cache"

import json

from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from datasets import load_dataset
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss
from huggingface_hub import login, snapshot_download
from huggingface_hub.utils import configure_http_backend
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import requests

app = FastAPI()
# Configure requests with retries and a longer timeout.
# requests sessions do not honour a plain `session.timeout` attribute, so the
# timeout is applied through a small HTTPAdapter subclass instead.
class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, *args, timeout=60, **kwargs):
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        kwargs.setdefault("timeout", self.timeout)
        return super().send(request, **kwargs)


def custom_http_backend():
    session = requests.Session()
    retries = Retry(
        total=3,  # Retry 3 times
        backoff_factor=1,  # Wait 1, 2, 4 seconds between retries
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
    )
    adapter = TimeoutHTTPAdapter(max_retries=retries, timeout=60)  # 60-second timeout per request
    session.mount("https://", adapter)
    return session
# Set the custom HTTP backend for huggingface_hub
configure_http_backend(backend_factory=custom_http_backend)
# Log in to Hugging Face
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable not set")
login(hf_token)
# Load Dataset and Prepare Knowledge Base
ds = load_dataset("codeparrot/apps", "all", split="train")
os.makedirs("knowledge_base", exist_ok=True)
for i, example in enumerate(ds.select(range(50))):  # Reduced to 50 for memory
    # In codeparrot/apps, "solutions" is a JSON-encoded list of solution strings.
    raw_solutions = example["solutions"]
    try:
        solutions = json.loads(raw_solutions) if isinstance(raw_solutions, str) else raw_solutions
    except json.JSONDecodeError:
        solutions = []
    solution = solutions[0] if solutions else "No solution available"
    with open(f"knowledge_base/doc_{i}.txt", "w", encoding="utf-8") as f:
        f.write(f"### Problem\n{example['question']}\n\n### Solution\n{solution}")
documents = SimpleDirectoryReader("knowledge_base").load_data()
# Setup RAG
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
Settings.embed_model = embed_model
d = 384  # Embedding dimension of all-MiniLM-L6-v2
faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
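
# Optional debugging helper (hypothetical; name and signature are illustrative):
# preview what the retriever returns for a query before serving requests.
def preview_retrieval(query: str, top_k: int = 3):
    retriever = index.as_retriever(similarity_top_k=top_k)
    return [(node.score, node.text[:200]) for node in retriever.retrieve(query)]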
# Load LLaMA Model (without quantization, on CPU)
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = "cpu"  # Force CPU usage
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",  # Explicitly map to CPU
    torch_dtype=torch.float32,  # Use float32 for CPU compatibility
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
async def solve_problem(problem: str, top_k: int = 1):
    retriever = index.as_retriever(similarity_top_k=top_k)
    retrieved_nodes = retriever.retrieve(problem)
    context = retrieved_nodes[0].text if retrieved_nodes else "No relevant context found."
    prompt = f"Given the following competitive programming problem:\n\n{problem}\n\nRelevant context:\n{context}\n\nGenerate a solution in Python:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    solution = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"solution": solution, "context": context}