Spaces:

Trigger82
/

Work

Sleeping

App Files Files Community

Trigger82 committed on May 31

Commit

0ff477a

verified ·

1 Parent(s): 41c21f1

Rename main.py to app.py

Browse files

Files changed (2) hide show

app.py +32 -0
main.py +0 -65

app.py ADDED Viewed

	@@ -0,0 +1,32 @@

+# app.py
+"""FastAPI service exposing the Phi-1.5 Instruct model through GET /chat."""
+import os
+
+# Ensure cache env vars point to a writable directory (same as Dockerfile).
+# This must run BEFORE transformers is imported: the Hugging Face libraries
+# read HF_HOME / TRANSFORMERS_CACHE when they are first imported, so assigning
+# the variables after the import would not change where files are cached.
+home = os.environ.get("HOME", "/home/user")
+cache_dir = os.path.join(home, ".cache", "huggingface")
+os.makedirs(cache_dir, exist_ok=True)
+os.environ["HF_HOME"] = cache_dir
+os.environ["TRANSFORMERS_CACHE"] = cache_dir
+
+# Imported after the environment setup on purpose (see the note above).
+from fastapi import FastAPI  # noqa: E402
+from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: E402
+
+# Load the model and tokenizer once at import time; every request reuses them.
+model_id = "rasyosef/Phi-1_5-Instruct-v0.1"
+model = AutoModelForCausalLM.from_pretrained(model_id)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+app = FastAPI()
+@app.get("/chat")
+def chat(query: str):
+    """Answer a single user message with the Phi-1.5 Instruct model.
+
+    Use: GET /chat?query=Your question
+
+    Args:
+        query: The user's message.
+
+    Returns:
+        A JSON object ``{"answer": "..."}`` holding only the newly
+        generated text, stripped of surrounding whitespace.
+
+    Raises:
+        HTTPException: 400 if ``query`` is empty or only whitespace.
+    """
+    # Local import: the module header only imports ``FastAPI`` itself.
+    from fastapi import HTTPException
+
+    if not query.strip():
+        raise HTTPException(status_code=400, detail="Query parameter 'query' is required.")
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": query},
+    ]
+    # Render the prompt with the tokenizer's own chat template instead of
+    # concatenating ChatML markers by hand, so turn separators and special
+    # tokens match what the model expects.
+    inputs = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+    outputs = model.generate(**inputs, max_new_tokens=200)
+    # Decode only the newly generated tokens (skip the prompt tokens).
+    prompt_length = inputs["input_ids"].shape[-1]
+    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
+    return {"answer": response.strip()}

main.py DELETED Viewed

@@ -1,65 +0,0 @@
-import os
-# Limit parallelism to fit 2 CPU cores
-# NOTE(review): these variables are assigned before the third-party imports
-# below, presumably so the libraries see them while initialising — confirm
-# before reordering.
-os.environ["OMP_NUM_THREADS"] = "2"
-os.environ["MKL_NUM_THREADS"] = "2"
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-from fastapi import FastAPI, HTTPException
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-import gradio as gr
-# Load the Phi-1.5 Instruct model (1.3B) from Hugging Face
-model_id = "rasyosef/Phi-1_5-Instruct-v0.1"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id)
-# Text-generation pipeline shared by the REST endpoint and the Gradio callback.
-pipe = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer
-)
-# FastAPI application; the /chat route below is registered on it.
-app = FastAPI()
-@app.get("/chat")
-def chat(query: str):
-    """
-    REST API endpoint. Use: GET /chat?query=Your question
-    Returns a JSON {"response": "..."}.
-    """
-    # Guard clause: an empty query is a client error.
-    if not query:
-        raise HTTPException(status_code=400, detail="Query parameter 'query' is required.")
-    # Chat-style prompt: a fixed system turn followed by the caller's text.
-    system_turn = {"role": "system", "content": "You are a helpful assistant."}
-    user_turn = {"role": "user", "content": query}
-    generation_options = {
-        "max_new_tokens": 100,
-        "do_sample": False,
-        "return_full_text": False,
-    }
-    outputs = pipe([system_turn, user_turn], **generation_options)
-    # The pipeline returns a list; the reply text is in its first entry.
-    first_output = outputs[0]
-    return {"response": first_output["generated_text"].strip()}
-# Gradio callback (optional UI): turns the textbox content into a model reply.
-def gradio_chat(input_text):
-    # Only call the model when there is input; otherwise return "".
-    if input_text:
-        conversation = [
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": input_text},
-        ]
-        generated = pipe(
-            conversation,
-            max_new_tokens=100,
-            do_sample=False,
-            return_full_text=False,
-        )
-        reply = generated[0]["generated_text"]
-        return reply.strip()
-    return ""
-# Gradio front end: a two-line textbox in, plain text out, backed by gradio_chat.
-iface = gr.Interface(
-    title="Phi-1.5 Chatbot",
-    description="Enter a message and press **Submit** to get a response.",
-    fn=gradio_chat,
-    inputs=gr.Textbox(lines=2, placeholder="Type a message..."),
-    outputs="text",
-)
-# Serve the UI from "/" so it does not conflict with the /chat REST route;
-# the returned application replaces the module-level ``app``.
-app = gr.mount_gradio_app(app, iface, path="/")