Spaces (Paused)

dharmendra committed
Commit 0242952 · 1 Parent(s): 73ab258
Added debugging print for Hugging Face token

app.py CHANGED
```diff
@@ -18,6 +18,13 @@ app = FastAPI()
 # Get the Hugging Face API token from environment variables (BEST PRACTICE)
 HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
 
+# --- DEBUGGING LINE ADDED ---
+if HUGGINGFACEHUB_API_TOKEN:
+    print(f"HUGGINGFACEHUB_API_TOKEN found: {HUGGINGFACEHUB_API_TOKEN[:5]}...{HUGGINGFACEHUB_API_TOKEN[-5:]}")
+else:
+    print("HUGGINGFACEHUB_API_TOKEN is NOT set in environment variables.")
+# --- END DEBUGGING LINE ---
+
 if HUGGINGFACEHUB_API_TOKEN is None:
     raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable not set.")
 
```
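For reference, the masking pattern used in the new debug print (first and last five characters only) can be factored into a small helper so the raw token never reaches the logs. This is a minimal sketch, not part of the commit; `mask_secret` is a hypothetical name.

```python
import os

def mask_secret(value: str, visible: int = 5) -> str:
    """Show only the first and last few characters of a secret (illustrative helper)."""
    if value is None or len(value) <= 2 * visible:
        return "<hidden>"
    return f"{value[:visible]}...{value[-visible:]}"

# Same behavior as the debugging block above, expressed through the helper.
token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
print(f"HUGGINGFACEHUB_API_TOKEN: {mask_secret(token) if token else 'NOT set'}")
```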
```diff
@@ -28,9 +35,9 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
-    torch_dtype=torch.bfloat16,
+    torch_dtype=torch.bfloat16,
     trust_remote_code=True,
-    token=HUGGINGFACEHUB_API_TOKEN
+    token=HUGGINGFACEHUB_API_TOKEN # Ensure the token is passed here
 )
 
 if torch.backends.mps.is_available():
```
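Passing `token=` to each `from_pretrained()` call works; an alternative not used in this commit is to authenticate the whole process once at startup with `huggingface_hub.login`, after which downloads pick up the credential automatically. A minimal sketch under that assumption, reusing the same environment variable:

```python
import os
from huggingface_hub import login

# Alternative approach (not in app.py): log in once so later
# from_pretrained() calls do not need an explicit token= argument.
hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if hf_token:
    login(token=hf_token)
```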
```diff
@@ -51,13 +58,12 @@ llm = HuggingFacePipeline(pipeline=pipeline(
     model=model,
     tokenizer=tokenizer,
     max_new_tokens=512, # Allows for longer, detailed answers when required
-    return_full_text=True,
-    temperature=0.2,
-    do_sample=True,
+    return_full_text=True,
+    temperature=0.2,
+    do_sample=True,
 ))
 
 # --- UPDATED PROMPT TEMPLATE ---
-# The Llama-style chat format with <|im_start|> and <|im_end|> is generally compatible with Mistral Instruct models.
 template = """<|im_start|>system
 You are a concise and direct AI assistant named Siddhi.
 You strictly avoid asking any follow-up questions.
```
```diff
@@ -85,64 +91,50 @@ class ChatResponse(BaseModel):
 @app.post("/api/generate")
 async def generate_text(request: QuestionRequest):
     async def generate_stream():
-        # Flag to indicate when we've started streaming the AI's actual response
         started_streaming_ai_response = False
 
         try:
             response_stream = conversation.stream({"input": request.question})
 
-            # Define stop sequences for manual checking
             stop_sequences_to_check = ["Human:", "AI:", "\nHuman:", "\nAI:", "<|im_end|>"]
-            assistant_start_marker = "<|im_start|>assistant\n"
+            assistant_start_marker = "<|im_start|>assistant\n"
 
             for chunk in response_stream:
                 full_text_chunk = ""
                 if 'response' in chunk:
                     full_text_chunk = chunk['response']
                 else:
-                    full_text_chunk = str(chunk)
+                    full_text_chunk = str(chunk)
 
-                # Logic to extract only the AI's response from the full text chunk
                 if not started_streaming_ai_response:
                     if assistant_start_marker in full_text_chunk:
-                        # Split the chunk at the assistant's start marker and take the part after it
                         token_content = full_text_chunk.split(assistant_start_marker, 1)[1]
                         started_streaming_ai_response = True
                     else:
-                        # If the marker is not yet in the chunk, this chunk is still part of the prompt.
-                        # We don't yield anything yet.
                         token_content = ""
                 else:
-                    # Once we've started, all subsequent chunks are AI's response
                     token_content = full_text_chunk
 
-                # --- Manual stopping logic ---
-                # Check if the generated content contains a stop sequence.
-                # If it does, truncate the content and break the loop.
                 for stop_seq in stop_sequences_to_check:
                     if stop_seq in token_content:
-                        token_content = token_content.split(stop_seq, 1)[0]
-                        if token_content:
+                        token_content = token_content.split(stop_seq, 1)[0]
+                        if token_content:
                             yield json.dumps({"content": token_content}) + "\n"
                             await asyncio.sleep(0.01)
-                        yield json.dumps({"status": "completed"}) + "\n"
-                        return
+                        yield json.dumps({"status": "completed"}) + "\n"
+                        return
 
-                # Only yield if there's actual content to send after processing
                 if token_content:
                     yield json.dumps({"content": token_content}) + "\n"
                     await asyncio.sleep(0.01)
 
-            # Send a final completion message if the stream finishes naturally
             yield json.dumps({"status": "completed"}) + "\n"
 
         except Exception as e:
             print("Error during streaming generation:")
             traceback.print_exc()
-            # Yield error message in JSON format
             yield json.dumps({"error": str(e)}) + "\n"
 
-    # Return a StreamingResponse with application/json media type
     return StreamingResponse(generate_stream(), media_type="application/json")
 
 if __name__ == "__main__":
```
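The manual stopping logic in the streaming loop is easy to check in isolation. Below is a sketch that pulls it out into a standalone function; `truncate_at_stop` and the sample strings are illustrative, not part of app.py.

```python
def truncate_at_stop(text: str, stop_sequences: list[str]) -> tuple[str, bool]:
    """Return text up to the first stop sequence found, plus whether one was hit (illustrative helper)."""
    for stop_seq in stop_sequences:
        if stop_seq in text:
            return text.split(stop_seq, 1)[0], True
    return text, False

stops = ["Human:", "AI:", "\nHuman:", "\nAI:", "<|im_end|>"]
print(truncate_at_stop("The answer is 42.<|im_end|>", stops))  # ('The answer is 42.', True)
print(truncate_at_stop("Still streaming", stops))              # ('Still streaming', False)
```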
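Since /api/generate streams newline-delimited JSON objects ({"content": ...} chunks, then {"status": "completed"}, or {"error": ...} on failure), a client can consume it line by line. A minimal sketch using `requests`; the base URL is a placeholder assumption, not taken from the repository.

```python
import json
import requests

# Placeholder endpoint; replace with the deployed Space's URL.
url = "http://localhost:7860/api/generate"

with requests.post(url, json={"question": "What is the capital of France?"}, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if not line:
            continue
        event = json.loads(line)
        if "content" in event:
            print(event["content"], end="", flush=True)   # stream tokens as they arrive
        elif event.get("status") == "completed":
            print()                                        # end of stream
        elif "error" in event:
            raise RuntimeError(event["error"])
```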