dharmendra committed
Commit 0242952 · 1 Parent(s): 73ab258

Added debugging print for Hugging Face token

Files changed (1)
  1. app.py +18 -26
app.py CHANGED
@@ -18,6 +18,13 @@ app = FastAPI()
 # Get the Hugging Face API token from environment variables (BEST PRACTICE)
 HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
 
+# --- DEBUGGING LINE ADDED ---
+if HUGGINGFACEHUB_API_TOKEN:
+    print(f"HUGGINGFACEHUB_API_TOKEN found: {HUGGINGFACEHUB_API_TOKEN[:5]}...{HUGGINGFACEHUB_API_TOKEN[-5:]}")
+else:
+    print("HUGGINGFACEHUB_API_TOKEN is NOT set in environment variables.")
+# --- END DEBUGGING LINE ---
+
 if HUGGINGFACEHUB_API_TOKEN is None:
     raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable not set.")
 
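As a quick way to exercise the added check outside the app, the sketch below repeats the masked print from this hunk; the huggingface_hub.whoami call is an extra assumption (not part of this commit) used to confirm the Hub actually accepts the token.

import os
from huggingface_hub import whoami  # assumption: huggingface_hub is installed (transformers already depends on it)

HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

if HUGGINGFACEHUB_API_TOKEN:
    # Same masking as the commit: never print the full secret, only the first and last 5 characters
    print(f"HUGGINGFACEHUB_API_TOKEN found: {HUGGINGFACEHUB_API_TOKEN[:5]}...{HUGGINGFACEHUB_API_TOKEN[-5:]}")
    try:
        # Extra check (assumption, not in the commit): ask the Hub which account the token belongs to
        print(f"Token accepted for user: {whoami(token=HUGGINGFACEHUB_API_TOKEN)['name']}")
    except Exception as exc:
        print(f"Token present but rejected by the Hub: {exc}")
else:
    print("HUGGINGFACEHUB_API_TOKEN is NOT set in environment variables.")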
@@ -28,9 +35,9 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
-    torch_dtype=torch.bfloat16, # torch.bfloat16 is generally good, can try torch.float16 if issues arise with Mistral
+    torch_dtype=torch.bfloat16,
     trust_remote_code=True,
-    token=HUGGINGFACEHUB_API_TOKEN
+    token=HUGGINGFACEHUB_API_TOKEN # Ensure the token is passed here
 )
 
 if torch.backends.mps.is_available():
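Passing token=HUGGINGFACEHUB_API_TOKEN directly to from_pretrained, as this hunk does, is one option. A sketch of an alternative (an assumption, not what the commit does) is to register the token once per process, so later Hub downloads pick it up without the explicit argument:

import os
from huggingface_hub import login

# Assumption: register the token globally instead of passing token=... to each from_pretrained call
login(token=os.environ["HUGGINGFACEHUB_API_TOKEN"])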
@@ -51,13 +58,12 @@ llm = HuggingFacePipeline(pipeline=pipeline(
     model=model,
     tokenizer=tokenizer,
     max_new_tokens=512, # Allows for longer, detailed answers when required
-    return_full_text=True, # Important for manual slicing of AI's response
-    temperature=0.2, # Controls randomness (0.0 for deterministic, 1.0 for very creative)
-    do_sample=True, # Enable sampling for more varied outputs
+    return_full_text=True,
+    temperature=0.2,
+    do_sample=True,
 ))
 
 # --- UPDATED PROMPT TEMPLATE ---
-# The Llama-style chat format with <|im_start|> and <|im_end|> is generally compatible with Mistral Instruct models.
 template = """<|im_start|>system
 You are a concise and direct AI assistant named Siddhi.
 You strictly avoid asking any follow-up questions.
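The conversation object streamed from in the next hunk is defined outside these context lines; a hypothetical wiring consistent with the names used above (template and llm) might look like the sketch below — the memory choice and the {history}/{input} placeholders are assumptions, not shown in this diff.

from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

# Assumption: the full template (truncated above) exposes {history} and {input} placeholders
prompt = PromptTemplate(input_variables=["history", "input"], template=template)
conversation = ConversationChain(llm=llm, prompt=prompt, memory=ConversationBufferMemory())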
@@ -85,64 +91,50 @@ class ChatResponse(BaseModel):
 @app.post("/api/generate")
 async def generate_text(request: QuestionRequest):
     async def generate_stream():
-        # Flag to indicate when we've started streaming the AI's actual response
         started_streaming_ai_response = False
 
         try:
             response_stream = conversation.stream({"input": request.question})
 
-            # Define stop sequences for manual checking
             stop_sequences_to_check = ["Human:", "AI:", "\nHuman:", "\nAI:", "<|im_end|>"]
-            assistant_start_marker = "<|im_start|>assistant\n" # Marker from the prompt template
+            assistant_start_marker = "<|im_start|>assistant\n"
 
             for chunk in response_stream:
                 full_text_chunk = ""
                 if 'response' in chunk:
                     full_text_chunk = chunk['response']
                 else:
-                    full_text_chunk = str(chunk) # Fallback for unexpected chunk format
+                    full_text_chunk = str(chunk)
 
-                # Logic to extract only the AI's response from the full text chunk
                 if not started_streaming_ai_response:
                     if assistant_start_marker in full_text_chunk:
-                        # Split the chunk at the assistant's start marker and take the part after it
                         token_content = full_text_chunk.split(assistant_start_marker, 1)[1]
                         started_streaming_ai_response = True
                     else:
-                        # If the marker is not yet in the chunk, this chunk is still part of the prompt.
-                        # We don't yield anything yet.
                         token_content = ""
                 else:
-                    # Once we've started, all subsequent chunks are AI's response
                     token_content = full_text_chunk
 
-                # --- Manual stopping logic ---
-                # Check if the generated content contains a stop sequence.
-                # If it does, truncate the content and break the loop.
                 for stop_seq in stop_sequences_to_check:
                     if stop_seq in token_content:
-                        token_content = token_content.split(stop_seq, 1)[0] # Truncate at the stop sequence
-                        if token_content: # Yield any content before stop sequence
+                        token_content = token_content.split(stop_seq, 1)[0]
+                        if token_content:
                             yield json.dumps({"content": token_content}) + "\n"
                             await asyncio.sleep(0.01)
-                        yield json.dumps({"status": "completed"}) + "\n" # Signal completion
-                        return # Exit the generator function
+                        yield json.dumps({"status": "completed"}) + "\n"
+                        return
 
-                # Only yield if there's actual content to send after processing
                 if token_content:
                     yield json.dumps({"content": token_content}) + "\n"
                     await asyncio.sleep(0.01)
 
-            # Send a final completion message if the stream finishes naturally
             yield json.dumps({"status": "completed"}) + "\n"
 
         except Exception as e:
             print("Error during streaming generation:")
             traceback.print_exc()
-            # Yield error message in JSON format
             yield json.dumps({"error": str(e)}) + "\n"
 
-    # Return a StreamingResponse with application/json media type
     return StreamingResponse(generate_stream(), media_type="application/json")
 
 if __name__ == "__main__":
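Each yielded line is a standalone JSON object ({"content": ...} chunks, a final {"status": "completed"}, or {"error": ...} on failure), so a client reads the response line by line. A minimal consumer sketch, assuming the app is served locally on port 8000 (the uvicorn call is truncated above); the endpoint and the "question" payload field match the diff:

import json
import requests

# Assumptions: host/port and the example question; endpoint and payload field come from the diff above
with requests.post("http://localhost:8000/api/generate",
                   json={"question": "What is FastAPI?"}, stream=True) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        msg = json.loads(line)
        if "content" in msg:
            print(msg["content"], end="", flush=True)
        elif "error" in msg:
            print(f"\n[error] {msg['error']}")
        elif msg.get("status") == "completed":
            break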
 