Spaces (Paused)

dharmendra committed
Commit 0242952 · 1 Parent(s): 73ab258
Added debugging print for Hugging Face token

app.py CHANGED
```diff
@@ -18,6 +18,13 @@ app = FastAPI()
 # Get the Hugging Face API token from environment variables (BEST PRACTICE)
 HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
 
+# --- DEBUGGING LINE ADDED ---
+if HUGGINGFACEHUB_API_TOKEN:
+    print(f"HUGGINGFACEHUB_API_TOKEN found: {HUGGINGFACEHUB_API_TOKEN[:5]}...{HUGGINGFACEHUB_API_TOKEN[-5:]}")
+else:
+    print("HUGGINGFACEHUB_API_TOKEN is NOT set in environment variables.")
+# --- END DEBUGGING LINE ---
+
 if HUGGINGFACEHUB_API_TOKEN is None:
     raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable not set.")
 
```
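For reference, the masking pattern used in the new debug print (first and last five characters only) can be factored into a small helper so the raw token never reaches the logs. This is a minimal sketch, not part of the commit; `mask_secret` is a hypothetical name.

```python
import os

def mask_secret(value: str, visible: int = 5) -> str:
    """Show only the first and last few characters of a secret (illustrative helper)."""
    if value is None or len(value) <= 2 * visible:
        return "<hidden>"
    return f"{value[:visible]}...{value[-visible:]}"

# Same behavior as the debugging block above, expressed through the helper.
token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
print(f"HUGGINGFACEHUB_API_TOKEN: {mask_secret(token) if token else 'NOT set'}")
```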
```diff
@@ -28,9 +35,9 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
-    torch_dtype=torch.bfloat16,
+    torch_dtype=torch.bfloat16,
     trust_remote_code=True,
-    token=HUGGINGFACEHUB_API_TOKEN
+    token=HUGGINGFACEHUB_API_TOKEN # Ensure the token is passed here
 )
 
 if torch.backends.mps.is_available():
```
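Passing `token=` to each `from_pretrained()` call works; an alternative not used in this commit is to authenticate the whole process once at startup with `huggingface_hub.login`, after which downloads pick up the credential automatically. A minimal sketch under that assumption, reusing the same environment variable:

```python
import os
from huggingface_hub import login

# Alternative approach (not in app.py): log in once so later
# from_pretrained() calls do not need an explicit token= argument.
hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if hf_token:
    login(token=hf_token)
```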
```diff
@@ -51,13 +58,12 @@ llm = HuggingFacePipeline(pipeline=pipeline(
     model=model,
     tokenizer=tokenizer,
     max_new_tokens=512, # Allows for longer, detailed answers when required
-    return_full_text=True,
-    temperature=0.2,
-    do_sample=True,
+    return_full_text=True,
+    temperature=0.2,
+    do_sample=True,
 ))
 
 # --- UPDATED PROMPT TEMPLATE ---
-# The Llama-style chat format with <|im_start|> and <|im_end|> is generally compatible with Mistral Instruct models.
 template = """<|im_start|>system
 You are a concise and direct AI assistant named Siddhi.
 You strictly avoid asking any follow-up questions.
```
```diff
@@ -85,64 +91,50 @@ class ChatResponse(BaseModel):
 @app.post("/api/generate")
 async def generate_text(request: QuestionRequest):
     async def generate_stream():
-        # Flag to indicate when we've started streaming the AI's actual response
         started_streaming_ai_response = False
 
         try:
             response_stream = conversation.stream({"input": request.question})
 
-            # Define stop sequences for manual checking
             stop_sequences_to_check = ["Human:", "AI:", "\nHuman:", "\nAI:", "<|im_end|>"]
-            assistant_start_marker = "<|im_start|>assistant\n"
+            assistant_start_marker = "<|im_start|>assistant\n"
 
             for chunk in response_stream:
                 full_text_chunk = ""
                 if 'response' in chunk:
                     full_text_chunk = chunk['response']
                 else:
-                    full_text_chunk = str(chunk)
+                    full_text_chunk = str(chunk)
 
-                # Logic to extract only the AI's response from the full text chunk
                 if not started_streaming_ai_response:
                     if assistant_start_marker in full_text_chunk:
-                        # Split the chunk at the assistant's start marker and take the part after it
                         token_content = full_text_chunk.split(assistant_start_marker, 1)[1]
                         started_streaming_ai_response = True
                     else:
-                        # If the marker is not yet in the chunk, this chunk is still part of the prompt.
-                        # We don't yield anything yet.
                         token_content = ""
                 else:
-                    # Once we've started, all subsequent chunks are AI's response
                     token_content = full_text_chunk
 
-                # --- Manual stopping logic ---
-                # Check if the generated content contains a stop sequence.
-                # If it does, truncate the content and break the loop.
                 for stop_seq in stop_sequences_to_check:
                     if stop_seq in token_content:
-                        token_content = token_content.split(stop_seq, 1)[0]
-                        if token_content:
+                        token_content = token_content.split(stop_seq, 1)[0]
+                        if token_content:
                             yield json.dumps({"content": token_content}) + "\n"
                             await asyncio.sleep(0.01)
-                        yield json.dumps({"status": "completed"}) + "\n"
-                        return
+                        yield json.dumps({"status": "completed"}) + "\n"
+                        return
 
-                # Only yield if there's actual content to send after processing
                 if token_content:
                     yield json.dumps({"content": token_content}) + "\n"
                     await asyncio.sleep(0.01)
 
-            # Send a final completion message if the stream finishes naturally
             yield json.dumps({"status": "completed"}) + "\n"
 
         except Exception as e:
             print("Error during streaming generation:")
             traceback.print_exc()
-            # Yield error message in JSON format
             yield json.dumps({"error": str(e)}) + "\n"
 
-    # Return a StreamingResponse with application/json media type
     return StreamingResponse(generate_stream(), media_type="application/json")
 
 if __name__ == "__main__":
```
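The manual stopping logic in the streaming loop is easy to check in isolation. Below is a sketch that pulls it out into a standalone function; `truncate_at_stop` and the sample strings are illustrative, not part of app.py.

```python
def truncate_at_stop(text: str, stop_sequences: list[str]) -> tuple[str, bool]:
    """Return text up to the first stop sequence found, plus whether one was hit (illustrative helper)."""
    for stop_seq in stop_sequences:
        if stop_seq in text:
            return text.split(stop_seq, 1)[0], True
    return text, False

stops = ["Human:", "AI:", "\nHuman:", "\nAI:", "<|im_end|>"]
print(truncate_at_stop("The answer is 42.<|im_end|>", stops))  # ('The answer is 42.', True)
print(truncate_at_stop("Still streaming", stops))              # ('Still streaming', False)
```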
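Since /api/generate streams newline-delimited JSON objects ({"content": ...} chunks, then {"status": "completed"}, or {"error": ...} on failure), a client can consume it line by line. A minimal sketch using `requests`; the base URL is a placeholder assumption, not taken from the repository.

```python
import json
import requests

# Placeholder endpoint; replace with the deployed Space's URL.
url = "http://localhost:7860/api/generate"

with requests.post(url, json={"question": "What is the capital of France?"}, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if not line:
            continue
        event = json.loads(line)
        if "content" in event:
            print(event["content"], end="", flush=True)   # stream tokens as they arrive
        elif event.get("status") == "completed":
            print()                                        # end of stream
        elif "error" in event:
            raise RuntimeError(event["error"])
```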