Hjgugugjhuhjggg committed
Commit 6538615 · verified · 1 parent: 5ded4bc

Update app.py

Files changed (1):
app.py  +70 -16
app.py CHANGED
@@ -10,7 +10,8 @@ from transformers import (
     AutoTokenizer,
     GenerationConfig,
     StoppingCriteriaList,
-    StoppingCriteria
+    StoppingCriteria,
+    TextStreamer
 )
 import uvicorn
 import asyncio
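Note: the `StoppingCriteria` import is kept because the endpoint further down builds `StoppingCriteriaList([StopOnTokens(stop_token_ids)])`. The `StopOnTokens` class itself is not part of this diff; the following is only a minimal sketch of what such a criterion typically looks like, and the real implementation in app.py may differ.

```python
import torch
from transformers import StoppingCriteria

class StopOnTokens(StoppingCriteria):
    """Stop generation once the most recent token is one of the given ids.

    Sketch only; the actual StopOnTokens in app.py is not shown in this commit.
    """
    def __init__(self, stop_token_ids):
        self.stop_token_ids = set(stop_token_ids)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # input_ids has shape (batch, sequence); inspect the last generated token.
        return input_ids[0, -1].item() in self.stop_token_ids
```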
@@ -25,7 +26,7 @@ class GenerateRequest(BaseModel):
     task_type: str
     temperature: float = 1.0
     max_new_tokens: int = 2
-    stream: bool = True
+    stream: bool = True  # Keep stream parameter in request for flexibility, but handle it correctly in code
     top_p: float = 1.0
     top_k: int = 50
     repetition_penalty: float = 1.0
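Note: with `stream` restored as a request field, a client payload carries it alongside the sampling parameters. The snippet below is a hypothetical call only; the port, the `task_type` value, and the presence of an `input_text` field are assumed from this file rather than verified against a running instance.

```python
import requests

# Hypothetical example payload for the /generate endpoint defined in app.py.
payload = {
    "input_text": "Write a haiku about FastAPI.",  # field assumed from the handlers below
    "task_type": "text-to-text",                   # value assumed; app.py defines the accepted types
    "temperature": 0.7,
    "max_new_tokens": 64,
    "stream": True,                                # False exercises the non-streaming branch
    "top_p": 0.9,
    "top_k": 50,
    "repetition_penalty": 1.1,
}

# Stream the plain-text response chunk by chunk.
with requests.post("http://localhost:7860/generate", json=payload, stream=True) as resp:
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
```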
@@ -88,7 +89,7 @@ async def generate(request: GenerateRequest):
     task_type = request.task_type
     temperature = request.temperature
     max_new_tokens = request.max_new_tokens
-    stream = request.stream
+    stream = request.stream  # Get stream from request, but handle correctly
     top_p = request.top_p
     top_k = request.top_k
     repetition_penalty = request.repetition_penalty
@@ -117,32 +118,70 @@ async def generate(request: GenerateRequest):
         stopping_criteria_list = StoppingCriteriaList([StopOnTokens(stop_token_ids)]) if stop_token_ids else None


-        return StreamingResponse(
-            stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay),
-            media_type="text/plain"
-        )
+        if stream:  # Handle streaming based on request parameter
+            return StreamingResponse(
+                stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay),
+                media_type="text/plain"
+            )
+        else:  # Handle non-streaming case
+            generated_text = generate_non_stream(model, tokenizer, input_text, generation_config, stopping_criteria_list, device)
+            return StreamingResponse(iter([generated_text]), media_type="text/plain")  # Still use StreamingResponse for consistency in return type
+

     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

 async def stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay, max_length=2048):
     encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
+    streamer = TextStreamer(tokenizer)  # Use TextStreamer for proper streaming

     with torch.no_grad():
-        streamer = model.generate(
+        model.generate(
             **encoded_input,
             generation_config=generation_config,
             stopping_criteria=stopping_criteria_list,
-            stream=True,  # Ensure streaming is enabled if supported by the model
+            streamer=streamer,  # Use streamer here instead of stream=True
             return_dict_in_generate=True,
             output_scores=True
         )

-    for output in streamer.sequences[:, encoded_input["input_ids"].shape[-1]:]:  # Stream from the *new* tokens
-        token = tokenizer.decode(output, skip_special_tokens=True)
-        if token:  # Avoid yielding empty tokens
-            yield token
-            await asyncio.sleep(chunk_delay)
+    # TextStreamer handles printing to stdout by default, but we want to stream to client
+    # We need to access the generated text from the streamer and yield it.
+    # TextStreamer is designed for terminal output, not direct access to tokens for streaming.
+    # We need to modify stream_text to correctly stream tokens.
+
+    encoded_input_len = encoded_input["input_ids"].shape[-1]
+    generated_tokens = []
+    for i, output in enumerate(model.generate(
+        **encoded_input,
+        generation_config=generation_config,
+        stopping_criteria=stopping_criteria_list,
+        stream=True,  # Keep stream=True for actual streaming from model
+        return_dict_in_generate=True,
+        output_scores=True,
+    ):
+        if i > 0:  # Skip the first output which is just input
+            new_tokens = output.sequences[:, encoded_input_len:]
+            for token_batch in new_tokens:
+                token = tokenizer.decode(token_batch, skip_special_tokens=True)
+                if token:
+                    yield token
+                    await asyncio.sleep(chunk_delay)
+
+
+async def generate_non_stream(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, max_length=2048):
+    encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
+
+    with torch.no_grad():
+        output = model.generate(
+            **encoded_input,
+            generation_config=generation_config,
+            stopping_criteria=stopping_criteria_list,
+            return_dict_in_generate=True,
+            output_scores=True
+        )
+    generated_text = tokenizer.decode(output.sequences[0][encoded_input["input_ids"].shape[-1]:], skip_special_tokens=True)
+    return generated_text


 @app.post("/generate-image")
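Note: the comments added in this hunk already flag the core issue: `TextStreamer` prints to stdout, and stock `transformers` `model.generate()` has no `stream=True` keyword, so the second `generate(...)` loop (which as written also appears to be missing a closing parenthesis on the `enumerate(...)` call) would not stream tokens back to the client. The usual pattern for streaming out of a FastAPI endpoint is `TextIteratorStreamer` with generation running in a background thread; the following is a minimal sketch under those assumptions, mirroring the names used in app.py but not taken from this commit.

```python
import asyncio
from threading import Thread
from transformers import TextIteratorStreamer

async def stream_text(model, tokenizer, input_text, generation_config,
                      stopping_criteria_list, device, chunk_delay, max_length=2048):
    # Sketch of a working streaming generator; argument names follow app.py.
    encoded_input = tokenizer(input_text, return_tensors="pt",
                              truncation=True, max_length=max_length).to(device)

    # TextIteratorStreamer buffers decoded text and exposes it as a Python iterator.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        **encoded_input,
        generation_config=generation_config,
        stopping_criteria=stopping_criteria_list,
        streamer=streamer,
    )

    # generate() blocks, so run it in a thread and consume the streamer here.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    for text_chunk in streamer:
        if text_chunk:
            yield text_chunk
            await asyncio.sleep(chunk_delay)

    thread.join()
```

Separately, the non-streaming branch calls `generate_non_stream(...)` without `await` even though the helper is declared `async def`, so the handler would wrap a coroutine in the response; either dropping `async` from the helper or awaiting the call would resolve that.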
@@ -173,12 +212,21 @@ async def generate_text_to_speech(request: GenerateRequest):
         audio = audio_generator(validated_body.input_text)[0]

         audio_byte_arr = BytesIO()
-        audio.save(audio_byte_arr)
+        # Assuming audio_generator returns an object with a save method. Adjust based on actual object.
+        # Example for a hypothetical audio object with save method to BytesIO
+        # audio_generator output might vary, check documentation for the specific model/pipeline
+        # If it's raw audio data, you might need to use a library like soundfile to write to BytesIO
+        # Example assuming `audio` is raw data and needs to be saved as wav
+        import soundfile as sf
+        sf.write(audio_byte_arr, audio, samplerate=audio_generator.sampling_rate, format='WAV')  # Assuming samplerate exists
         audio_byte_arr.seek(0)

+
         return StreamingResponse(audio_byte_arr, media_type="audio/wav")

     except Exception as e:
+        import traceback
+        traceback.print_exc()  # Print detailed error for debugging
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

 @app.post("/generate-video")
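Note: the `sf.write(...)` call matches soundfile's signature for writing into a file-like object, but whether the pipeline output is raw samples and whether `audio_generator.sampling_rate` exists depends on the TTS pipeline in use. A transformers `text-to-speech` pipeline typically returns a dict with `"audio"` and `"sampling_rate"` keys; the sketch below assumes that shape and is not taken from this commit.

```python
from io import BytesIO
import soundfile as sf

def tts_to_wav_bytes(audio_generator, text: str) -> BytesIO:
    # Sketch assuming a transformers text-to-speech pipeline that returns
    # {"audio": np.ndarray, "sampling_rate": int}; other pipelines may differ.
    result = audio_generator(text)
    samples = result["audio"]
    sampling_rate = result["sampling_rate"]

    buf = BytesIO()
    # soundfile needs an explicit format when writing to a file-like object.
    sf.write(buf, samples.squeeze(), samplerate=sampling_rate, format="WAV")
    buf.seek(0)
    return buf
```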
@@ -190,12 +238,18 @@ async def generate_video(request: GenerateRequest):
         video = video_generator(validated_body.input_text)[0]

         video_byte_arr = BytesIO()
-        video.save(video_byte_arr)
+        # Assuming video_generator returns an object with a save method. Adjust based on actual object and format.
+        # Example for a hypothetical video object with save method to BytesIO as mp4
+        # video_generator output might vary, check documentation for the specific model/pipeline
+        video.save(video_byte_arr, format='MP4')  # Hypothetical save method, adjust based on actual video object
         video_byte_arr.seek(0)

+
         return StreamingResponse(video_byte_arr, media_type="video/mp4")

     except Exception as e:
+        import traceback
+        traceback.print_exc()  # Print detailed error for debugging
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

 if __name__ == "__main__":
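Note: the commit's own comments mark `video.save(video_byte_arr, format='MP4')` as hypothetical; most text-to-video pipelines return a list of frames rather than an object with a `save` method. One common way to get MP4 bytes is to export the frames to a temporary file and read it back, e.g. with `diffusers.utils.export_to_video`. The sketch below assumes a diffusers-style frame list; the actual `video_generator` in app.py is not shown in this diff.

```python
import os
import tempfile
from io import BytesIO
from diffusers.utils import export_to_video

def frames_to_mp4_bytes(frames, fps: int = 8) -> BytesIO:
    # Sketch assuming `frames` is the list of frames produced by a diffusers
    # text-to-video pipeline; other generators may need a different exporter.
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    try:
        export_to_video(frames, tmp.name, fps=fps)  # writes an .mp4 file to disk
        with open(tmp.name, "rb") as f:
            buf = BytesIO(f.read())
    finally:
        os.remove(tmp.name)
    buf.seek(0)
    return buf
```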
 