Hjgugugjhuhjggg committed on
Commit 395fc4a · verified · 1 Parent(s): 6538615

Update app.py

Files changed (1)
  1. app.py +41 -73
app.py CHANGED
@@ -5,18 +5,19 @@ from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, field_validator
 from transformers import (
     AutoConfig,
-    pipeline,
     AutoModelForCausalLM,
     AutoTokenizer,
     GenerationConfig,
     StoppingCriteriaList,
     StoppingCriteria,
-    TextStreamer
+    TextStreamer,
+    pipeline
 )
 import uvicorn
 import asyncio
 from io import BytesIO
-from transformers import pipeline
+import soundfile as sf
+import traceback
 
 app = FastAPI()
 
@@ -26,7 +27,7 @@ class GenerateRequest(BaseModel):
     task_type: str
     temperature: float = 1.0
     max_new_tokens: int = 2
-    stream: bool = True  # Keep stream parameter in request for flexibility, but handle it correctly in code
+    stream: bool = True
     top_p: float = 1.0
     top_k: int = 50
     repetition_penalty: float = 1.0
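
Note: for reference, a client call against /generate with these request fields might look like the sketch below. The input_text and model_name fields come from the rest of app.py; the host, port and example model name are assumptions, not part of this commit.

import requests

# Hypothetical local deployment; host and port are assumptions.
resp = requests.post(
    "http://localhost:7860/generate",
    json={
        "model_name": "gpt2",              # example model, an assumption
        "input_text": "Hello, world",
        "task_type": "text-generation",
        "temperature": 0.7,
        "max_new_tokens": 64,
        "stream": True,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.1,
    },
    stream=True,
)
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)
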
@@ -89,7 +90,7 @@ async def generate(request: GenerateRequest):
         task_type = request.task_type
         temperature = request.temperature
         max_new_tokens = request.max_new_tokens
-        stream = request.stream  # Get stream from request, but handle correctly
+        stream = request.stream
         top_p = request.top_p
         top_k = request.top_k
         repetition_penalty = request.repetition_penalty
@@ -117,71 +118,50 @@ async def generate(request: GenerateRequest):
         stop_token_ids = tokenizer.convert_tokens_to_ids(stop_sequences)
         stopping_criteria_list = StoppingCriteriaList([StopOnTokens(stop_token_ids)]) if stop_token_ids else None
 
-
-        if stream:  # Handle streaming based on request parameter
+        if stream:
             return StreamingResponse(
                 stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay),
                 media_type="text/plain"
             )
-        else:  # Handle non-streaming case
+        else:
             generated_text = generate_non_stream(model, tokenizer, input_text, generation_config, stopping_criteria_list, device)
-            return StreamingResponse(iter([generated_text]), media_type="text/plain")  # Still use StreamingResponse for consistency in return type
-
+            return StreamingResponse(iter([generated_text]), media_type="text/plain")
 
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
 async def stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay, max_length=2048):
     encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
-    streamer = TextStreamer(tokenizer)  # Use TextStreamer for proper streaming
-
-    with torch.no_grad():
-        model.generate(
-            **encoded_input,
-            generation_config=generation_config,
-            stopping_criteria=stopping_criteria_list,
-            streamer=streamer,  # Use streamer here instead of stream=True
-            return_dict_in_generate=True,
-            output_scores=True
-        )
-
-    # TextStreamer handles printing to stdout by default, but we want to stream to client
-    # We need to access the generated text from the streamer and yield it.
-    # TextStreamer is designed for terminal output, not direct access to tokens for streaming.
-    # We need to modify stream_text to correctly stream tokens.
-
-    encoded_input_len = encoded_input["input_ids"].shape[-1]
-    generated_tokens = []
-    for i, output in enumerate(model.generate(
-        **encoded_input,
-        generation_config=generation_config,
-        stopping_criteria=stopping_criteria_list,
-        stream=True,  # Keep stream=True for actual streaming from model
-        return_dict_in_generate=True,
-        output_scores=True,
-    ):
-        if i > 0:  # Skip the first output which is just input
-            new_tokens = output.sequences[:, encoded_input_len:]
-            for token_batch in new_tokens:
-                token = tokenizer.decode(token_batch, skip_special_tokens=True)
-                if token:
-                    yield token
-                    await asyncio.sleep(chunk_delay)
+    encoded_input_len = encoded_input["input_ids"].shape[-1]
+
+    for output in model.generate(
+        **encoded_input,
+        generation_config=generation_config,
+        stopping_criteria=stopping_criteria_list,
+        stream=True,
+        return_dict_in_generate=True,
+        output_scores=True,
+    ):
+        new_tokens = output.sequences[:, encoded_input_len:]
+        for token_batch in new_tokens:
+            token = tokenizer.decode(token_batch, skip_special_tokens=True)
+            if token:
+                yield token
+                await asyncio.sleep(chunk_delay)
 
 
 async def generate_non_stream(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, max_length=2048):
     encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
 
-    with torch.no_grad():
-        output = model.generate(
-            **encoded_input,
-            generation_config=generation_config,
-            stopping_criteria=stopping_criteria_list,
-            return_dict_in_generate=True,
-            output_scores=True
-        )
-    generated_text = tokenizer.decode(output.sequences[0][encoded_input["input_ids"].shape[-1]:], skip_special_tokens=True)
-    return generated_text
+    output = model.generate(
+        **encoded_input,
+        generation_config=generation_config,
+        stopping_criteria=stopping_criteria_list,
+        return_dict_in_generate=True,
+        output_scores=True
+    )
+    generated_text = tokenizer.decode(output.sequences[0][encoded_input["input_ids"].shape[-1]:], skip_special_tokens=True)
+    return generated_text
 
 
 @app.post("/generate-image")
@@ -209,24 +189,17 @@ async def generate_text_to_speech(request: GenerateRequest):
         device = "cuda" if torch.cuda.is_available() else "cpu"
 
         audio_generator = pipeline("text-to-speech", model=validated_body.model_name, device=device)
-        audio = audio_generator(validated_body.input_text)[0]
+        audio = audio_generator(validated_body.input_text)
+        sampling_rate = audio_generator.sampling_rate
 
         audio_byte_arr = BytesIO()
-        # Assuming audio_generator returns an object with a save method. Adjust based on actual object.
-        # Example for a hypothetical audio object with save method to BytesIO
-        # audio_generator output might vary, check documentation for the specific model/pipeline
-        # If it's raw audio data, you might need to use a library like soundfile to write to BytesIO
-        # Example assuming `audio` is raw data and needs to be saved as wav
-        import soundfile as sf
-        sf.write(audio_byte_arr, audio, samplerate=audio_generator.sampling_rate, format='WAV')  # Assuming samplerate exists
+        sf.write(audio_byte_arr, audio, sampling_rate, format='WAV')
         audio_byte_arr.seek(0)
 
-
         return StreamingResponse(audio_byte_arr, media_type="audio/wav")
 
     except Exception as e:
-        import traceback
-        traceback.print_exc()  # Print detailed error for debugging
+        traceback.print_exc()
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
 @app.post("/generate-video")
@@ -235,21 +208,16 @@ async def generate_video(request: GenerateRequest):
         validated_body = request
         device = "cuda" if torch.cuda.is_available() else "cpu"
         video_generator = pipeline("text-to-video", model=validated_body.model_name, device=device)
-        video = video_generator(validated_body.input_text)[0]
+        video = video_generator(validated_body.input_text)
 
         video_byte_arr = BytesIO()
-        # Assuming video_generator returns an object with a save method. Adjust based on actual object and format.
-        # Example for a hypothetical video object with save method to BytesIO as mp4
-        # video_generator output might vary, check documentation for the specific model/pipeline
-        video.save(video_byte_arr, format='MP4')  # Hypothetical save method, adjust based on actual video object
+        video.save(video_byte_arr)
         video_byte_arr.seek(0)
 
-
         return StreamingResponse(video_byte_arr, media_type="video/mp4")
 
     except Exception as e:
-        import traceback
-        traceback.print_exc()  # Print detailed error for debugging
+        traceback.print_exc()
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
 if __name__ == "__main__":
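
Finally, neither side of the /generate-video hunk is likely to run as-is: "text-to-video" does not appear to be a task the transformers pipeline factory recognizes, and the object it would return has no save() method. Text-to-video models are usually served through diffusers instead; a rough sketch of that route, with the model name and the exact shape of result.frames treated as assumptions that vary by diffusers version:

import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video

device = "cuda" if torch.cuda.is_available() else "cpu"
# Example checkpoint; any diffusers text-to-video model should follow the same pattern.
pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b").to(device)

result = pipe("a panda playing guitar", num_inference_steps=25)
frames = result.frames[0]   # first video in the batch; older diffusers versions return the frame list directly
# export_to_video writes an .mp4 to disk and returns its path; stream that file back to the client.
video_path = export_to_video(frames, "output.mp4")
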
 