Update app.py
app.py CHANGED
@@ -24,7 +24,7 @@ try:
     import psutil
     import resource
     total_memory = psutil.virtual_memory().total
-    limit = int(total_memory *
+    limit = int(total_memory * 90.0)  # 1% of the total in bytes  # Correction: use 0.01 for 1%
     resource.setrlimit(resource.RLIMIT_AS, (limit, limit))
     print(f"Memory limit set to {limit} bytes (1% of total system memory).")  # Print to verify the applied limit
 except Exception as e:
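Note that the comments above state the intent is a cap of 1% of total system memory, while the new factor is 90.0, which sets the RLIMIT_AS ceiling to 90x physical RAM and effectively leaves the process uncapped. A minimal sketch of the 1% version, assuming the same psutil/resource approach used here (Unix only, since RLIMIT_AS limits the virtual address space):

import psutil
import resource

# Sketch (assumption): cap the address space at 1% of physical RAM,
# matching the "1%" stated in the comments; the committed code uses 90.0.
total_memory = psutil.virtual_memory().total
limit = int(total_memory * 0.01)  # 1% of total, in bytes
resource.setrlimit(resource.RLIMIT_AS, (limit, limit))  # hard and soft limit
print(f"Memory limit set to {limit} bytes (1% of total system memory).")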
@@ -55,6 +55,7 @@ class GenerateRequest(BaseModel):
     do_sample: bool = True
     chunk_delay: float = 0.0
     stop_sequences: list[str] = []
+    chunk_token_limit: int = 100  # New parameter to limit the number of tokens per chunk
 
     @field_validator("model_name")
     def model_name_cannot_be_empty(cls, v):
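For reference, a hypothetical client call exercising the new chunk_token_limit field. The /generate route, the default Space port 7860, the model name, and the input_text/stream field names are assumptions; the other fields come from GenerateRequest as shown in this diff:

import requests

payload = {
    "model_name": "gpt2",          # hypothetical model
    "input_text": "Hello, world",  # assumed field name
    "stream": True,                # assumed field name
    "do_sample": True,
    "chunk_delay": 0.0,
    "stop_sequences": ["\n\n"],
    "chunk_token_limit": 100,      # new field: max tokens per streamed chunk
}

# Stream the plain-text response chunk by chunk.
with requests.post("http://localhost:7860/generate", json=payload, stream=True) as r:
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)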
@@ -119,6 +120,7 @@ async def generate(request: GenerateRequest):
         do_sample = request.do_sample
         chunk_delay = request.chunk_delay
         stop_sequences = request.stop_sequences
+        chunk_token_limit = request.chunk_token_limit
 
         model, tokenizer = await model_loader.load_model_and_tokenizer(model_name)
         device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -143,7 +145,7 @@ async def generate(request: GenerateRequest):
         if stream:
             # StreamingResponse is used with the async function that sends each token in real time.
             response = StreamingResponse(
-                stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay),
+                stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay, chunk_token_limit, stopping_criteria_list),  # Pass stopping_criteria_list
                 media_type="text/plain"
             )
         else:
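The call above now passes stopping_criteria_list both positionally and again as the trailing stop_criteria argument. Its construction is not part of this diff; as an illustration only, a sketch of a custom StoppingCriteria that stops once any of the request's stop_sequences appears in the generated text, assuming the transformers StoppingCriteria/StoppingCriteriaList API:

import torch
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnSequences(StoppingCriteria):
    """Illustrative sketch: stop when any stop sequence shows up in the newly generated text."""
    def __init__(self, tokenizer, stop_sequences, prompt_length):
        self.tokenizer = tokenizer
        self.stop_sequences = stop_sequences
        self.prompt_length = prompt_length  # number of prompt tokens to skip when decoding

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        generated = self.tokenizer.decode(input_ids[0, self.prompt_length:], skip_special_tokens=True)
        return any(seq in generated for seq in self.stop_sequences)

# Hypothetical construction (not shown in this commit):
# stopping_criteria_list = StoppingCriteriaList(
#     [StopOnSequences(tokenizer, stop_sequences, encoded_input.input_ids.shape[-1])]
# )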
@@ -155,31 +157,83 @@ async def generate(request: GenerateRequest):
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
-async def stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay,
+async def stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay, chunk_token_limit, stop_criteria):  # Accept stop_criteria
     """
-    Generates tokens asynchronously and sends them to the client in real time.
+    Generates tokens asynchronously and sends them to the client in real time, splitting the response into chunks if it exceeds the token limit.
+    Generation stops automatically once the StoppingCriteriaList is met.
     """
     # Limit the input to minimize memory usage
-    encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=
+    encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=64).to(device)
+
+    current_chunk_tokens = 0
+    current_chunk_text = ""
+    past_key_values = None  # To maintain state for streaming
 
     # With torch.no_grad() we avoid storing gradient information
     with torch.no_grad():
+        input_ids = encoded_input.input_ids
+        # Manual token-by-token generation for stop control and chunking
+        while True:  # Infinite loop, broken by the stop conditions below
+            outputs = model(
+                input_ids,
+                past_key_values=past_key_values,
+                use_cache=True,  # Important for stateful generation
+                return_dict=True
+            )
+            next_token_logits = outputs.logits[:, -1, :]
+
+            # Apply sampling to pick the next token (mirroring generation_config)
+            if generation_config.do_sample:
+                # Apply temperature and top-k sampling
+                next_token_logits = next_token_logits / generation_config.temperature
+
+                # Top-k filtering
+                if generation_config.top_k is not None and generation_config.top_k > 0:
+                    v, _ = torch.topk(next_token_logits, min(generation_config.top_k, next_token_logits.size(-1)))
+                    next_token_logits[next_token_logits < v[:, [-1]]] = -float('Inf')
+
+                probs = torch.nn.functional.softmax(next_token_logits, dim=-1)
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                # Greedy decoding
+                next_tokens = torch.argmax(next_token_logits, dim=-1)
+
+            # Check stop criteria BEFORE adding the token to the output
+            if stop_criteria and stop_criteria(input_ids, next_token_logits):  # Check stopping criteria
+                break  # Stop generation if the criteria are met
+
+            next_tokens = next_tokens.unsqueeze(0)  # Reshape to [1, 1] for concat
+            next_token_text = tokenizer.decode(next_tokens[0], skip_special_tokens=True)
+
+            token_count = len(tokenizer.encode(current_chunk_text + next_token_text)) - len(tokenizer.encode(current_chunk_text))
+
+            if current_chunk_tokens + token_count > chunk_token_limit:
+                yield current_chunk_text
+                current_chunk_text = next_token_text
+                current_chunk_tokens = token_count
+            else:
+                current_chunk_text += next_token_text
+                current_chunk_tokens += token_count
+
+            yield current_chunk_text  # Yield every token/chunk
+
+            input_ids = torch.cat([input_ids, next_tokens], dim=-1)  # Append the next token to input_ids for the next iteration
+            past_key_values = outputs.past_key_values  # Update past key values for stateful generation
+
+            await asyncio.sleep(chunk_delay)
+
+            if input_ids.shape[-1] >= generation_config.max_new_tokens + encoded_input.input_ids.shape[-1]:  # Check max_new_tokens limit
+                break  # Stop if max_new_tokens is reached
+
+        # Make sure the last chunk is sent
+        if current_chunk_text:
+            yield current_chunk_text
+
     await cleanup_memory(device)
 
+
 async def generate_non_stream(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, max_length=64):
     encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
     with torch.no_grad():
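Two details of the new loop are worth noting: every forward pass re-feeds the full input_ids even though past_key_values is carried along, and current_chunk_text is yielded on every iteration while it keeps accumulating, so a client that concatenates the stream sees repeated prefixes. A minimal sketch of the cache-friendly variant, assuming a standard Hugging Face causal LM (greedy decoding only, for brevity): after the first pass only the newest token is fed, and only the newly decoded text is yielded.

import asyncio
import torch

async def stream_text_incremental(model, tokenizer, input_text, max_new_tokens=64, chunk_delay=0.0, device="cpu"):
    """Sketch only: greedy decoding with a KV cache, yielding just the newly decoded text."""
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=64).to(device)
    input_ids = encoded.input_ids
    past_key_values = None
    next_input = input_ids  # first pass sees the whole prompt

    with torch.no_grad():
        for _ in range(max_new_tokens):
            outputs = model(next_input, past_key_values=past_key_values, use_cache=True, return_dict=True)
            past_key_values = outputs.past_key_values
            next_token = torch.argmax(outputs.logits[:, -1, :], dim=-1, keepdim=True)  # shape [1, 1]

            if tokenizer.eos_token_id is not None and next_token.item() == tokenizer.eos_token_id:
                break

            input_ids = torch.cat([input_ids, next_token], dim=-1)
            next_input = next_token  # subsequent passes only see the newest token
            yield tokenizer.decode(next_token[0], skip_special_tokens=True)  # yield the delta, not the whole chunk
            await asyncio.sleep(chunk_delay)

Such a generator could be handed to StreamingResponse in the same way as stream_text above.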