Spaces:

multimodalart
/

self-forcing

Running on Zero

App Files Files Community

multimodalart HF Staff committed on 1 day ago

Commit

f29396c

verified ·

1 Parent(s): 076e3f4

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -11

app.py CHANGED Viewed

@@ -40,28 +40,68 @@ from demo_utils.constant import ZERO_VAE_CACHE
 from demo_utils.vae_block3 import VAEDecoderWrapper
 from utils.wan_wrapper import WanDiffusionWrapper, WanTextEncoder
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model_checkpoint = "gokaygokay/Flux-Prompt-Enhance"
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
 enhancer = pipeline(
-    'text2text-generation',
     model=model,
     tokenizer=tokenizer,
-    repetition_penalty= 1.2,
-    device=device
 )
-max_target_length = 256
 @spaces.GPU
 def enhance_prompt(prompt):
-    prefix = "enhance prompt: "
-    short_prompt = prompt
-    answer = enhancer(prefix + short_prompt, max_length=max_target_length)
     final_answer = answer[0]['generated_text']
-    return final_answer
 # --- Argument Parsing ---
 parser = argparse.ArgumentParser(description="Gradio Demo for Self-Forcing with Frame Streaming")

 from demo_utils.vae_block3 import VAEDecoderWrapper
 from utils.wan_wrapper import WanDiffusionWrapper, WanTextEncoder
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 device = "cuda" if torch.cuda.is_available() else "cpu"
+model_checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+)
+model = AutoModelForCausalLM.from_pretrained(
+    model_checkpoint,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    quantization_config=quantization_config,
+    device_map="auto"
+)
 enhancer = pipeline(
+    'text-generation',
     model=model,
     tokenizer=tokenizer,
+    repetition_penalty=1.2,
 )
+T2V_CINEMATIC_PROMPT = """You are an expert cinematic director with many award winning movies, When writing prompts based on the user input, focus on detailed, chronological descriptions of actions and scenes.
+Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph.
+Start directly with the action, and keep descriptions literal and precise.
+Think like a cinematographer describing a shot list.
+Do not change the user input intent, just enhance it.
+Keep within 150 words.
+For best results, build your prompts using this structure:
+Start with main action in a single sentence
+Add specific details about movements and gestures
+Describe character/object appearances precisely
+Include background and environment details
+Specify camera angles and movements
+Describe lighting and colors
+Note any changes or sudden events
+Do not exceed the 150 word limit!
+Output the enhanced prompt only.
+"""
 @spaces.GPU
 def enhance_prompt(prompt):
+    messages = [
+        {"role": "system", "content": T2V_CINEMATIC_PROMPT},
+        {"role": "user", "content": f"user_prompt: {prompt}"},
+    ]
+    answer = enhancer(
+        messages,
+        max_new_tokens=256,
+        return_full_text=False,
+        pad_token_id=tokenizer.eos_token_id
+    )
     final_answer = answer[0]['generated_text']
+    return final_answer.strip()
 # --- Argument Parsing ---
 parser = argparse.ArgumentParser(description="Gradio Demo for Self-Forcing with Frame Streaming")