Celebrity

Runtime error

App Files Files Community

Keltezaa committed on Nov 23, 2024

Commit

99ee3ff

verified ·

1 Parent(s): e3a6668

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -5

app.py CHANGED Viewed

@@ -28,16 +28,73 @@ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
 longformer_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
 longformer_model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
-def process_input(input_text):
-    # Tokenize and truncate input
-    inputs = clip_processor(text=input_text, return_tensors="pt", padding=True, truncation=True, max_length=77)
     return inputs
 # Example usage
 input_text = "Your long prompt goes here..."
-inputs = process_input(input_text)
-#Load prompts for randomization
 df = pd.read_csv('prompts.csv', header=None)
 prompt_values = df.values.flatten()

 longformer_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
 longformer_model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
+def preprocess_prompt(input_text, max_clip_tokens=77):
+    """
+    Preprocess the input prompt based on its length:
+    - If the prompt is <= max_clip_tokens, summarize it.
+    - If the prompt is > max_clip_tokens, split and process it.
+    """
+    # Tokenize the prompt to determine its token length
+    tokens = clip_processor.tokenizer(input_text, return_tensors="pt")["input_ids"][0]
+    token_count = len(tokens)
+    if token_count <= max_clip_tokens:
+        # Use summarization for shorter prompts
+        print("Using summarization (Option 5) as the prompt is short.")
+        return process_summarized_input(input_text)
+    else:
+        # Use split-and-process for longer prompts
+        print("Using chunking (Option 3) as the prompt exceeds 77 tokens.")
+        return process_clip_chunks(input_text)
+# Summarization Function (Option 5)
+def summarize_prompt(input_text, max_length=77):
+    """
+    Summarizes the input text to fit within the CLIP token limit.
+    Basic implementation keeps the first `max_length` whitespace-separated words.
+    """
+    summarized_text = " ".join(input_text.split()[:max_length])  # Simple summarization: First 77 words
+    print(f"Summarized prompt: {summarized_text}")
+    return summarized_text
+def process_summarized_input(input_text):
+    """
+    Prepares summarized text for CLIP processing.
+    """
+    summarized_text = summarize_prompt(input_text, max_length=77)
+    inputs = clip_processor(text=summarized_text, return_tensors="pt", padding=True, truncation=True, max_length=77)
     return inputs
+# Chunking Function (Option 3)
+def split_prompt(prompt, chunk_size=77):
+    """Splits a long prompt into chunks of the specified token size."""
+    tokens = clip_processor.tokenizer(prompt, return_tensors="pt")["input_ids"][0]
+    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
+    return chunks
+def process_clip_chunks(input_text):
+    """
+    Tokenizes and processes a long input text in chunks for CLIP.
+    Each chunk respects the model's 77-token limit.
+    """
+    chunks = split_prompt(input_text)
+    processed_chunks = []
+    for chunk in chunks:
+        chunk_text = clip_processor.tokenizer.decode(chunk, skip_special_tokens=True)
+        inputs = clip_processor(text=chunk_text, return_tensors="pt", padding=True, truncation=True, max_length=77)
+        processed_chunks.append(inputs)
+    return processed_chunks  # Return processed chunks for downstream usage
 # Example usage
 input_text = "Your long prompt goes here..."
+inputs = preprocess_prompt(input_text)
+# Load prompts for randomization
 df = pd.read_csv('prompts.csv', header=None)
 prompt_values = df.values.flatten()