Celebrity

Running

App Files Files Community

Keltezaa committed on Nov 24, 2024

Commit

f36149d

verified ·

1 Parent(s): f27fc80

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -38

app.py CHANGED Viewed

@@ -19,39 +19,6 @@ import pandas as pd
 # Disable tokenizer parallelism
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-# Initialize the CLIP tokenizer and model
-clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")
-clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
-clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
-# Initialize the Longformer tokenizer and model
-longformer_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
-longformer_model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
-# Example usage
-input_text = "Your long prompt goes here..."
-inputs = preprocess_prompt(input_text)
-def preprocess_prompt(input_text, max_clip_tokens=77):
-    """
-    Preprocess the input prompt based on its length:
-    - If the prompt is <= max_clip_tokens, summarize it.
-    - If the prompt is > max_clip_tokens, split and process it.
-    """
-    # Tokenize the prompt to determine its token length
-    tokens = clip_processor.tokenizer(input_text, return_tensors="pt")["input_ids"][0]
-    token_count = len(tokens)
-    if token_count <= max_clip_tokens:
-        # Use summarization for shorter prompts
-        print("Using summarization (Option 5) as the prompt is short.")
-        return process_summarized_input(input_text)
-    else:
-        # Use split-and-process for longer prompts
-        print("Using chunking (Option 3) as the prompt exceeds 77 tokens.")
-        return process_clip_chunks(input_text)
 # Summarization Function (Option 5)
 def summarize_prompt(input_text, max_length=77):
     """
@@ -62,7 +29,6 @@ def summarize_prompt(input_text, max_length=77):
     print(f"Summarized prompt: {summarized_text}")
     return summarized_text
 def process_summarized_input(input_text):
     """
     Prepares summarized text for CLIP processing.
@@ -71,7 +37,6 @@ def process_summarized_input(input_text):
     inputs = clip_processor(text=summarized_text, return_tensors="pt", padding=True, truncation=True, max_length=77)
     return inputs
 def split_prompt_with_overlap(prompt, chunk_size=77, overlap=10):
     tokens = clip_processor.tokenizer(prompt, return_tensors="pt")["input_ids"][0]
     chunks = [
@@ -79,9 +44,12 @@ def split_prompt_with_overlap(prompt, chunk_size=77, overlap=10):
         for i in range(0, len(tokens), chunk_size - overlap)
     ]
     return chunks
-chunks = split_prompt("Test " * 200)
-assert all(len(chunk) <= 77 for chunk in chunks), "Chunk size exceeded"
 def process_clip_chunks(input_text):
     """
@@ -96,6 +64,38 @@ def process_clip_chunks(input_text):
         processed_chunks.append(inputs)
     return processed_chunks  # Return processed chunks for downstream usage
 # Load prompts for randomization
 df = pd.read_csv('prompts.csv', header=None)
 prompt_values = df.values.flatten()

 # Disable tokenizer parallelism
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 # Summarization Function (Option 5)
 def summarize_prompt(input_text, max_length=77):
     """
     print(f"Summarized prompt: {summarized_text}")
     return summarized_text
 def process_summarized_input(input_text):
     """
     Prepares summarized text for CLIP processing.
     inputs = clip_processor(text=summarized_text, return_tensors="pt", padding=True, truncation=True, max_length=77)
     return inputs
 def split_prompt_with_overlap(prompt, chunk_size=77, overlap=10):
     tokens = clip_processor.tokenizer(prompt, return_tensors="pt")["input_ids"][0]
     chunks = [
         for i in range(0, len(tokens), chunk_size - overlap)
     ]
     return chunks
+def split_prompt(prompt, chunk_size=77):
+    """Break a prompt's token ids into consecutive, non-overlapping pieces.
+
+    The prompt is tokenized with ``clip_processor.tokenizer`` and the first
+    row of the resulting ``input_ids`` is sliced every ``chunk_size``
+    positions; the final piece may be shorter than ``chunk_size``.
+    Returns the list of slices in order.
+    """
+    encoded = clip_processor.tokenizer(prompt, return_tensors="pt")
+    token_ids = encoded["input_ids"][0]
+    pieces = []
+    for start in range(0, len(token_ids), chunk_size):
+        pieces.append(token_ids[start:start + chunk_size])
+    return pieces
 def process_clip_chunks(input_text):
     """
         processed_chunks.append(inputs)
     return processed_chunks  # Return processed chunks for downstream usage
+def preprocess_prompt(input_text, max_clip_tokens=77):
+    """
+    Route a prompt to summarization or chunking based on its token length.
+
+    The prompt is tokenized with ``clip_processor.tokenizer`` and the length
+    of the first ``input_ids`` row is compared against ``max_clip_tokens``:
+    - If the count is <= max_clip_tokens, the prompt is summarized and the
+      result of ``process_summarized_input(input_text)`` is returned.
+    - If the count is > max_clip_tokens, the prompt is split and the result
+      of ``process_clip_chunks(input_text)`` is returned.
+
+    NOTE(review): the count is whatever the tokenizer emits, so it presumably
+    includes any special tokens it adds -- confirm against the tokenizer.
+    """
+    # Tokenize the prompt to determine its token length
+    tokens = clip_processor.tokenizer(input_text, return_tensors="pt")["input_ids"][0]
+    token_count = len(tokens)
+    if token_count <= max_clip_tokens:
+        # Use summarization for shorter prompts
+        print("Using summarization (Option 5) as the prompt is short.")
+        return process_summarized_input(input_text)
+    # Use split-and-process for longer prompts. The message reports the
+    # threshold actually in effect rather than a hard-coded 77.
+    print(f"Using chunking (Option 3) as the prompt exceeds {max_clip_tokens} tokens.")
+    return process_clip_chunks(input_text)
+# Initialize the CLIP tokenizer, processor and model (all loaded from the
+# same "openai/clip-vit-base-patch16" checkpoint name).
+# These globals are assigned after the helper functions above are defined;
+# that works because the helpers only look up `clip_processor` when they are
+# called, not when they are defined.
+clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
+# Initialize the Longformer tokenizer and model
+longformer_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
+longformer_model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
+# Example usage
+# NOTE: these are top-level statements, so preprocess_prompt runs on this
+# placeholder text every time the module is loaded.
+input_text = "Your long prompt goes here..."
+inputs = preprocess_prompt(input_text)
 # Load prompts for randomization
 df = pd.read_csv('prompts.csv', header=None)
 prompt_values = df.values.flatten()