Keltezaa committed
Commit f27fc80 · verified · 1 Parent(s): 99ee3ff

Update app.py

Files changed (1)
  1. app.py +12 -10
app.py CHANGED
@@ -28,6 +28,10 @@ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
 longformer_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
 longformer_model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
 
+# Example usage
+input_text = "Your long prompt goes here..."
+inputs = preprocess_prompt(input_text)
+
 def preprocess_prompt(input_text, max_clip_tokens=77):
     """
     Preprocess the input prompt based on its length:
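The hunk ends before the body of preprocess_prompt, so only the start of its docstring is visible. A minimal sketch of how such a length-based dispatch could work, assuming a direct CLIP path for short prompts and the chunking path for long ones (the name preprocess_prompt_sketch and the routing logic are illustrative assumptions, not the actual app.py body):

    # Illustrative sketch only; the real preprocess_prompt body is outside the hunk.
    def preprocess_prompt_sketch(input_text, max_clip_tokens=77):
        token_count = len(clip_processor.tokenizer(input_text)["input_ids"])
        if token_count <= max_clip_tokens:
            # Short prompt: encode directly with CLIP.
            return clip_processor(text=input_text, return_tensors="pt")
        # Long prompt: hand off to the chunking pipeline defined later in the file.
        return process_clip_chunks(input_text)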
@@ -68,13 +72,16 @@ def process_summarized_input(input_text):
     return inputs
 
 
-# Chunking Function (Option 3)
-def split_prompt(prompt, chunk_size=77):
-    """Splits a long prompt into chunks of the specified token size."""
+def split_prompt_with_overlap(prompt, chunk_size=77, overlap=10):
     tokens = clip_processor.tokenizer(prompt, return_tensors="pt")["input_ids"][0]
-    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
+    chunks = [
+        tokens[i:min(i + chunk_size, len(tokens))]
+        for i in range(0, len(tokens), chunk_size - overlap)
+    ]
     return chunks
-
+
+chunks = split_prompt_with_overlap("Test " * 200)
+assert all(len(chunk) <= 77 for chunk in chunks), "Chunk size exceeded"
 
 def process_clip_chunks(input_text):
     """
@@ -89,11 +96,6 @@ def process_clip_chunks(input_text):
     processed_chunks.append(inputs)
     return processed_chunks  # Return processed chunks for downstream usage
 
-
-# Example usage
-input_text = "Your long prompt goes here..."
-inputs = preprocess_prompt(input_text)
-
 # Load prompts for randomization
 df = pd.read_csv('prompts.csv', header=None)
 prompt_values = df.values.flatten()
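Most of process_clip_chunks falls outside the hunks; only the docstring opening and the final append/return are visible. A plausible reconstruction, assuming each token chunk is decoded back to text and re-encoded with the CLIP processor (the decode step and the _sketch name are assumptions):

    # Hypothetical reconstruction; only the last two lines appear in the diff.
    def process_clip_chunks_sketch(input_text):
        processed_chunks = []
        for chunk in split_prompt_with_overlap(input_text):
            # Decode the token IDs back to text so the processor can re-encode them.
            chunk_text = clip_processor.tokenizer.decode(chunk, skip_special_tokens=True)
            inputs = clip_processor(text=chunk_text, return_tensors="pt")
            processed_chunks.append(inputs)
        return processed_chunks  # Return processed chunks for downstream usage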
 
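prompt_values ends up as a flat array of prompt strings; drawing one at random would look something like this (the random.choice usage is an assumption about code outside this diff):

    import random

    # Assumed downstream usage: pick one prompt at random from the flattened CSV.
    random_prompt = random.choice(prompt_values)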