Keltezaa committed on
Commit
99ee3ff
·
verified ·
1 Parent(s): e3a6668

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -5
app.py CHANGED
@@ -28,16 +28,73 @@ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
28
  longformer_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
29
  longformer_model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
30
 
31
- def process_input(input_text):
32
- # Tokenize and truncate input
33
- inputs = clip_processor(text=input_text, return_tensors="pt", padding=True, truncation=True, max_length=77)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  return inputs
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  # Example usage
37
  input_text = "Your long prompt goes here..."
38
- inputs = process_input(input_text)
39
 
40
- #Load prompts for randomization
41
  df = pd.read_csv('prompts.csv', header=None)
42
  prompt_values = df.values.flatten()
43
 
 
28
  longformer_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
29
  longformer_model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
30
 
31
def preprocess_prompt(input_text, max_clip_tokens=77):
    """Route a prompt to summarization or chunking based on its token count.

    The prompt is tokenized with ``clip_processor.tokenizer`` using its
    default settings, and the number of ids it returns is compared with
    ``max_clip_tokens``.
    NOTE(review): for CLIP the default tokenization presumably includes the
    start/end markers in that count -- confirm against the tokenizer docs.

    Args:
        input_text: Prompt text to prepare.
        max_clip_tokens: Largest token count still sent down the
            summarization path. It only controls the routing decision here;
            the helpers called below are invoked without it.

    Returns:
        The result of ``process_summarized_input(input_text)`` when the
        prompt has at most ``max_clip_tokens`` tokens; otherwise the result
        of ``process_clip_chunks(input_text)``. The two helpers are separate
        functions, so callers should be prepared for either return shape.
    """
    # Tokenize once, only to measure the prompt length.
    tokens = clip_processor.tokenizer(input_text, return_tensors="pt")["input_ids"][0]
    token_count = len(tokens)

    if token_count > max_clip_tokens:
        # Report the limit actually in effect instead of a hard-coded 77, so
        # the message stays truthful when a caller overrides the default.
        print(f"Using chunking (Option 3) as the prompt exceeds {max_clip_tokens} tokens.")
        return process_clip_chunks(input_text)

    print("Using summarization (Option 5) as the prompt is short.")
    return process_summarized_input(input_text)
49
+
50
+
51
+ # Summarization Function (Option 5)
52
def summarize_prompt(input_text, max_length=77):
    """Return the first ``max_length`` whitespace-separated words of a prompt.

    This is a naive stand-in for real summarization: it keeps the leading
    words and discards the rest. The limit counts *words*, not tokenizer
    tokens, so the result is not guaranteed to fit a token budget of the
    same size. Runs of whitespace are collapsed to single spaces as a side
    effect of splitting and re-joining.

    Args:
        input_text: Prompt text to shorten.
        max_length: Maximum number of words to keep. Values below zero are
            treated as zero.

    Returns:
        The shortened prompt (an empty string when nothing is kept). The
        result is also printed to stdout.
    """
    # A negative bound would make the slice count from the end and keep
    # "all but the last N words", which is never what a length cap means.
    word_limit = max(max_length, 0)
    summarized_text = " ".join(input_text.split()[:word_limit])
    print(f"Summarized prompt: {summarized_text}")
    return summarized_text
60
+
61
+
62
def process_summarized_input(input_text):
    """Shorten a prompt and run it through the CLIP processor.

    The text is first reduced with ``summarize_prompt`` (77-word cap) and the
    result is handed to ``clip_processor`` with padding and truncation
    enabled at a maximum length of 77.

    Args:
        input_text: Prompt text to prepare.

    Returns:
        Whatever ``clip_processor`` returns for the shortened text, requested
        as PyTorch tensors.
    """
    shortened = summarize_prompt(input_text, max_length=77)
    return clip_processor(
        text=shortened,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77,
    )
69
 
70
+
71
+ # Chunking Function (Option 3)
72
def split_prompt(prompt, chunk_size=77):
    """Split a prompt into runs of token ids that each fit one model window.

    ``chunk_size`` is the total per-chunk budget *including* the start and
    end markers the tokenizer adds when a chunk is encoded again. Two slots
    are therefore reserved, and each returned chunk holds at most
    ``chunk_size - 2`` content ids with no special tokens in it.

    Previously the ids were cut from a sequence that already contained the
    markers and used the full budget, so decoding a chunk and re-encoding it
    with truncation at the same size silently dropped trailing content.

    Args:
        prompt: Prompt text to split.
        chunk_size: Token budget per chunk, markers included. Must be
            greater than 2.

    Returns:
        A list of 1-D id sequences (slices of the tokenizer's ``"pt"``
        output) in prompt order. A prompt with no content tokens yields a
        single empty chunk so callers always receive at least one element.

    Raises:
        ValueError: If ``chunk_size`` leaves no room for content tokens.
    """
    reserved_for_markers = 2  # start-of-text + end-of-text
    content_per_chunk = chunk_size - reserved_for_markers
    if content_per_chunk <= 0:
        raise ValueError(
            f"chunk_size must be greater than {reserved_for_markers}, got {chunk_size}"
        )

    # Tokenize without markers so every id in a chunk is real prompt content.
    tokens = clip_processor.tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    )["input_ids"][0]

    chunks = [
        tokens[start:start + content_per_chunk]
        for start in range(0, len(tokens), content_per_chunk)
    ]
    # NOTE(review): a chunk boundary can fall inside a word, so a decoded
    # chunk may re-tokenize to a slightly different id count -- the reserved
    # slots cover the markers, not that effect.
    return chunks or [tokens]
77
+
78
+
79
def process_clip_chunks(input_text):
    """Prepare a long prompt for CLIP as a list of per-chunk processor outputs.

    The prompt is divided by ``split_prompt``; each chunk of ids is decoded
    back to text (special tokens skipped) and passed to ``clip_processor``
    with padding and truncation enabled at a maximum length of 77.

    Args:
        input_text: Prompt text to prepare.

    Returns:
        A list with one ``clip_processor`` result per chunk, in prompt order,
        for downstream use.
    """
    decode = clip_processor.tokenizer.decode
    return [
        clip_processor(
            text=decode(piece, skip_special_tokens=True),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77,
        )
        for piece in split_prompt(input_text)
    ]
91
+
92
+
93
  # Example usage
94
  input_text = "Your long prompt goes here..."
95
+ inputs = preprocess_prompt(input_text)
96
 
97
+ # Load prompts for randomization
98
  df = pd.read_csv('prompts.csv', header=None)
99
  prompt_values = df.values.flatten()
100