asaf1602 committed
Commit b8397a5 · verified · 1 Parent(s): c29bac5

Upload folder using huggingface_hub
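For context, commits with this message are typically produced by huggingface_hub's `upload_folder` helper; a minimal sketch of that call (the folder path, repo id, and token handling below are placeholders, not values from this Space):

```python
# Hypothetical sketch of the upload that produces a commit like this one.
# The folder_path and repo_id are placeholders, not taken from this repository.
from huggingface_hub import HfApi

api = HfApi()  # assumes a token is already configured, e.g. via `huggingface-cli login`
api.upload_folder(
    folder_path=".",                    # local folder to push
    repo_id="your-username/your-space", # placeholder Space id
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)
```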
Files changed (8)
  1. README.md +6 -7
  2. app.py +267 -381
  3. data/prompt.txt +22 -0
  4. data/slogan.csv +0 -0
  5. logic/cleaning.py +96 -0
  6. logic/search.py +45 -0
  7. requirements.txt +8 -7
  8. runtime.txt +1 -0
README.md CHANGED
@@ -1,14 +1,13 @@
  ---
- title: SloganAI
- emoji: 🏢
- colorFrom: pink
+ title: Slogan Finder
+ emoji: 🏷️
+ colorFrom: yellow
  colorTo: green
  sdk: gradio
- sdk_version: 5.43.1
+ sdk_version: "5.43.1"
  app_file: app.py
  pinned: false
- license: mit
- short_description: Startup recommender with AI-generated slogans
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Slogan Finder
+ Search *real slogans* (SBERT + FAISS) and get *1 AI-generated* suggestion.
app.py CHANGED
@@ -1,391 +1,277 @@
-
- import os, re, json
- import numpy as np
- import pandas as pd
  import gradio as gr
  import faiss
- import torch
- from typing import List
- from sentence_transformers import SentenceTransformer, CrossEncoder
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

- # =========================
- # Global Config
- # =========================
- # Models (same setup as in the notebook; falls back to base if the large model does not fit in memory)
- FLAN_PRIMARY = os.getenv("FLAN_PRIMARY", "google/flan-t5-large")
- FLAN_FALLBACK = "google/flan-t5-base"
- EMBED_NAME = "sentence-transformers/all-mpnet-base-v2"
- RERANK_NAME = "cross-encoder/stsb-roberta-base"
-
- NUM_SLOGAN_SAMPLES = int(os.getenv("NUM_SLOGAN_SAMPLES", "16")) # can be raised to 32 if a GPU is available
- INDEX_ROOT = os.path.join(os.path.dirname(__file__), "vector_store") # where the indexes were saved
- DEFAULT_MODEL_FOR_INDEX = EMBED_NAME
-
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- # =========================
- # Lazy model loading (first call only)
- # =========================
- _GEN_TOK = None
- _GEN_MODEL = None
- _EMBED_MODEL = None
- _RERANKER = None
-
- def _ensure_models():
-     global _GEN_TOK, _GEN_MODEL, _EMBED_MODEL, _RERANKER
-     if _EMBED_MODEL is None:
-         _EMBED_MODEL = SentenceTransformer(EMBED_NAME)
-     if _RERANKER is None:
-         _RERANKER = CrossEncoder(RERANK_NAME)
-
-     if _GEN_MODEL is None:
-         try:
-             tok = AutoTokenizer.from_pretrained(FLAN_PRIMARY)
-             mdl = AutoModelForSeq2SeqLM.from_pretrained(FLAN_PRIMARY)
-             _GEN_TOK, _GEN_MODEL = tok, mdl.to(DEVICE)
-             print(f"[INFO] Loaded generator: {FLAN_PRIMARY}")
-         except Exception as e:
-             print(f"[WARN] Failed to load {FLAN_PRIMARY}. Falling back to {FLAN_FALLBACK}. Error: {e}")
-             tok = AutoTokenizer.from_pretrained(FLAN_FALLBACK)
-             mdl = AutoModelForSeq2SeqLM.from_pretrained(FLAN_FALLBACK)
-             _GEN_TOK, _GEN_MODEL = tok, mdl.to(DEVICE)
-             print(f"[INFO] Loaded generator: {FLAN_FALLBACK}")
-
- # =========================
- # Index cache (so we don't read multiple times)
- # =========================
- _INDEX_CACHE = {} # model_key -> (faiss_index, meta_df)
-
- def _model_key(name: str) -> str:
-     return name.replace("/", "_")
-
- def _format_for_e5(texts, as_query=False):
-     prefix = "query: " if as_query else "passage: "
-     return [prefix + str(t) for t in texts]
-
- def _load_index_for_model(model_name: str = DEFAULT_MODEL_FOR_INDEX):
-     """Load FAISS index + meta once for a given model."""
-     mkey = _model_key(model_name)
-     if mkey in _INDEX_CACHE:
-         return _INDEX_CACHE[mkey]
-
-     base = os.path.join(INDEX_ROOT, mkey)
-     idx_path = os.path.join(base, "index.faiss")
-     meta_path = os.path.join(base, "meta.parquet")
-
-     if not (os.path.exists(idx_path) and os.path.exists(meta_path)):
-         # fallback: tiny demo index (3 rows) if user didn't push vector_store
-         print(f"[WARN] Missing index for {model_name}. Using tiny demo in-memory index.")
-         demo = pd.DataFrame({
-             "name": ["HowDidIDo", "Museotainment", "Movitr"],
-             "tagline": ["Online evaluation platform", "PacMan & Louvre meet", "Crowdsourced video translation"],
-             "description": [
-                 "Public speaking, Presentation skills and interview practice",
-                 "Interactive AR museum tours",
-                 "Video translation with voice and subtitles"
-             ]
-         })
-         model = SentenceTransformer(model_name)
-         vecs = model.encode(demo["description"].tolist(), normalize_embeddings=True)
-         dim = vecs.shape[1]
-         index = faiss.IndexFlatIP(dim)
-         index.add(np.asarray(vecs, dtype=np.float32))
-         _INDEX_CACHE[mkey] = (index, demo)
-         return _INDEX_CACHE[mkey]
-
-     index = faiss.read_index(idx_path)
-     meta_df = pd.read_parquet(meta_path)
-     _INDEX_CACHE[mkey] = (index, meta_df)
-     return _INDEX_CACHE[mkey]
-
- # =========================
- # Recommendation (top-3) using FAISS index you generated
- # =========================
- def recommend(query_text: str, model_name: str = DEFAULT_MODEL_FOR_INDEX, top_k: int = 3) -> pd.DataFrame:
-     _ensure_models()
-     index, meta = _load_index_for_model(model_name)
-
-     # format for E5 if needed
-     if model_name.startswith("intfloat/e5"):
-         q_inp = _format_for_e5([query_text], as_query=True)
      else:
-         q_inp = [query_text]
-
-     q_vec = _EMBED_MODEL.encode(q_inp, normalize_embeddings=True)
-     q_vec = np.asarray(q_vec, dtype=np.float32)
-     scores, idxs = index.search(q_vec, top_k)
-     scores, idxs = scores[0], idxs[0]
-     out = meta.iloc[idxs].copy()
-     out["score"] = scores
-     # make sure columns exist in output (name, tagline, description)
-     cols = [c for c in ["row_id","name","tagline","description","score"] if c in out.columns or c=="score"]
-     return out[cols] if "score" in out.columns else out
-
- # =========================
- # Advanced Slogan Generator (your Refined v2 logic)
- # =========================
- BLOCK_PATTERNS = [
-     r"^[A-Z][a-z]+ [A-Z][a-z]+ (Platform|Solution|System|Application|Marketplace)$",
-     r"^[A-Z][a-z]+ [A-Z][a-z]+$",
-     r"^[A-Z][a-z]+$",
- ]
- HARD_BLOCK_WORDS = {
-     "platform","solution","system","application","marketplace",
-     "ai-powered","ai powered","empower","empowering",
-     "artificial intelligence","machine learning","augmented reality","virtual reality",
- }
- GENERIC_WORDS = {"app","assistant","smart","ai","ml","ar","vr","decentralized","blockchain"}
- MARKETING_VERBS = {"build","grow","simplify","discover","create","connect","transform","unlock","boost","learn","move","clarify"}
- BENEFIT_WORDS = {"faster","smarter","easier","better","safer","clearer","stronger","together","confidently","simply","instantly"}
- GOOD_SLOGANS_TO_AVOID_DUP = {
-     "smarter care, faster decisions",
-     "checkout built for small brands",
-     "less guessing. more healing.",
-     "built to grow with your cart.",
-     "stand tall. feel better.",
-     "train your brain to win.",
-     "your body. your algorithm.",
-     "play smarter. grow brighter.",
-     "style that thinks with you."
- }
-
- def _tokens(s: str) -> List[str]:
-     return re.findall(r"[a-z0-9]{3,}", s.lower())
-
- def _jaccard(a: List[str], b: List[str]) -> float:
-     A, B = set(a), set(b)
-     return 0.0 if not A or not B else len(A & B) / len(A | B)
-
- def _titlecase_soft(s: str) -> str:
      out = []
-     for w in s.split():
-         out.append(w if w.isupper() else w.capitalize())
      return " ".join(out)

- def _is_blocked_slogan(s: str) -> bool:
-     if not s: return True
-     s_strip = s.strip()
-     for pat in BLOCK_PATTERNS:
-         if re.match(pat, s_strip):
-             return True
-     s_low = s_strip.lower()
-     for w in HARD_BLOCK_WORDS:
-         if w in s_low:
-             return True
-     if s_low in GOOD_SLOGANS_TO_AVOID_DUP:
-         return True
-     return False
-
- def _generic_penalty(s: str) -> float:
-     hits = sum(1 for w in GENERIC_WORDS if w in s.lower())
-     return min(1.0, 0.25 * hits)
-
- def _for_penalty(s: str) -> float:
-     return 0.3 if re.search(r"\bfor\b", s.lower()) else 0.0
-
- def _neighbor_context(neighbors_df: pd.DataFrame) -> str:
-     if neighbors_df is None or neighbors_df.empty:
-         return ""
-     examples = []
-     for _, row in neighbors_df.head(3).iterrows():
-         tg = str(row.get("tagline", "")).strip()
-         if 5 <= len(tg) <= 70:
-             examples.append(f"- {tg}")
-     return "\n".join(examples)
-
- def _copies_neighbor(s: str, neighbors_df: pd.DataFrame) -> bool:
-     if neighbors_df is None or neighbors_df.empty:
-         return False
-     s_low = s.lower()
-     s_toks = _tokens(s_low)
-     for _, row in neighbors_df.iterrows():
-         t = str(row.get("tagline", "")).strip()
-         if not t:
-             continue
-         t_low = t.lower()
-         if s_low == t_low:
-             return True
-         if _jaccard(s_toks, _tokens(t_low)) >= 0.7:
-             return True
-     try:
-         s_vec = _EMBED_MODEL.encode([s])[0]; s_vec = s_vec / np.linalg.norm(s_vec)
-         for _, row in neighbors_df.head(3).iterrows():
-             t = str(row.get("tagline", "")).strip()
-             if not t: continue
-             t_vec = _EMBED_MODEL.encode([t])[0]; t_vec = t_vec / np.linalg.norm(t_vec)
-             if float(np.dot(s_vec, t_vec)) >= 0.85:
-                 return True
-     except Exception:
-         pass
-     return False
-
- def _clean_slogan(text: str, max_words: int = 8) -> str:
-     text = text.strip().split("\n")[0]
-     text = re.sub(r"[\"“”‘’]", "", text)
-     text = re.sub(r"\s+", " ", text).strip()
-     text = re.sub(r"^\W+|\W+$", "", text)
-     words = text.split()
-     if len(words) > max_words:
-         text = " ".join(words[:max_words])
-     return text
-
- def _score_candidates(query: str, cands: List[str], neighbors_df: pd.DataFrame) -> List[tuple]:
-     if not cands:
-         return []
-     ce_scores = np.asarray(_RERANKER.predict([(query, s) for s in cands]), dtype=np.float32) / 5.0
-     q_toks = _tokens(query)
-     results = []
-
-     neighbor_vecs = []
-     if neighbors_df is not None and not neighbors_df.empty:
-         for _, row in neighbors_df.head(3).iterrows():
-             t = str(row.get("tagline","")).strip()
-             if t:
-                 v = _EMBED_MODEL.encode([t])[0]
-                 neighbor_vecs.append(v / np.linalg.norm(v))
-
-     for i, s in enumerate(cands):
-         words = s.split()
-         brevity = 1.0 - min(1.0, abs(len(words) - 5) / 5.0) # best ~5 words
-         wl = set(w.lower() for w in words)
-         m_hits = len(wl & MARKETING_VERBS)
-         b_hits = len(wl & BENEFIT_WORDS)
-         marketing = min(1.0, 0.2*m_hits + 0.2*b_hits)
-         g_pen = _generic_penalty(s)
-         f_pen = _for_penalty(s)
-
-         n_pen = 0.0
-         if neighbor_vecs:
-             try:
-                 s_vec = _EMBED_MODEL.encode([s])[0]; s_vec = s_vec / np.linalg.norm(s_vec)
-                 sim_max = max(float(np.dot(s_vec, nv)) for nv in neighbor_vecs) if neighbor_vecs else 0.0
-                 n_pen = sim_max
-             except Exception:
-                 n_pen = 0.0
-
-         overlap = _jaccard(q_toks, _tokens(s))
-         anti_copy = 1.0 - overlap
-
-         score = (
-             0.55*float(ce_scores[i]) +
-             0.20*brevity +
-             0.15*marketing +
-             0.03*anti_copy -
-             0.07*g_pen -
-             0.03*f_pen -
-             0.10*n_pen
-         )
-         results.append((s, float(score)))
-     return results
-
- def generate_slogan(query_text: str, neighbors_df: pd.DataFrame = None, n_samples: int = NUM_SLOGAN_SAMPLES) -> str:
-     _ensure_models()
-     ctx = _neighbor_context(neighbors_df)
-     prompt = (
-         "You are a creative brand copywriter. Write short, original, memorable startup slogans (max 8 words).\n"
-         "Forbidden words: app, assistant, platform, solution, system, marketplace, AI, machine learning, augmented reality, virtual reality, decentralized, empower.\n"
-         "Focus on clear benefits and vivid verbs. Do not copy the description. Return ONLY a list, one slogan per line.\n\n"
-         "Good Examples:\n"
-         "Description: AI assistant for doctors to prioritize patient cases\n"
-         "Slogan: Less Guessing. More Healing.\n\n"
-         "Description: Payments for small online stores\n"
-         "Slogan: Built to Grow with Your Cart.\n\n"
-         "Description: Neurotech headset to boost focus\n"
-         "Slogan: Train Your Brain to Win.\n\n"
-         "Description: Interior design suggestions with AI\n"
-         "Slogan: Style That Thinks With You.\n\n"
-         "Bad Examples (avoid these): Innovative AI Platform / Smart App for Everyone / Empowering Small Businesses\n\n"
-     )
-     if ctx:
-         prompt += f"Similar taglines (style only):\n{ctx}\n\n"
-     prompt += f"Description: {query_text}\nSlogans:"
-
-     input_ids = _GEN_TOK(prompt, return_tensors="pt").input_ids.to(DEVICE)
-     outputs = _GEN_MODEL.generate(
-         input_ids,
-         max_new_tokens=24,
          do_sample=True,
-         top_k=60,
-         top_p=0.92,
-         temperature=1.2,
-         num_return_sequences=n_samples,
-         repetition_penalty=1.08
      )
-     raw_cands = [_GEN_TOK.decode(o, skip_special_tokens=True) for o in outputs]
-
-     cand_set = set()
-     for txt in raw_cands:
-         for line in txt.split("\n"):
-             s = _clean_slogan(line)
-             if not s:
-                 continue
-             if len(s.split()) < 2 or len(s.split()) > 8:
-                 continue
-             if _is_blocked_slogan(s):
-                 continue
-             if _copies_neighbor(s, neighbors_df):
-                 continue
-             cand_set.add(_titlecase_soft(s))
-
-     if not cand_set:
-         return _clean_slogan(_GEN_TOK.decode(outputs[0], skip_special_tokens=True))
-
-     scored = _score_candidates(query_text, sorted(cand_set), neighbors_df)
-     if not scored:
-         return _clean_slogan(_GEN_TOK.decode(outputs[0], skip_special_tokens=True))
-
-     scored.sort(key=lambda x: x[1], reverse=True)
-     return scored[0][0]
-
- # =========================
- # Gradio Pipeline
- # =========================
- EXAMPLES = [
-     "AI coach for improving public speaking skills",
-     "Augmented reality app for interactive museum tours",
-     "Voice-controlled task manager for remote teams",
-     "Machine learning system for predicting crop yields",
-     "Platform for AI-assisted interior design suggestions",
- ]
-
- def pipeline(user_input: str):
-     # 1) Top-3 recommendations from your FAISS index (mpnet by default)
-     recs = recommend(user_input, model_name=DEFAULT_MODEL_FOR_INDEX, top_k=3)
-
-     # 2) Generate slogan using the neighbors as style context
-     slogan = generate_slogan(user_input, neighbors_df=recs, n_samples=NUM_SLOGAN_SAMPLES)
-
-     # 3) Append the generated item as the 4th row
-     recs = recs.reset_index(drop=True)
-     # Ensure columns exist
-     if "name" not in recs.columns: recs["name"] = ""
-     if "tagline" not in recs.columns: recs["tagline"] = ""
-     if "description" not in recs.columns: recs["description"] = ""
-
-     recs.loc[len(recs)] = {
-         "row_id": np.nan,
-         "name": "Synthetic Example",
-         "tagline": slogan,
-         "description": user_input,
-         "score": np.nan
-     }
-     # Second output: the slogan itself (visible headline)
-     return recs[["name","tagline","description","score"]], slogan
-
- with gr.Blocks(title="SloganAI — Recommendations + Slogan Generator") as demo:
-     gr.Markdown("## SloganAI — Top-3 Recommendations + A High-Quality Generated Slogan\nEnter a startup idea, click **Submit**, or try an example.")
-     with gr.Row():
-         with gr.Column(scale=1):
-             inp = gr.Textbox(label="Enter a startup description", lines=3, placeholder="e.g., AI coach for improving public speaking skills")
-             ex = gr.Examples(EXAMPLES, inputs=inp, label="One‑click examples")
-             btn = gr.Button("Submit", variant="primary")
-         with gr.Column(scale=2):
-             out_df = gr.Dataframe(headers=["Name","Tagline","Description","Score"], label="Top 3 + Generated")
-             out_sg = gr.Textbox(label="Generated Slogan", interactive=False)
-
-     btn.click(fn=pipeline, inputs=inp, outputs=[out_df, out_sg])
-
- if __name__ == "__main__":
-     _ensure_models()
-     demo.queue().launch()
+ \
+ import os, json, numpy as np, pandas as pd
  import gradio as gr
  import faiss
+ import re
+ from sentence_transformers import SentenceTransformer
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

+ from logic.cleaning import clean_dataframe
+ from logic.search import SloganSearcher
+
+ # -------------------- Config --------------------
+ ASSETS_DIR = "assets"
+ DATA_PATH = "data/slogan.csv"
+ PROMPT_PATH = "data/prompt.txt"
+
+ MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+ NORMALIZE = True
+
+ GEN_MODEL = "google/flan-t5-base"
+ NUM_GEN_CANDIDATES = 12
+ MAX_NEW_TOKENS = 18
+ TEMPERATURE = 0.7
+ TOP_P = 0.9
+ REPETITION_PENALTY = 1.15
+
+ # choose the most relevant yet non-duplicate candidate
+ RELEVANCE_WEIGHT = 0.7
+ NOVELTY_WEIGHT = 0.3
+ DUPLICATE_MAX_SIM = 0.92
+ NOVELTY_SIM_THRESHOLD = 0.80 # keep some distance from retrieved
+
+ META_PATH = os.path.join(ASSETS_DIR, "meta.json")
+ PARQUET_PATH = os.path.join(ASSETS_DIR, "slogans_clean.parquet")
+ INDEX_PATH = os.path.join(ASSETS_DIR, "faiss.index")
+ EMB_PATH = os.path.join(ASSETS_DIR, "embeddings.npy")
+
+ def _log(m): print(f"[SLOGAN-SPACE] {m}", flush=True)
+
+ # -------------------- Asset build --------------------
+ def _build_assets():
+     if not os.path.exists(DATA_PATH):
+         raise FileNotFoundError(f"Dataset not found at {DATA_PATH} (CSV with columns: 'tagline', 'description').")
+     os.makedirs(ASSETS_DIR, exist_ok=True)
+
+     _log(f"Loading dataset: {DATA_PATH}")
+     df = pd.read_csv(DATA_PATH)
+
+     _log(f"Rows before cleaning: {len(df)}")
+     df = clean_dataframe(df)
+     _log(f"Rows after cleaning: {len(df)}")
+
+     if "description" in df.columns and df["description"].notna().any():
+         texts = df["description"].fillna(df["tagline"]).astype(str).tolist()
+         text_col, fallback_col = "description", "tagline"
      else:
+         texts = df["tagline"].astype(str).tolist()
+         text_col, fallback_col = "tagline", "tagline"
+
+     _log(f"Encoding with {MODEL_NAME} (normalize={NORMALIZE}) …")
+     encoder = SentenceTransformer(MODEL_NAME)
+     emb = encoder.encode(texts, batch_size=64, convert_to_numpy=True, normalize_embeddings=NORMALIZE)
+
+     dim = emb.shape[1]
+     index = faiss.IndexFlatIP(dim) if NORMALIZE else faiss.IndexFlatL2(dim)
+     index.add(emb)
+
+     _log("Persisting assets …")
+     df.to_parquet(PARQUET_PATH, index=False)
+     faiss.write_index(index, INDEX_PATH)
+     np.save(EMB_PATH, emb)
+
+     meta = {
+         "model_name": MODEL_NAME,
+         "dim": int(dim),
+         "normalized": NORMALIZE,
+         "metric": "ip" if NORMALIZE else "l2",
+         "row_count": int(len(df)),
+         "text_col": text_col,
+         "fallback_col": fallback_col,
+     }
+     with open(META_PATH, "w") as f:
+         json.dump(meta, f, indent=2)
+     _log("Assets built successfully.")
+
+ def _ensure_assets():
+     need = False
+     for p in (META_PATH, PARQUET_PATH, INDEX_PATH):
+         if not os.path.exists(p):
+             _log(f"Missing asset: {p}")
+             need = True
+     if need:
+         _log("Building assets from scratch …")
+         _build_assets()
+         return
+     try:
+         pd.read_parquet(PARQUET_PATH)
+     except Exception as e:
+         _log(f"Parquet read failed ({e}); rebuilding assets.")
+         _build_assets()
+
+ # Build before UI
+ _ensure_assets()
+
+ # -------------------- Retrieval --------------------
+ searcher = SloganSearcher(assets_dir=ASSETS_DIR, use_rerank=False)
+ meta = json.load(open(META_PATH))
+ _encoder = SentenceTransformer(meta["model_name"])
+
+ # -------------------- Generator --------------------
+ _gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
+ _gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL)
+
+ # keep this list small so we don't nuke relevant outputs
+ _BANNED_TERMS = {"portal", "e-commerce", "ecommerce", "shopping", "shop"}
+ _PUNCT_CHARS = ":;—–-,.!?“”\"'`"
+ _PUNCT_RE = re.compile(f"[{re.escape(_PUNCT_CHARS)}]")
+
+ _MIN_WORDS, _MAX_WORDS = 2, 8
+
+ def _load_prompt():
+     if os.path.exists(PROMPT_PATH):
+         with open(PROMPT_PATH, "r", encoding="utf-8") as f:
+             return f.read()
+     return (
+         "You are a professional slogan writer.\n"
+         "Write ONE original startup slogan under 8 words, Title Case, no punctuation.\n"
+         "Do not copy examples.\n"
+         "Description:\n{description}\nSlogan:"
+     )
+
+ def _render_prompt(description: str, retrieved=None) -> str:
+     tmpl = _load_prompt()
+     if "{description}" in tmpl:
+         prompt = tmpl.replace("{description}", description)
+     else:
+         prompt = f"{tmpl}\n\nDescription:\n{description}\nSlogan:"
+     if retrieved:
+         prompt += "\n\nDo NOT copy these existing slogans:\n"
+         for s in retrieved[:3]:
+             prompt += f"- {s}\n"
+     return prompt
+
+ def _title_case(s: str) -> str:
+     small = {"and","or","for","of","the","to","in","on","with","a","an"}
+     words = [w for w in s.split() if w]
      out = []
+     for i,w in enumerate(words):
+         lw = w.lower()
+         if i>0 and lw in small: out.append(lw)
+         else: out.append(lw.capitalize())
      return " ".join(out)

+ def _strip_punct(s: str) -> str:
+     return _PUNCT_RE.sub("", s)
+
+ def _strict_ok(s: str) -> bool:
+     if not s: return False
+     wc = len(s.split())
+     if wc < _MIN_WORDS or wc > _MAX_WORDS: return False
+     lo = s.lower()
+     if any(term in lo for term in _BANNED_TERMS): return False
+     if lo in {"the","a","an"}: return False
+     return True
+
+ def _postprocess_strict(texts):
+     cleaned, seen = [], set()
+     for t in texts:
+         s = t.replace("Slogan:", "").strip().strip('"').strip("'")
+         s = " ".join(s.split())
+         s = _strip_punct(s) # remove punctuation instead of rejecting
+         s = _title_case(s)
+         if _strict_ok(s):
+             k = s.lower()
+             if k not in seen:
+                 seen.add(k); cleaned.append(s)
+     return cleaned
+
+ def _postprocess_relaxed(texts):
+     # fallback if strict returns nothing: keep 2–8 words, strip punctuation, Title Case
+     cleaned, seen = [], set()
+     for t in texts:
+         s = t.strip().strip('"').strip("'")
+         s = _strip_punct(s)
+         s = " ".join(s.split())
+         wc = len(s.split())
+         if _MIN_WORDS <= wc <= _MAX_WORDS:
+             s = _title_case(s)
+             k = s.lower()
+             if k not in seen:
+                 seen.add(k); cleaned.append(s)
+     return cleaned
+
+ def _generate_candidates(description: str, retrieved_texts, n: int = NUM_GEN_CANDIDATES):
+     prompt = _render_prompt(description, retrieved_texts)
+
+     # only block very generic junk at decode time
+     bad_ids = _gen_tokenizer(list(_BANNED_TERMS), add_special_tokens=False).input_ids
+
+     inputs = _gen_tokenizer([prompt], return_tensors="pt", padding=True, truncation=True)
+     outputs = _gen_model.generate(
+         **inputs,
          do_sample=True,
+         temperature=TEMPERATURE,
+         top_p=TOP_P,
+         num_return_sequences=n,
+         max_new_tokens=MAX_NEW_TOKENS,
+         no_repeat_ngram_size=3,
+         repetition_penalty=REPETITION_PENALTY,
+         bad_words_ids=bad_ids if bad_ids else None,
+         eos_token_id=_gen_tokenizer.eos_token_id,
      )
+     texts = _gen_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+     cands = _postprocess_strict(texts)
+     if not cands:
+         cands = _postprocess_relaxed(texts) # <- graceful fallback
+     return cands
+
+ def _pick_best(candidates, retrieved_texts, description):
+     """Weighted relevance to description minus duplication vs retrieved."""
+     if not candidates:
+         return None
+     c_emb = _encoder.encode(candidates, convert_to_numpy=True, normalize_embeddings=True)
+     d_emb = _encoder.encode([description], convert_to_numpy=True, normalize_embeddings=True)[0]
+     rel = c_emb @ d_emb # cosine sim to description
+
+     if retrieved_texts:
+         R = _encoder.encode(retrieved_texts, convert_to_numpy=True, normalize_embeddings=True)
+         dup = np.max(R @ c_emb.T, axis=0) # max sim to any retrieved
+     else:
+         dup = np.zeros(len(candidates), dtype=np.float32)
+
+     # penalize near-duplicates outright
+     mask = dup < DUPLICATE_MAX_SIM
+     if mask.any():
+         scores = RELEVANCE_WEIGHT * rel[mask] - NOVELTY_WEIGHT * dup[mask]
+         best_idx = np.argmax(scores)
+         return [c for i, c in enumerate(candidates) if mask[i]][best_idx]
+
+     # else: pick most relevant that still clears a basic novelty bar, else top score
+     scores = RELEVANCE_WEIGHT * rel - NOVELTY_WEIGHT * dup
+     order = np.argsort(-scores)
+     for i in order:
+         if dup[i] < NOVELTY_SIM_THRESHOLD:
+             return candidates[i]
+     return candidates[order[0]]
+
+ # -------------------- Inference pipeline --------------------
+ def run_pipeline(user_description: str):
+     if not user_description or not user_description.strip():
+         return "Please enter a description."
+     retrieved_df = searcher.search(user_description, top_k=3, rerank_top_n=10)
+     retrieved_texts = retrieved_df["display"].tolist() if not retrieved_df.empty else []
+     gens = _generate_candidates(user_description, retrieved_texts, NUM_GEN_CANDIDATES)
+     chosen = _pick_best(gens, retrieved_texts, user_description) or (gens[0] if gens else "—")
+     lines = []
+     lines.append("### 🔎 Top 3 similar slogans")
+     if retrieved_texts:
+         for i, s in enumerate(retrieved_texts, 1):
+             lines.append(f"{i}. {s}")
+     else:
+         lines.append("No similar slogans found.")
+     lines.append("\n### AI-generated suggestion")
+     lines.append(chosen)
+     return "\n".join(lines)
+
+ # -------------------- UI --------------------
+ with gr.Blocks(title="Slogan Finder") as demo:
+     gr.Markdown("# 🔎 Slogan Finder\nDescribe your product/company; get 3 similar slogans + 1 AI-generated suggestion.")
+     query = gr.Textbox(label="Describe your product/company", placeholder="AI-powered patient financial navigation platform...")
+     btn = gr.Button("Get slogans", variant="primary")
+     out = gr.Markdown()
+     btn.click(run_pipeline, inputs=[query], outputs=out)
+
+ demo.queue(max_size=64).launch()
+
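To make the selection rule in `_pick_best` concrete, a toy calculation with invented similarity numbers (not real model outputs):

```python
# Invented numbers illustrating the weighted pick in _pick_best above.
import numpy as np

rel = np.array([0.80, 0.90, 0.60])  # cosine similarity of each candidate to the description
dup = np.array([0.50, 0.95, 0.10])  # max similarity of each candidate to any retrieved slogan

mask = dup < 0.92                           # DUPLICATE_MAX_SIM: near-copies are dropped outright
scores = 0.7 * rel[mask] - 0.3 * dup[mask]  # RELEVANCE_WEIGHT / NOVELTY_WEIGHT
print(scores)                               # [0.41 0.39] -> the first surviving candidate wins
```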
 
 
 
 
 
 
 
 
 
 
 
 
data/prompt.txt ADDED
@@ -0,0 +1,22 @@
+ You are a creative brand copywriter. Write short, original, memorable startup slogans (max 8 words).
+ Forbidden words: app, assistant, platform, solution, system, marketplace, AI, machine learning, augmented reality, virtual reality, decentralized, empower.
+ Focus on clear benefits and vivid verbs. Do not copy the description. Return ONLY a list, one slogan per line.
+
+ Good Examples:
+ Description: AI assistant for doctors to prioritize patient cases
+ Slogan: Less Guessing. More Healing.
+
+ Description: Payments for small online stores
+ Slogan: Built to Grow with Your Cart.
+
+ Description: Neurotech headset to boost focus
+ Slogan: Train Your Brain to Win.
+
+ Description: Interior design suggestions with AI
+ Slogan: Style That Thinks With You.
+
+ Bad Examples (avoid these): Innovative AI Platform / Smart App for Everyone / Empowering Small Businesses
+
+ Description:
+ {description}
+ Slogan:
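For reference, a minimal sketch of how this template's {description} placeholder gets filled (it mirrors `_render_prompt` in app.py above; the description string is an invented example):

```python
# Hypothetical rendering of data/prompt.txt; the description is an invented example.
with open("data/prompt.txt", "r", encoding="utf-8") as f:
    template = f.read()

prompt = template.replace("{description}", "AI coach for improving public speaking skills")
print(prompt.splitlines()[-2:])  # ends with the filled description followed by "Slogan:"
```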
data/slogan.csv ADDED
The diff for this file is too large to render. See raw diff
 
logic/cleaning.py ADDED
@@ -0,0 +1,96 @@
+ \
+ import pandas as pd
+ import re, unicodedata
+ from html import unescape
+
+ MIN_LEN = 20
+ MAX_LEN = 60
+ KEEP_ASCII_ONLY = False
+ MIN_ALPHA_RATIO = 0.60
+ DROP_IF_ALL_CAPS = False
+
+ BUZZY = {
+     "synergy","cutting edge","cutting-edge","best in class","best-in-class",
+     "world class","world-class","state of the art","state-of-the-art",
+     "revolutionary","disruptive platform","next generation","next-gen",
+     "leading provider","scalable solution"
+ }
+
+ URL_RE = re.compile(r"(https?://|www\.)\S+", re.I)
+ EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)
+ PHONE_RE = re.compile(r"(\+?\d[\d\-\s()]{6,}\d)")
+ WS_RE = re.compile(r"\s+")
+ PUNCT_RE = re.compile(r"[^\w\s]+")
+ TM_RE = re.compile(r"[®️©️™️]")
+
+ def _nfkc(s): return unicodedata.normalize("NFKC", s)
+
+ def _clean_text(s: str) -> str:
+     s = "" if s is None else str(s)
+     s = unescape(s)
+     s = _nfkc(s)
+     s = s.replace("\\n"," ").replace("\\r"," ")
+     s = TM_RE.sub("", s)
+     s = WS_RE.sub(" ", s).strip()
+     return s
+
+ def _alpha_ratio(s: str) -> float:
+     if not s: return 0.0
+     letters = sum(ch.isalpha() for ch in s)
+     return letters / max(1, len(s))
+
+ def _looks_shouty(s: str) -> bool:
+     letters = [ch for ch in s if ch.isalpha()]
+     if not letters: return False
+     uppers = sum(ch.isupper() for ch in letters)
+     return uppers / len(letters) >= 0.85
+
+ def _contains_buzzy(s: str) -> bool:
+     lo = s.lower()
+     return any(term in lo for term in BUZZY)
+
+ def _has_junk(s: str) -> bool:
+     return bool(URL_RE.search(s) or EMAIL_RE.search(s) or PHONE_RE.search(s))
+
+ def _ascii_only(s: str) -> bool:
+     try:
+         s.encode("ascii"); return True
+     except Exception:
+         return False
+
+ def _dupe_key(s: str) -> str:
+     s = s.lower()
+     s = re.sub(r"[^\w\s]+", " ", s)
+     s = re.sub(r"\s+", " ", s).strip()
+     return s
+
+ def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+     if "tagline" not in df.columns:
+         raise ValueError("Input must contain a 'tagline' column.")
+     df = df.copy()
+     if "description" not in df.columns:
+         df["description"] = df["tagline"]
+
+     df["tagline"] = df["tagline"].map(_clean_text)
+     df["description"] = df["description"].map(_clean_text)
+
+     df = df[(df["tagline"].str.len() > 0)]
+     mask_junk = df["tagline"].map(_has_junk) | df["description"].map(_has_junk)
+     df = df[~mask_junk]
+
+     if KEEP_ASCII_ONLY:
+         df = df[df["tagline"].map(_ascii_only)]
+
+     df = df[df["tagline"].map(_alpha_ratio) >= MIN_ALPHA_RATIO]
+     df = df[df["tagline"].str.len().between(MIN_LEN, MAX_LEN)]
+
+     if DROP_IF_ALL_CAPS:
+         df = df[~df["tagline"].map(_looks_shouty)]
+
+     df = df[~df["tagline"].map(_contains_buzzy)]
+
+     key = df["tagline"].map(_dupe_key)
+     df = df.loc[~key.duplicated()].reset_index(drop=True)
+
+     df.loc[df["description"].str.len() == 0, "description"] = df["tagline"]
+     return df
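A quick smoke test for `clean_dataframe` (the two rows below are invented examples, not entries from data/slogan.csv):

```python
# Hypothetical smoke test for logic/cleaning.py; the rows are invented examples.
import pandas as pd
from logic.cleaning import clean_dataframe

raw = pd.DataFrame({
    "tagline": [
        "Best-in-class synergy platform!!!",           # buzzword terms -> dropped
        "Crowdsourced video translation made simple",  # passes the length/alpha/buzzword filters
    ],
    "description": [
        "Visit www.example.com now",                   # URL counts as junk -> dropped
        "Video translation with voice and subtitles",
    ],
})
print(clean_dataframe(raw))  # only the second row survives
```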
logic/search.py ADDED
@@ -0,0 +1,45 @@
+ \
+ import json, os
+ import numpy as np, pandas as pd
+ import faiss
+ from sentence_transformers import SentenceTransformer, CrossEncoder
+
+ class SloganSearcher:
+     def __init__(self, assets_dir="assets", use_rerank=False, rerank_model="cross-encoder/stsb-roberta-base"):
+         meta_path = os.path.join(assets_dir, "meta.json")
+         if not os.path.exists(meta_path):
+             raise FileNotFoundError(f"Missing {meta_path}. Build assets first.")
+         with open(meta_path, "r") as f:
+             self.meta = json.load(f)
+
+         self.df = pd.read_parquet(os.path.join(assets_dir, "slogans_clean.parquet"))
+         self.index = faiss.read_index(os.path.join(assets_dir, "faiss.index"))
+         self.encoder = SentenceTransformer(self.meta["model_name"])
+
+         self.use_rerank = use_rerank
+         self.reranker = CrossEncoder(rerank_model) if use_rerank else None
+
+         self.text_col = self.meta.get("text_col", "description")
+         self.fallback_col = self.meta.get("fallback_col", "tagline")
+         self.norm = bool(self.meta.get("normalized", True))
+
+     def search(self, query: str, top_k=5, rerank_top_n=20):
+         if not isinstance(query, str) or len(query.strip()) == 0:
+             return pd.DataFrame(columns=["display", "score"] + (["rerank_score"] if self.use_rerank else []))
+         q = self.encoder.encode([query], convert_to_numpy=True, normalize_embeddings=self.norm)
+         sims, idxs = self.index.search(q, max(int(top_k), int(rerank_top_n) if self.use_rerank else int(top_k)))
+         idxs = idxs[0].tolist()
+         sims = sims[0].tolist()
+         results = self.df.iloc[idxs].copy()
+         results["score"] = sims
+         if self.use_rerank:
+             texts = results[self.text_col].fillna(results[self.fallback_col]).astype(str).tolist()
+             pairs = [[query, t] for t in texts]
+             rr = self.reranker.predict(pairs)
+             results["rerank_score"] = rr
+             results = results.sort_values("rerank_score", ascending=False).head(int(top_k))
+         else:
+             results = results.head(int(top_k))
+         results["display"] = results[self.fallback_col]
+         cols = ["display", "score"] + (["rerank_score"] if self.use_rerank else [])
+         return results[cols]
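A usage sketch for the new searcher, assuming the assets/ directory has already been built by `_ensure_assets()` in app.py (the query string is just an example):

```python
# Hypothetical usage of SloganSearcher once assets/ exists; the query is an example.
from logic.search import SloganSearcher

searcher = SloganSearcher(assets_dir="assets", use_rerank=False)
results = searcher.search("AI coach for improving public speaking skills", top_k=3)
for rank, (text, score) in enumerate(zip(results["display"], results["score"]), start=1):
    print(f"{rank}. {text} (cosine similarity {score:.3f})")
```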
requirements.txt CHANGED
@@ -1,8 +1,9 @@
- gradio
- transformers
- sentence-transformers
- faiss-cpu
- pandas
- numpy
+ gradio==5.43.1
+ huggingface_hub>=0.23.0
+ sentence-transformers>=2.6.0
+ faiss-cpu>=1.8.0
+ pandas>=2.1.0
+ numpy>=1.26.0
+ pyarrow>=14.0.1
  torch
- pyarrow
+ transformers>=4.40.0
runtime.txt ADDED
@@ -0,0 +1 @@
+ python-3.10