Spaces:

asaf1602
/

sloganAI

Sleeping

App Files Community

asaf1602 committed 1 day ago

Commit

06a5663

verified ·

1 Parent(s): c17e99d

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

app.py +269 -145
requirements.txt +8 -6

app.py CHANGED Viewed

@@ -1,153 +1,277 @@
 import gradio as gr
-import pandas as pd
-import numpy as np
-import faiss, re, torch
-from sentence_transformers import SentenceTransformer, CrossEncoder
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-# ------------------ Models ------------------
-GEN_TOK   = AutoTokenizer.from_pretrained("google/flan-t5-large")
-GEN_MODEL = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-GEN_MODEL = GEN_MODEL.to(DEVICE)
-EMBED_MODEL = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
-RERANKER    = CrossEncoder("cross-encoder/stsb-roberta-base")
-# ------------------ Dummy dataset (for demo) ------------------
-data = pd.DataFrame({
-    "name": ["HowDidIDo", "Museotainment", "Movitr"],
-    "tagline": ["Online evaluation platform", "PacMan & Louvre meet", "Crowdsourced video translation"],
-    "description": [
-        "Public speaking, Presentation skills and interview practice",
-        "Interactive AR museum tours",
-        "Video translation with voice and subtitles"
-    ]
-})
-# Build FAISS index
-data_vecs = EMBED_MODEL.encode(data["description"].tolist())
-faiss.normalize_L2(data_vecs)
-index = faiss.IndexFlatIP(data_vecs.shape[1])
-index.add(data_vecs)
-def recommend(query, top_k=3):
-    query_vec = EMBED_MODEL.encode([query])
-    faiss.normalize_L2(query_vec)
-    scores, idx = index.search(query_vec, top_k)
-    results = data.iloc[idx[0]].copy()
-    results["score"] = scores[0]
-    return results[["name", "tagline", "description", "score"]]
-# ------------------ Helpers ------------------
-BLOCK_PATTERNS = [
-    r"^[A-Z][a-z]+ [A-Z][a-z]+ (Platform|Solution|System|Application|Marketplace)$",
-    r"^[A-Z][a-z]+ [A-Z][a-z]+$",
-    r"^[A-Z][a-z]+$",
-]
-HARD_BLOCK_WORDS = {"platform","solution","system","application","marketplace",
-    "ai-powered","ai powered","empower","empowering",
-    "artificial intelligence","machine learning","augmented reality","virtual reality"}
-GENERIC_WORDS = {"app","assistant","smart","ai","ml","ar","vr","decentralized","blockchain"}
-MARKETING_VERBS = {"build","grow","simplify","discover","create","connect","transform","unlock","boost","learn"}
-BENEFIT_WORDS   = {"faster","smarter","easier","better","safer","clearer"}
-def _clean_slogan(text: str, max_words: int = 8) -> str:
-    text = text.strip().split("\n")[0]
-    text = re.sub(r"[\"“”‘’]", "", text)
-    text = re.sub(r"\s+", " ", text).strip()
-    words = text.split()
-    if len(words) > max_words:
-        text = " ".join(words[:max_words])
-    return text
-def _is_blocked_slogan(s: str) -> bool:
-    s_low = s.lower()
-    if any(w in s_low for w in HARD_BLOCK_WORDS):
-        return True
-    for pat in BLOCK_PATTERNS:
-        if re.match(pat, s.strip()):
-            return True
-    return False
-def _score_candidates(query: str, cands: list) -> list:
-    if not cands:
-        return []
-    ce_scores = np.asarray(RERANKER.predict([(query, s) for s in cands]), dtype=np.float32) / 5.0
-    results = []
-    for i, s in enumerate(cands):
-        words = s.split()
-        brevity = 1.0 - min(1.0, abs(len(words) - 5) / 5.0)
-        marketing = 0.2*len(set(words) & MARKETING_VERBS) + 0.2*len(set(words) & BENEFIT_WORDS)
-        score = 0.6*float(ce_scores[i]) + 0.2*brevity + 0.2*marketing
-        results.append((s, float(score)))
-    return results
-# ------------------ Generator ------------------
-def generate_slogan(query_text: str, n_samples: int = 16) -> str:
-    prompt = (
-        "You are a creative brand copywriter. Write short, original, memorable startup slogans (max 8 words).\n"
-        "Forbidden words: app, assistant, platform, solution, system, marketplace, AI, machine learning, augmented reality, virtual reality, decentralized, empower.\n"
-        "Focus on benefits and vivid verbs. Do not copy the description.\n\n"
-        f"Description: {query_text}\nSlogans:"
     )
-    input_ids = GEN_TOK(prompt, return_tensors="pt").input_ids.to(DEVICE)
-    outputs = GEN_MODEL.generate(
-        input_ids,
-        max_new_tokens=24,
         do_sample=True,
-        top_k=60,
-        top_p=0.92,
-        temperature=1.2,
-        num_return_sequences=n_samples
     )
-    raw_cands = [GEN_TOK.decode(o, skip_special_tokens=True) for o in outputs]
-    cand_set = set()
-    for txt in raw_cands:
-        for line in txt.split("\n"):
-            s = _clean_slogan(line)
-            if not s: continue
-            if len(s.split()) < 2 or len(s.split()) > 8: continue
-            if _is_blocked_slogan(s): continue
-            cand_set.add(s.capitalize())
-    if not cand_set:
-        return "Fresh Ideas, Built To Scale"
-    scored = _score_candidates(query_text, sorted(cand_set))
-    scored.sort(key=lambda x: x[1], reverse=True)
-    return scored[0][0] if scored else "Fresh Ideas, Built To Scale"
-# ------------------ Pipeline ------------------
-def pipeline(user_input):
-    recs = recommend(user_input, top_k=3)
-    slogan = generate_slogan(user_input)
-    recs = recs.reset_index(drop=True)
-    recs.loc[len(recs)] = ["Generated Slogan", slogan, user_input, np.nan]
-    return recs
-# ------------------ Gradio UI ------------------
-examples = [
-    "AI coach for improving public speaking skills",
-    "Augmented reality app for interactive museum tours",
-    "Voice-controlled task manager for remote teams",
-    "Machine learning system for predicting crop yields",
-    "Platform for AI-assisted interior design suggestions"
-]
-demo = gr.Interface(
-    fn=pipeline,
-    inputs=gr.Textbox(label="Enter a startup description"),
-    outputs=gr.Dataframe(headers=["Name", "Tagline", "Description", "Score"]),
-    examples=examples,
-    title="SloganAI – Startup Recommendation & Slogan Generator",
-    description="Enter a startup idea and get top-3 similar startups + 1 generated slogan."
-)
-if __name__ == "__main__":
-    demo.launch()

+\
+import os, json, numpy as np, pandas as pd
 import gradio as gr
+import faiss
+import re
+from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from logic.cleaning import clean_dataframe
+from logic.search import SloganSearcher
+# -------------------- Config --------------------
+ASSETS_DIR   = "assets"
+DATA_PATH    = "data/slogan.csv"
+PROMPT_PATH  = "data/prompt.txt"
+MODEL_NAME   = "sentence-transformers/all-MiniLM-L6-v2"
+NORMALIZE    = True
+GEN_MODEL    = "google/flan-t5-base"
+NUM_GEN_CANDIDATES = 12
+MAX_NEW_TOKENS     = 18
+TEMPERATURE        = 0.7
+TOP_P              = 0.9
+REPETITION_PENALTY = 1.15
+# choose the most relevant yet non-duplicate candidate
+RELEVANCE_WEIGHT   = 0.7
+NOVELTY_WEIGHT     = 0.3
+DUPLICATE_MAX_SIM  = 0.92
+NOVELTY_SIM_THRESHOLD = 0.80  # keep some distance from retrieved
+META_PATH    = os.path.join(ASSETS_DIR, "meta.json")
+PARQUET_PATH = os.path.join(ASSETS_DIR, "slogans_clean.parquet")
+INDEX_PATH   = os.path.join(ASSETS_DIR, "faiss.index")
+EMB_PATH     = os.path.join(ASSETS_DIR, "embeddings.npy")
+def _log(m): print(f"[SLOGAN-SPACE] {m}", flush=True)
+# -------------------- Asset build --------------------
+def _build_assets():
+    if not os.path.exists(DATA_PATH):
+        raise FileNotFoundError(f"Dataset not found at {DATA_PATH} (CSV with columns: 'tagline', 'description').")
+    os.makedirs(ASSETS_DIR, exist_ok=True)
+    _log(f"Loading dataset: {DATA_PATH}")
+    df = pd.read_csv(DATA_PATH)
+    _log(f"Rows before cleaning: {len(df)}")
+    df = clean_dataframe(df)
+    _log(f"Rows after cleaning: {len(df)}")
+    if "description" in df.columns and df["description"].notna().any():
+        texts = df["description"].fillna(df["tagline"]).astype(str).tolist()
+        text_col, fallback_col = "description", "tagline"
+    else:
+        texts = df["tagline"].astype(str).tolist()
+        text_col, fallback_col = "tagline", "tagline"
+    _log(f"Encoding with {MODEL_NAME} (normalize={NORMALIZE}) …")
+    encoder = SentenceTransformer(MODEL_NAME)
+    emb = encoder.encode(texts, batch_size=64, convert_to_numpy=True, normalize_embeddings=NORMALIZE)
+    dim = emb.shape[1]
+    index = faiss.IndexFlatIP(dim) if NORMALIZE else faiss.IndexFlatL2(dim)
+    index.add(emb)
+    _log("Persisting assets …")
+    df.to_parquet(PARQUET_PATH, index=False)
+    faiss.write_index(index, INDEX_PATH)
+    np.save(EMB_PATH, emb)
+    meta = {
+        "model_name": MODEL_NAME,
+        "dim": int(dim),
+        "normalized": NORMALIZE,
+        "metric": "ip" if NORMALIZE else "l2",
+        "row_count": int(len(df)),
+        "text_col": text_col,
+        "fallback_col": fallback_col,
+    }
+    with open(META_PATH, "w") as f:
+        json.dump(meta, f, indent=2)
+    _log("Assets built successfully.")
+def _ensure_assets():
+    need = False
+    for p in (META_PATH, PARQUET_PATH, INDEX_PATH):
+        if not os.path.exists(p):
+            _log(f"Missing asset: {p}")
+            need = True
+    if need:
+        _log("Building assets from scratch …")
+        _build_assets()
+        return
+    try:
+        pd.read_parquet(PARQUET_PATH)
+    except Exception as e:
+        _log(f"Parquet read failed ({e}); rebuilding assets.")
+        _build_assets()
+# Build before UI
+_ensure_assets()
+# -------------------- Retrieval --------------------
+searcher = SloganSearcher(assets_dir=ASSETS_DIR, use_rerank=False)
+meta     = json.load(open(META_PATH))
+_encoder = SentenceTransformer(meta["model_name"])
+# -------------------- Generator --------------------
+_gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
+_gen_model     = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL)
+# keep this list small so we don't nuke relevant outputs
+_BANNED_TERMS = {"portal", "e-commerce", "ecommerce", "shopping", "shop"}
+_PUNCT_CHARS = ":;—–-,.!?“”\"'`"
+_PUNCT_RE = re.compile(f"[{re.escape(_PUNCT_CHARS)}]")
+_MIN_WORDS, _MAX_WORDS = 2, 8
+def _load_prompt():
+    if os.path.exists(PROMPT_PATH):
+        with open(PROMPT_PATH, "r", encoding="utf-8") as f:
+            return f.read()
+    return (
+        "You are a professional slogan writer.\n"
+        "Write ONE original startup slogan under 8 words, Title Case, no punctuation.\n"
+        "Do not copy examples.\n"
+        "Description:\n{description}\nSlogan:"
     )
+def _render_prompt(description: str, retrieved=None) -> str:
+    tmpl = _load_prompt()
+    if "{description}" in tmpl:
+        prompt = tmpl.replace("{description}", description)
+    else:
+        prompt = f"{tmpl}\n\nDescription:\n{description}\nSlogan:"
+    if retrieved:
+        prompt += "\n\nDo NOT copy these existing slogans:\n"
+        for s in retrieved[:3]:
+            prompt += f"- {s}\n"
+    return prompt
+def _title_case(s: str) -> str:
+    small = {"and","or","for","of","the","to","in","on","with","a","an"}
+    words = [w for w in s.split() if w]
+    out = []
+    for i,w in enumerate(words):
+        lw = w.lower()
+        if i>0 and lw in small: out.append(lw)
+        else: out.append(lw.capitalize())
+    return " ".join(out)
+def _strip_punct(s: str) -> str:
+    return _PUNCT_RE.sub("", s)
+def _strict_ok(s: str) -> bool:
+    if not s: return False
+    wc = len(s.split())
+    if wc < _MIN_WORDS or wc > _MAX_WORDS: return False
+    lo = s.lower()
+    if any(term in lo for term in _BANNED_TERMS): return False
+    if lo in {"the","a","an"}: return False
+    return True
+def _postprocess_strict(texts):
+    cleaned, seen = [], set()
+    for t in texts:
+        s = t.replace("Slogan:", "").strip().strip('"').strip("'")
+        s = " ".join(s.split())
+        s = _strip_punct(s)          # remove punctuation instead of rejecting
+        s = _title_case(s)
+        if _strict_ok(s):
+            k = s.lower()
+            if k not in seen:
+                seen.add(k); cleaned.append(s)
+    return cleaned
+def _postprocess_relaxed(texts):
+    # fallback if strict returns nothing: keep 2–8 words, strip punctuation, Title Case
+    cleaned, seen = [], set()
+    for t in texts:
+        s = t.strip().strip('"').strip("'")
+        s = _strip_punct(s)
+        s = " ".join(s.split())
+        wc = len(s.split())
+        if _MIN_WORDS <= wc <= _MAX_WORDS:
+            s = _title_case(s)
+            k = s.lower()
+            if k not in seen:
+                seen.add(k); cleaned.append(s)
+    return cleaned
+def _generate_candidates(description: str, retrieved_texts, n: int = NUM_GEN_CANDIDATES):
+    prompt = _render_prompt(description, retrieved_texts)
+    # only block very generic junk at decode time
+    bad_ids = _gen_tokenizer(list(_BANNED_TERMS), add_special_tokens=False).input_ids
+    inputs = _gen_tokenizer([prompt], return_tensors="pt", padding=True, truncation=True)
+    outputs = _gen_model.generate(
+        **inputs,
         do_sample=True,
+        temperature=TEMPERATURE,
+        top_p=TOP_P,
+        num_return_sequences=n,
+        max_new_tokens=MAX_NEW_TOKENS,
+        no_repeat_ngram_size=3,
+        repetition_penalty=REPETITION_PENALTY,
+        bad_words_ids=bad_ids if bad_ids else None,
+        eos_token_id=_gen_tokenizer.eos_token_id,
     )
+    texts = _gen_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+    cands = _postprocess_strict(texts)
+    if not cands:
+        cands = _postprocess_relaxed(texts)  # <- graceful fallback
+    return cands
+def _pick_best(candidates, retrieved_texts, description):
+    """Weighted relevance to description minus duplication vs retrieved."""
+    if not candidates:
+        return None
+    c_emb = _encoder.encode(candidates, convert_to_numpy=True, normalize_embeddings=True)
+    d_emb = _encoder.encode([description], convert_to_numpy=True, normalize_embeddings=True)[0]
+    rel = c_emb @ d_emb  # cosine sim to description
+    if retrieved_texts:
+        R = _encoder.encode(retrieved_texts, convert_to_numpy=True, normalize_embeddings=True)
+        dup = np.max(R @ c_emb.T, axis=0)  # max sim to any retrieved
+    else:
+        dup = np.zeros(len(candidates), dtype=np.float32)
+    # penalize near-duplicates outright
+    mask = dup < DUPLICATE_MAX_SIM
+    if mask.any():
+        scores = RELEVANCE_WEIGHT * rel[mask] - NOVELTY_WEIGHT * dup[mask]
+        best_idx = np.argmax(scores)
+        return [c for i, c in enumerate(candidates) if mask[i]][best_idx]
+    # else: pick most relevant that still clears a basic novelty bar, else top score
+    scores = RELEVANCE_WEIGHT * rel - NOVELTY_WEIGHT * dup
+    order = np.argsort(-scores)
+    for i in order:
+        if dup[i] < NOVELTY_SIM_THRESHOLD:
+            return candidates[i]
+    return candidates[order[0]]
+# -------------------- Inference pipeline --------------------
+def run_pipeline(user_description: str):
+    if not user_description or not user_description.strip():
+        return "Please enter a description."
+    retrieved_df = searcher.search(user_description, top_k=3, rerank_top_n=10)
+    retrieved_texts = retrieved_df["display"].tolist() if not retrieved_df.empty else []
+    gens = _generate_candidates(user_description, retrieved_texts, NUM_GEN_CANDIDATES)
+    chosen = _pick_best(gens, retrieved_texts, user_description) or (gens[0] if gens else "—")
+    lines = []
+    lines.append("### 🔎 Top 3 similar slogans")
+    if retrieved_texts:
+        for i, s in enumerate(retrieved_texts, 1):
+            lines.append(f"{i}. {s}")
+    else:
+        lines.append("No similar slogans found.")
+    lines.append("\n### ✨ AI-generated suggestion")
+    lines.append(chosen)
+    return "\n".join(lines)
+# -------------------- UI --------------------
+with gr.Blocks(title="Slogan Finder") as demo:
+    gr.Markdown("# 🔎 Slogan Finder\nDescribe your product/company; get 3 similar slogans + 1 AI-generated suggestion.")
+    query = gr.Textbox(label="Describe your product/company", placeholder="AI-powered patient financial navigation platform...")
+    btn = gr.Button("Get slogans", variant="primary")
+    out = gr.Markdown()
+    btn.click(run_pipeline, inputs=[query], outputs=out)
+demo.queue(max_size=64).launch()

requirements.txt CHANGED Viewed

@@ -1,7 +1,9 @@
-gradio
-transformers
-sentence-transformers
-faiss-cpu
-pandas
-numpy
 torch

+gradio==5.43.1
+huggingface_hub>=0.23.0
+sentence-transformers>=2.6.0
+faiss-cpu>=1.8.0
+pandas>=2.1.0
+numpy>=1.26.0
+pyarrow>=14.0.1
 torch
+transformers>=4.40.0