Update src/streamlit_app.py

src/streamlit_app.py (+95 -22)
@@ -26,11 +26,11 @@ GENERATION_CONFIG: Dict[str, Any] = {
 # ------------------ MODEL LOADING (CPU/GPU AUTO) ------------------
 @st.cache_resource
 def load_model() -> Tuple[MT5Tokenizer, MT5ForConditionalGeneration, torch.device]:
-    # Avoid CUDA initialization if no driver; select device explicitly.
     device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
     tok = MT5Tokenizer.from_pretrained(MODEL_PATH, cache_dir=CACHE_DIR)
     model = MT5ForConditionalGeneration.from_pretrained(MODEL_PATH, cache_dir=CACHE_DIR)
     model.to(device)
+    model.eval()
     return tok, model, device
 
 # ------------------ GENERATION HELPERS ------------------
@@ -61,18 +61,68 @@ def avg_logprobs_from_generate(tok: MT5Tokenizer, gen) -> List[float]:
     count = torch.where(count.eq(0), torch.ones_like(count), count)
     return [(lp / c).item() for lp, c in zip(sum_logp, count)]
 
-# --- UPDATED sampling_generate function ---
+# --- UPDATED sampling_generate function (Deep Analysis) ---
 def sampling_generate(tok, model, device, inputs, top_n, temperature, top_p, no_repeat_ngram_size, repetition_penalty, bad_words_ids: List[List[int]] = None):
-
-
-
-
-
+    kwargs = dict(
+        max_length=MAX_TARGET_LENGTH,
+        do_sample=True,
+        temperature=temperature,
+        top_p=top_p,
+        num_return_sequences=top_n,
+        return_dict_in_generate=True,
+        output_scores=True
+    )
+    if int(no_repeat_ngram_size) > 0:
+        kwargs["no_repeat_ngram_size"] = int(no_repeat_ngram_size)
+    if float(repetition_penalty) != 1.0:
+        kwargs["repetition_penalty"] = float(repetition_penalty)
+    if bad_words_ids:
+        kwargs["bad_words_ids"] = bad_words_ids
 
     gen = model.generate(**inputs, **kwargs)
     return decode_sequences(tok, gen.sequences), avg_logprobs_from_generate(tok, gen)
 
-def normalize_text(s: str) -> str:
+def normalize_text(s: str) -> str:
+    return " ".join(s.strip().lower().split())
+
+# --- Beam-based quick function (from old script) ---
+def generate_expansions_beam(url: str, query: str, tok: MT5Tokenizer, model: MT5ForConditionalGeneration, device: torch.device, num_return_sequences: int = 10) -> List[str]:
+    input_text = f"For URL: {url} diversify query: {query}"
+    inputs = tok(input_text, max_length=MAX_INPUT_LENGTH, truncation=True, return_tensors="pt")
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_length=MAX_TARGET_LENGTH,
+            num_return_sequences=num_return_sequences,
+            num_beams=num_return_sequences * 2,
+            num_beam_groups=num_return_sequences,
+            diversity_penalty=0.5,
+            temperature=0.8,
+            do_sample=False,
+            early_stopping=True,
+            pad_token_id=tok.pad_token_id,
+            eos_token_id=tok.eos_token_id,
+            forced_eos_token_id=tok.eos_token_id,
+            max_new_tokens=MAX_TARGET_LENGTH,
+        )
+
+    # Decode and simple post-filter
+    expansions: List[str] = []
+    for seq in outputs:
+        s = tok.decode(seq, skip_special_tokens=True)
+        if s and normalize_text(s) != normalize_text(query):
+            expansions.append(s)
+
+    # Deduplicate preserve order
+    seen = set()
+    uniq = []
+    for s in expansions:
+        if s not in seen:
+            seen.add(s)
+            uniq.append(s)
+    return uniq
 
 # ------------------ STREAMLIT APP ------------------
 st.set_page_config(
@@ -92,22 +142,25 @@ st.title("Query Fanout Generator")
 st.markdown("Enter a URL and a query to generate a diverse set of related queries.")
 
 col1, col2 = st.columns(2)
-
 with col1:
-    url = st.text_input("URL", value="dejan.ai")
+    url = st.text_input("URL", value="dejan.ai", help="Target URL that provides context for the query.")
 with col2:
-    query = st.text_input("Query", value="ai seo agency")
+    query = st.text_input("Query", value="ai seo agency", help="The search query you want to expand.")
 
-
+# --- Two actions side by side ---
+bcol1, bcol2 = st.columns(2)
+with bcol1:
+    deep_btn = st.button("Deep Analysis")
+with bcol2:
+    quick_btn = st.button("Quick Fan-Out")
 
-
+# ---- Deep Analysis path (sampling, large batches) ----
+if deep_btn:
     cfg = GENERATION_CONFIG
-
     with st.spinner("Generating queries..."):
         start_ts = time.time()
         inputs = build_inputs(tok, url, query, device)
 
-        # --- UPDATED BATCHING LOGIC WITH `bad_words_ids` ---
         all_texts, all_scores = [], []
         seen_texts_for_bad_words = set()
 
@@ -117,11 +170,17 @@ if run_button:
         for i in range(num_batches):
             current_seed = cfg["seed"] + i
             torch.manual_seed(current_seed)
-            if torch.cuda.is_available():
+            if torch.cuda.is_available():
+                torch.cuda.manual_seed_all(current_seed)
 
             bad_words_ids = None
             if seen_texts_for_bad_words:
-                bad_words_ids = tok(
+                bad_words_ids = tok(
+                    list(seen_texts_for_bad_words),
+                    add_special_tokens=False,
+                    padding=True,
+                    truncation=True
+                )["input_ids"]
 
             batch_texts, batch_scores = sampling_generate(
                 tok, model, device, inputs,
@@ -132,15 +191,16 @@ if run_button:
                 repetition_penalty=float(cfg["repetition_penalty"]),
                 bad_words_ids=bad_words_ids
             )
-
+
             all_texts.extend(batch_texts)
             all_scores.extend(batch_scores)
             for txt in batch_texts:
-                if txt:
-
+                if txt:
+                    seen_texts_for_bad_words.add(txt)
+
             progress_bar.progress((i + 1) / num_batches)
 
-        # Deduplicate and finalize
+        # Deduplicate and finalize
         final_enriched = []
         final_seen_normalized = set()
         for txt, sc in zip(all_texts, all_scores):
@@ -151,7 +211,7 @@ if run_button:
 
         if cfg["sort_by"] == "logp/len":
             final_enriched.sort(key=lambda x: x["logp/len"], reverse=True)
-
+
         final_enriched = final_enriched[:TOTAL_DESIRED_CANDIDATES]
 
         if not final_enriched:
@@ -161,3 +221,16 @@
         df = pd.DataFrame(output_texts, columns=["Generated Query"])
         df.index = range(1, len(df) + 1)
         st.dataframe(df, use_container_width=True)
+
+# ---- Quick Fan-Out path (beam-based, small and simple) ----
+if quick_btn:
+    with st.spinner("Generating quick fan-out..."):
+        start_time = time.time()
+        expansions = generate_expansions_beam(url, query, tok, model, device, num_return_sequences=10)
+
+        if expansions:
+            df = pd.DataFrame(expansions, columns=["Generated Query"])
+            df.index = range(1, len(df) + 1)
+            st.dataframe(df, use_container_width=True)
+        else:
+            st.warning("No valid fan-outs generated. Try a different query.")
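Notes on the change:

The Deep Analysis path's sampling_generate now assembles its generate() kwargs conditionally, so no_repeat_ngram_size, repetition_penalty, and bad_words_ids are passed only when they actually constrain decoding; everything else stays at library defaults. A minimal standalone sketch of the same sampling call, assuming the public google/mt5-small checkpoint as a stand-in for MODEL_PATH (which is not shown in this diff) and illustrative temperature/top_p values rather than the app's GENERATION_CONFIG:

import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

tok = MT5Tokenizer.from_pretrained("google/mt5-small")   # stand-in checkpoint
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
model.eval()  # as in load_model(): disables dropout, so seeded runs are repeatable

inputs = tok("For URL: dejan.ai diversify query: ai seo agency", return_tensors="pt")
torch.manual_seed(42)  # per-batch reseeding, as in the Deep Analysis loop
with torch.no_grad():
    gen = model.generate(
        **inputs,
        do_sample=True,                # nucleus sampling
        temperature=0.9,               # illustrative value
        top_p=0.95,                    # illustrative value
        num_return_sequences=4,
        max_length=64,                 # stand-in for MAX_TARGET_LENGTH
        return_dict_in_generate=True,
        output_scores=True,            # gen.scores feeds the logp/len ranking
    )
print(tok.batch_decode(gen.sequences, skip_special_tokens=True))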
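The Deep Analysis dedup keys on normalize_text, which trims, lowercases, and collapses internal whitespace, so candidates differing only in case or spacing count as one query. Copied from the diff, with a quick check:

def normalize_text(s: str) -> str:
    return " ".join(s.strip().lower().split())

assert normalize_text("  AI   SEO Agency ") == "ai seo agency"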
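One caveat on the bad_words_ids construction in the batching loop: calling the tokenizer with padding=True pads every seen string to the longest one in the batch, so pad token ids land inside the shorter banned sequences, and generated text will never contain those pad ids, leaving the padded entries unable to match. A possible refinement, not part of this commit, keeping the app's variable names:

# Hypothetical alternative to the padded batch call: per-string encoding
# keeps pad ids out of the ban list.
bad_words_ids = [
    tok(text, add_special_tokens=False).input_ids
    for text in seen_texts_for_bad_words
]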
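On the Quick Fan-Out path: generate_expansions_beam uses diverse (group) beam search, which requires do_sample=False, num_beam_groups > 1, and num_beams divisible by num_beam_groups; with num_beams = num_return_sequences * 2 and num_beam_groups = num_return_sequences, that constraint holds. Two of its arguments are effectively inert: temperature is ignored whenever do_sample=False, and when max_length and max_new_tokens are both set, recent transformers releases warn and let max_new_tokens take precedence. A trimmed equivalent, reusing tok, model, and inputs from the sampling sketch above with illustrative constants:

# Diverse beam search: groups decode in turn, and diversity_penalty pushes
# later groups away from tokens earlier groups already chose.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        num_beams=20,             # 2 x num_return_sequences
        num_beam_groups=10,       # one group per returned sequence
        num_return_sequences=10,
        diversity_penalty=0.5,
        do_sample=False,          # required for group beam search
        early_stopping=True,
        max_new_tokens=64,        # stand-in for MAX_TARGET_LENGTH
    )
print(tok.batch_decode(outputs, skip_special_tokens=True))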