Spaces:

dejanseo
/

linkbert

Sleeping

App Files Community

dejanseo committed 8 days ago

Commit

a92f9e3

verified ·

1 Parent(s): df3962f

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +39 -76

src/streamlit_app.py CHANGED Viewed

@@ -8,28 +8,21 @@ import trafilatura
 # Streamlit config
 st.set_page_config(layout="wide", page_title="LinkBERT")
-# Load tokenizer & model
 MODEL_ID = "dejanseo/LinkBERT-XL"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
-# Determine the device
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load the model directly to the determined device
-# Avoid device_map="auto" if it's causing meta tensor issues with certain torch/transformers versions.
-# Load to CPU first, then move to GPU if available.
-model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
-# Explicitly move model to the determined device and dtype
-if device == "cuda":
-    model.half().to(device) # Use .half() for float16 on GPU
-else:
-    model.to(device) # For CPU, typically stick to float32 unless model was specifically trained with bfloat16 for CPU
 model.eval()
-# Functions (rest of your functions remain mostly the same)
 def tokenize_with_indices(text: str):
     encoded = tokenizer.encode_plus(
         text,
@@ -48,14 +41,15 @@ def fetch_and_extract_content(url: str):
     return None
 def process_text(inputs: str, confidence_threshold: float):
-    max_chunk_length = 512 - 2  # safe window for special tokens
     words = inputs.split()
     chunk_texts = []
     current_chunk, current_length = [], 0
     for word in words:
         tok_len = len(tokenizer.tokenize(word))
         if tok_len + current_length > max_chunk_length:
-            chunk_texts.append(" ".join(current_chunk))
             current_chunk = [word]
             current_length = tok_len
         else:
@@ -71,85 +65,61 @@ def process_text(inputs: str, confidence_threshold: float):
     with torch.no_grad():
         for chunk in chunk_texts:
             input_ids, token_offsets = tokenize_with_indices(chunk)
-            # Ensure input_ids_tensor is on the same device as the model
-            input_ids_tensor = torch.tensor(input_ids).unsqueeze(0).to(model.device)
             outputs = model(input_ids_tensor)
             logits = outputs.logits  # [1, seq_len, num_labels]
             predictions = torch.argmax(logits, dim=-1).squeeze(0).tolist()
             softmax_scores = F.softmax(logits, dim=-1).squeeze(0).tolist()
-            # The rest of your processing logic
             word_info = {}
             for idx, (start, end) in enumerate(token_offsets):
                 if idx == 0 or idx == len(token_offsets) - 1:
-                    continue  # skip specials
                 word_start = start
-                # Find the actual start of the word corresponding to this token
-                # This logic assumes space-separated words for the most part
-                while word_start > 0 and chunk[word_start - 1] not in [' ', '\n', '\t']:
                     word_start -= 1
-                # If a word_start maps to multiple tokens (e.g., "don't" -> ["don", "'", "t"])
-                # ensure we pick the earliest start for that conceptual word
-                while word_start > 0 and (chunk[word_start-1:word_start] == ' ' or tokenizer.decode(tokenizer.encode(chunk[word_start-1:end], add_special_tokens=False))[0] == chunk[word_start-1]):
-                     word_start -= 1
-                # Use a tuple (word_start, actual_word_text_from_chunk) as key for more robust aggregation
-                # For simplicity here, we stick to word_start
                 if word_start not in word_info:
-                    # Initialize with default for "not link"
                     word_info[word_start] = {"prediction": 0, "confidence": 0.0, "subtokens": []}
                 conf_pct = softmax_scores[idx][predictions[idx]] * 100.0
-                # Only mark as 1 if the current token's prediction is 1 AND confidence meets threshold
                 if predictions[idx] == 1 and conf_pct >= confidence_threshold:
-                    word_info[word_start]["prediction"] = 1 # Mark the whole 'word' as a link
-                # Keep the max confidence for any token within the 'word'
                 word_info[word_start]["confidence"] = max(word_info[word_start]["confidence"], conf_pct)
                 word_info[word_start]["subtokens"].append((start, end, chunk[start:end]))
             last_end = 0
-            # Sort by word_start to maintain order
             for word_start in sorted(word_info.keys()):
                 word_data = word_info[word_start]
-                # Sort subtokens to ensure they are processed in order within a word
-                for subtoken_start, subtoken_end, subtoken_text in sorted(word_data["subtokens"], key=lambda x: x[0]):
                     escaped = subtoken_text.replace("$", "\\$")
-                    # Add any text between the last processed token and the current one
                     if last_end < subtoken_start:
                         reconstructed_text += chunk[last_end:subtoken_start]
                     if word_data["prediction"] == 1:
-                        # Apply highlight to the subtoken
                         reconstructed_text += (
-                            f"<span style='background-color: rgba(0, 255, 0, 0.5); display: inline;'>{escaped}</span>" # Added alpha for better readability
                         )
                     else:
-                        reconstructed_text += escaped # No highlight
                     last_end = subtoken_end
-                    # For DataFrame, append the info for each *subtoken*
                     df_data["Word"].append(escaped)
-                    df_data["Prediction"].append(word_data["prediction"]) # Prediction applies to the whole conceptual word
-                    df_data["Confidence"].append(word_data["confidence"]) # Confidence applies to the whole conceptual word
                     df_data["Start"].append(subtoken_start + original_position_offset)
                     df_data["End"].append(subtoken_end + original_position_offset)
-            # Add any remaining text from the current chunk after the last token
-            if last_end < len(chunk):
-                reconstructed_text += chunk[last_end:].replace("$", "\\$")
-            # Update offset for the next chunk. Add 1 for the space that was implicitly there.
-            original_position_offset += len(chunk) + 1
     df_tokens = pd.DataFrame(df_data)
     return reconstructed_text, df_tokens
-# UI (remains the same)
 st.title("LinkBERT")
 st.markdown("""
 LinkBERT predicts natural link placement within web content. Enter text or a URL for extraction. Increase the threshold to reduce link predictions.
@@ -160,29 +130,22 @@ confidence_threshold = st.slider("Confidence Threshold", 50, 100, 50)
 tab1, tab2 = st.tabs(["Text Input", "URL Input"])
 with tab1:
-    user_input = st.text_area("Enter text to process:", height=200) # Added height for better UX
     if st.button("Process Text"):
-        if user_input: # Ensure input is not empty
-            highlighted_text, df_tokens = process_text(user_input, confidence_threshold)
-            st.markdown(highlighted_text, unsafe_allow_html=True)
-            st.dataframe(df_tokens)
-        else:
-            st.warning("Please enter some text to process.")
 with tab2:
-    url_input = st.text_input("Enter URL to process:", value="https://dejan.ai/blog/gpt-5-made-seo-irreplaceable/") # Pre-fill with example
     if st.button("Fetch and Process"):
-        if url_input: # Ensure URL input is not empty
-            with st.spinner("Fetching and processing content..."):
-                content = fetch_and_extract_content(url_input)
-                if content:
-                    highlighted_text, df_tokens = process_text(content, confidence_threshold)
-                    st.markdown(highlighted_text, unsafe_allow_html=True)
-                    st.dataframe(df_tokens)
-                else:
-                    st.error("Could not fetch content from the URL. Please check the URL and try again.")
         else:
-            st.warning("Please enter a URL to process.")
 st.divider()
 st.markdown("""
@@ -202,4 +165,4 @@ LinkBERT was fine-tuned on a dataset of organic web content and editorial links.
 Interested in using this in an automated pipeline for bulk link prediction?
 Please [book an appointment](https://dejanmarketing.com/conference/).
-""")

 # Streamlit config
 st.set_page_config(layout="wide", page_title="LinkBERT")
+# Model setup (load fully to avoid meta tensors)
 MODEL_ID = "dejanseo/LinkBERT-XL"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+# Force materialized tensors on CPU, then move — avoids meta tensors
+model = AutoModelForTokenClassification.from_pretrained(
+    MODEL_ID,
+    low_cpu_mem_usage=False,   # important: materialize weights
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+)
+model.to(device)
 model.eval()
+# Functions
 def tokenize_with_indices(text: str):
     encoded = tokenizer.encode_plus(
         text,
     return None
 def process_text(inputs: str, confidence_threshold: float):
+    max_chunk_length = 512 - 2  # leave room for specials
     words = inputs.split()
     chunk_texts = []
     current_chunk, current_length = [], 0
     for word in words:
         tok_len = len(tokenizer.tokenize(word))
         if tok_len + current_length > max_chunk_length:
+            if current_chunk:
+                chunk_texts.append(" ".join(current_chunk))
             current_chunk = [word]
             current_length = tok_len
         else:
     with torch.no_grad():
         for chunk in chunk_texts:
             input_ids, token_offsets = tokenize_with_indices(chunk)
+            # Build tensors on correct device; no meta usage
+            input_ids_tensor = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)
             outputs = model(input_ids_tensor)
             logits = outputs.logits  # [1, seq_len, num_labels]
             predictions = torch.argmax(logits, dim=-1).squeeze(0).tolist()
             softmax_scores = F.softmax(logits, dim=-1).squeeze(0).tolist()
             word_info = {}
             for idx, (start, end) in enumerate(token_offsets):
                 if idx == 0 or idx == len(token_offsets) - 1:
+                    continue  # skip special tokens
                 word_start = start
+                while word_start > 0 and chunk[word_start - 1] != ' ':
                     word_start -= 1
                 if word_start not in word_info:
                     word_info[word_start] = {"prediction": 0, "confidence": 0.0, "subtokens": []}
                 conf_pct = softmax_scores[idx][predictions[idx]] * 100.0
                 if predictions[idx] == 1 and conf_pct >= confidence_threshold:
+                    word_info[word_start]["prediction"] = 1
                 word_info[word_start]["confidence"] = max(word_info[word_start]["confidence"], conf_pct)
                 word_info[word_start]["subtokens"].append((start, end, chunk[start:end]))
             last_end = 0
             for word_start in sorted(word_info.keys()):
                 word_data = word_info[word_start]
+                for subtoken_start, subtoken_end, subtoken_text in word_data["subtokens"]:
                     escaped = subtoken_text.replace("$", "\\$")
                     if last_end < subtoken_start:
                         reconstructed_text += chunk[last_end:subtoken_start]
                     if word_data["prediction"] == 1:
                         reconstructed_text += (
+                            f"<span style='background-color: rgba(0, 255, 0); display: inline;'>{escaped}</span>"
                         )
                     else:
+                        reconstructed_text += escaped
                     last_end = subtoken_end
                     df_data["Word"].append(escaped)
+                    df_data["Prediction"].append(word_data["prediction"])
+                    df_data["Confidence"].append(word_info[word_start]["confidence"])
                     df_data["Start"].append(subtoken_start + original_position_offset)
                     df_data["End"].append(subtoken_end + original_position_offset)
+                original_position_offset += len(chunk) + 1
+            reconstructed_text += chunk[last_end:].replace("$", "\\$")
     df_tokens = pd.DataFrame(df_data)
     return reconstructed_text, df_tokens
+# UI
 st.title("LinkBERT")
 st.markdown("""
 LinkBERT predicts natural link placement within web content. Enter text or a URL for extraction. Increase the threshold to reduce link predictions.
 tab1, tab2 = st.tabs(["Text Input", "URL Input"])
 with tab1:
+    user_input = st.text_area("Enter text to process:")
     if st.button("Process Text"):
+        highlighted_text, df_tokens = process_text(user_input, confidence_threshold)
+        st.markdown(highlighted_text, unsafe_allow_html=True)
+        st.dataframe(df_tokens)
 with tab2:
+    url_input = st.text_input("Enter URL to process:")
     if st.button("Fetch and Process"):
+        content = fetch_and_extract_content(url_input)
+        if content:
+            highlighted_text, df_tokens = process_text(content, confidence_threshold)
+            st.markdown(highlighted_text, unsafe_allow_html=True)
+            st.dataframe(df_tokens)
         else:
+            st.error("Could not fetch content from the URL. Please check the URL and try again.")
 st.divider()
 st.markdown("""
 Interested in using this in an automated pipeline for bulk link prediction?
 Please [book an appointment](https://dejanmarketing.com/conference/).
+""")