# import requests
# import logging
# import duckdb
# import numpy as np
# from torch import cuda
# from gradio_huggingfacehub_search import HuggingfaceHubSearch
# from bertopic import BERTopic
# from bertopic.representation import KeyBERTInspired
# from umap import UMAP
# from hdbscan import HDBSCAN
# from sklearn.feature_extraction.text import CountVectorizer
# from sentence_transformers import SentenceTransformer
# from dotenv import load_dotenv
# import os
# import spaces
# import gradio as gr

# """
# TODOs:
# - Try for small dataset <1000 rows
# """

# load_dotenv()
# HF_TOKEN = os.getenv("HF_TOKEN")
# assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"

# logging.basicConfig(
#     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# )

# MAX_ROWS = 5_000
# CHUNK_SIZE = 1_000

# session = requests.Session()
# sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# keybert = KeyBERTInspired()
# vectorizer_model = CountVectorizer(stop_words="english")
# representation_model = KeyBERTInspired()

# global_topic_model = None


# def get_split_rows(dataset, config, split):
#     config_size = session.get(
#         f"https://datasets-server.huggingface.co/size?dataset={dataset}&config={config}",
#         timeout=20,
#     ).json()
#     if "error" in config_size:
#         raise Exception(f"Error fetching config size: {config_size['error']}")
#     split_size = next(
#         (s for s in config_size["size"]["splits"] if s["split"] == split),
#         None,
#     )
#     if split_size is None:
#         raise Exception(f"Error fetching split {split} in config {config}")
#     return split_size["num_rows"]


# def get_parquet_urls(dataset, config, split):
#     parquet_files = session.get(
#         f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}&split={split}",
#         timeout=20,
#     ).json()
#     if "error" in parquet_files:
#         raise Exception(f"Error fetching parquet files: {parquet_files['error']}")
#     parquet_urls = [file["url"] for file in parquet_files["parquet_files"]]
#     logging.debug(f"Parquet files: {parquet_urls}")
#     return ",".join(f"'{url}'" for url in parquet_urls)


# def get_docs_from_parquet(parquet_urls, column, offset, limit):
#     SQL_QUERY = f"SELECT {column} FROM read_parquet([{parquet_urls}]) LIMIT {limit} OFFSET {offset};"
#     df = duckdb.sql(SQL_QUERY).to_df()
#     logging.debug(f"Dataframe: {df.head(5)}")
#     return df[column].tolist()


# @spaces.GPU
# def calculate_embeddings(docs):
#     return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)


# def calculate_n_neighbors_and_components(n_rows):
#     n_neighbors = min(max(n_rows // 20, 15), 100)
#     n_components = 10 if n_rows > 1000 else 5  # Higher components for larger datasets
#     return n_neighbors, n_components
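# # Illustrative arithmetic for the heuristic above:
# #   n_rows=500   -> n_neighbors=min(max(25, 15), 100)=25   and n_components=5
# #   n_rows=5_000 -> n_neighbors=min(max(250, 15), 100)=100 and n_components=10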
# @spaces.GPU
# def fit_model(docs, embeddings, n_neighbors, n_components):
#     global global_topic_model

#     umap_model = UMAP(
#         n_neighbors=n_neighbors,
#         n_components=n_components,
#         min_dist=0.0,
#         metric="cosine",
#         random_state=42,
#     )

#     hdbscan_model = HDBSCAN(
#         min_cluster_size=max(
#             5, n_neighbors // 2
#         ),  # Reducing min_cluster_size for fewer outliers
#         metric="euclidean",
#         cluster_selection_method="eom",
#         prediction_data=True,
#     )

#     new_model = BERTopic(
#         language="english",
#         # Sub-models
#         embedding_model=sentence_model,
#         umap_model=umap_model,
#         hdbscan_model=hdbscan_model,
#         representation_model=representation_model,
#         vectorizer_model=vectorizer_model,
#         # Hyperparameters
#         top_n_words=10,
#         verbose=True,
#         min_topic_size=n_neighbors,  # Coherent with n_neighbors?
#     )
#     logging.info("Fitting new model")
#     new_model.fit(docs, embeddings)
#     logging.info("End fitting new model")

#     global_topic_model = new_model
#     logging.info("Global model updated")


# def generate_topics(dataset, config, split, column, nested_column):
#     logging.info(
#         f"Generating topics for {dataset} with config {config} {split} {column} {nested_column}"
#     )

#     parquet_urls = get_parquet_urls(dataset, config, split)
#     split_rows = get_split_rows(dataset, config, split)
#     logging.info(f"Split rows: {split_rows}")

#     limit = min(split_rows, MAX_ROWS)
#     n_neighbors, n_components = calculate_n_neighbors_and_components(limit)

#     reduce_umap_model = UMAP(
#         n_neighbors=n_neighbors,
#         n_components=2,  # For visualization, keeping it at 2 (2D)
#         min_dist=0.0,
#         metric="cosine",
#         random_state=42,
#     )

#     offset = 0
#     rows_processed = 0

#     base_model = None
#     all_docs = []
#     reduced_embeddings_list = []
#     topics_info, topic_plot = None, None

#     yield (
#         gr.DataFrame(value=[], interactive=False, visible=True),
#         gr.Plot(value=None, visible=True),
#         gr.Label(
#             {f"⚙️ Generating topics {dataset}": rows_processed / limit}, visible=True
#         ),
#     )

#     while offset < limit:
#         docs = get_docs_from_parquet(parquet_urls, column, offset, CHUNK_SIZE)
#         if not docs:
#             break

#         logging.info(
#             f"----> Processing chunk: {offset=} {CHUNK_SIZE=} with {len(docs)} docs"
#         )

#         embeddings = calculate_embeddings(docs)
#         fit_model(docs, embeddings, n_neighbors, n_components)

#         if base_model is None:
#             base_model = global_topic_model
#         else:
#             updated_model = BERTopic.merge_models([base_model, global_topic_model])
#             nr_new_topics = len(set(updated_model.topics_)) - len(
#                 set(base_model.topics_)
#             )
#             new_topics = list(updated_model.topic_labels_.values())[-nr_new_topics:]
#             logging.info(f"The following topics are newly found: {new_topics}")
#             base_model = updated_model

#         reduced_embeddings = reduce_umap_model.fit_transform(embeddings)
#         reduced_embeddings_list.append(reduced_embeddings)

#         all_docs.extend(docs)

#         topics_info = base_model.get_topic_info()
#         topic_plot = base_model.visualize_documents(
#             all_docs,
#             reduced_embeddings=np.vstack(reduced_embeddings_list),
#             custom_labels=True,
#         )

#         rows_processed += len(docs)
#         progress = min(rows_processed / limit, 1.0)
#         logging.info(f"Progress: {progress} % - {rows_processed} of {limit}")

#         yield (
#             topics_info,
#             topic_plot,
#             gr.Label({f"⚙️ Generating topics {dataset}": progress}, visible=True),
#         )

#         offset += CHUNK_SIZE

#     logging.info("Finished processing all data")
#     yield (
#         topics_info,
#         topic_plot,
#         gr.Label({f"✅ Generating topics {dataset}": 1.0}, visible=True),
#     )
#     cuda.empty_cache()
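# # Note on the loop above: each CHUNK_SIZE batch is fitted as its own BERTopic
# # model and then folded into the running model with BERTopic.merge_models, so
# # topics are refined incrementally and partial results can be yielded to the
# # UI after every chunk instead of waiting for all MAX_ROWS documents.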
# with gr.Blocks() as demo:
#     gr.Markdown("# 💠 Dataset Topic Discovery 🔭")
#     gr.Markdown("## Select dataset and text column")
#     with gr.Accordion("Data details", open=True):
#         with gr.Row():
#             with gr.Column(scale=3):
#                 dataset_name = HuggingfaceHubSearch(
#                     label="Hub Dataset ID",
#                     placeholder="Search for dataset id on Huggingface",
#                     search_type="dataset",
#                 )
#                 subset_dropdown = gr.Dropdown(label="Subset", visible=False)
#                 split_dropdown = gr.Dropdown(label="Split", visible=False)
#                 with gr.Accordion("Dataset preview", open=False):

#                     @gr.render(inputs=[dataset_name, subset_dropdown, split_dropdown])
#                     def embed(name, subset, split):
#                         html_code = f"""
#                         """
#                         return gr.HTML(value=html_code)

#         with gr.Row():
#             text_column_dropdown = gr.Dropdown(label="Text column name")
#             nested_text_column_dropdown = gr.Dropdown(
#                 label="Nested text column name", visible=False
#             )

#     generate_button = gr.Button("Generate Topics", variant="primary")
#     gr.Markdown("## Datamap")
#     full_topics_generation_label = gr.Label(visible=False, show_label=False)
#     topics_plot = gr.Plot()
#     with gr.Accordion("Topics Info", open=False):
#         topics_df = gr.DataFrame(interactive=False, visible=True)
#     generate_button.click(
#         generate_topics,
#         inputs=[
#             dataset_name,
#             subset_dropdown,
#             split_dropdown,
#             text_column_dropdown,
#             nested_text_column_dropdown,
#         ],
#         outputs=[topics_df, topics_plot, full_topics_generation_label],
#     )

#     def _resolve_dataset_selection(
#         dataset: str, default_subset: str, default_split: str, text_feature
#     ):
#         if "/" not in dataset.strip().strip("/"):
#             return {
#                 subset_dropdown: gr.Dropdown(visible=False),
#                 split_dropdown: gr.Dropdown(visible=False),
#                 text_column_dropdown: gr.Dropdown(label="Text column name"),
#                 nested_text_column_dropdown: gr.Dropdown(visible=False),
#             }
#         info_resp = session.get(
#             f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=20
#         ).json()
#         if "error" in info_resp:
#             return {
#                 subset_dropdown: gr.Dropdown(visible=False),
#                 split_dropdown: gr.Dropdown(visible=False),
#                 text_column_dropdown: gr.Dropdown(label="Text column name"),
#                 nested_text_column_dropdown: gr.Dropdown(visible=False),
#             }
#         subsets: list[str] = list(info_resp["dataset_info"])
#         subset = default_subset if default_subset in subsets else subsets[0]
#         splits: list[str] = list(info_resp["dataset_info"][subset]["splits"])
#         split = default_split if default_split in splits else splits[0]
#         features = info_resp["dataset_info"][subset]["features"]

#         def _is_string_feature(feature):
#             return isinstance(feature, dict) and feature.get("dtype") == "string"

#         text_features = [
#             feature_name
#             for feature_name, feature in features.items()
#             if _is_string_feature(feature)
#         ]
#         nested_features = [
#             feature_name
#             for feature_name, feature in features.items()
#             if isinstance(feature, dict)
#             and isinstance(next(iter(feature.values())), dict)
#         ]
#         nested_text_features = [
#             feature_name
#             for feature_name in nested_features
#             if any(
#                 _is_string_feature(nested_feature)
#                 for nested_feature in features[feature_name].values()
#             )
#         ]
#         if not text_feature:
#             return {
#                 subset_dropdown: gr.Dropdown(
#                     value=subset, choices=subsets, visible=len(subsets) > 1
#                 ),
#                 split_dropdown: gr.Dropdown(
#                     value=split, choices=splits, visible=len(splits) > 1
#                 ),
#                 text_column_dropdown: gr.Dropdown(
#                     choices=text_features + nested_text_features,
#                     label="Text column name",
#                 ),
#                 nested_text_column_dropdown: gr.Dropdown(visible=False),
#             }
#         if text_feature in nested_text_features:
#             nested_keys = [
#                 feature_name
#                 for feature_name, feature in features[text_feature].items()
#                 if _is_string_feature(feature)
#             ]
#             return {
#                 subset_dropdown: gr.Dropdown(
#                     value=subset, choices=subsets, visible=len(subsets) > 1
#                 ),
#                 split_dropdown: gr.Dropdown(
#                     value=split, choices=splits, visible=len(splits) > 1
#                 ),
#                 text_column_dropdown: gr.Dropdown(
#                     choices=text_features + nested_text_features,
#                     label="Text column name",
#                 ),
#                 nested_text_column_dropdown: gr.Dropdown(
#                     value=nested_keys[0],
#                     choices=nested_keys,
#                     label="Nested text column name",
#                     visible=True,
#                 ),
#             }
#         return {
#             subset_dropdown: gr.Dropdown(
#                 value=subset, choices=subsets, visible=len(subsets) > 1
#             ),
#             split_dropdown: gr.Dropdown(
#                 value=split, choices=splits, visible=len(splits) > 1
#             ),
#             text_column_dropdown: gr.Dropdown(
#                 choices=text_features + nested_text_features, label="Text column name"
#             ),
#             nested_text_column_dropdown: gr.Dropdown(visible=False),
#         }
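# # _resolve_dataset_selection returns a dict of component -> gr.Dropdown update,
# # the mapping form Gradio accepts when a listener declares several outputs. The
# # subset and split pickers stay hidden while only one choice exists, and the
# # nested-column picker is shown only for struct-like features whose sub-fields
# # include at least one string column.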
#     @dataset_name.change(
#         inputs=[dataset_name],
#         outputs=[
#             subset_dropdown,
#             split_dropdown,
#             text_column_dropdown,
#             nested_text_column_dropdown,
#         ],
#     )
#     def show_input_from_dataset_dropdown(dataset: str) -> dict:
#         return _resolve_dataset_selection(
#             dataset, default_subset="default", default_split="train", text_feature=None
#         )

#     @subset_dropdown.change(
#         inputs=[dataset_name, subset_dropdown],
#         outputs=[
#             subset_dropdown,
#             split_dropdown,
#             text_column_dropdown,
#             nested_text_column_dropdown,
#         ],
#     )
#     def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
#         return _resolve_dataset_selection(
#             dataset, default_subset=subset, default_split="train", text_feature=None
#         )

#     @split_dropdown.change(
#         inputs=[dataset_name, subset_dropdown, split_dropdown],
#         outputs=[
#             subset_dropdown,
#             split_dropdown,
#             text_column_dropdown,
#             nested_text_column_dropdown,
#         ],
#     )
#     def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
#         return _resolve_dataset_selection(
#             dataset, default_subset=subset, default_split=split, text_feature=None
#         )

#     @text_column_dropdown.change(
#         inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown],
#         outputs=[
#             subset_dropdown,
#             split_dropdown,
#             text_column_dropdown,
#             nested_text_column_dropdown,
#         ],
#     )
#     def show_input_from_text_column_dropdown(
#         dataset: str, subset: str, split: str, text_column
#     ) -> dict:
#         return _resolve_dataset_selection(
#             dataset,
#             default_subset=subset,
#             default_split=split,
#             text_feature=text_column,
#         )


# demo.launch()


import gradio as gr

# Full HTML content
html_content = """
<div>
    <h1>Welcome to My Gradio App</h1>

    <p>This is a paragraph with <b>bold</b> and <i>italic</i> text.</p>

    <img src="sample.png" alt="Sample Image">  <!-- hypothetical placeholder src -->
</div>
"""

# Create a Gradio interface
with gr.Blocks() as demo:
    gr.HTML(html_content)

# Launch the app
demo.launch()
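# Running this file directly (e.g. `python app.py`; the file name is an
# assumption) serves the demo on Gradio's default local URL, typically
# http://127.0.0.1:7860.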