diff --git "a/main.ipynb" "b/main.ipynb" new file mode 100644--- /dev/null +++ "b/main.ipynb" @@ -0,0 +1,4401 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "522yaTM-fURe" + }, + "source": [ + "Structure\n", + "\n", + "1. Data: (dataset creation)/downloading existing dataset -> reading data\n", + "2. Top2Vec model training\n", + "3. Working with Top2vec features\n", + "4. Visualization\n", + "5. Gradio interface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-t7--rlsklAt" + }, + "outputs": [], + "source": [ + "#@title Installing necessary dependencies\n", + "%%capture\n", + "!pip install arxivscraper\n", + "!pip install top2vec\n", + "!pip install top2vec[sentence_encoders]\n", + "!pip install tensorflow==2.8.0\n", + "!pip install tensorflow-probability==0.16.0\n", + "!pip install gradio" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fwXn6Dd2SZgs" + }, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jrufBESmlBEL" + }, + "source": [ + "Below are 2 options:\n", + "1. Create new dataset\n", + "2. 
Use an existing dataset that contains arXiv articles from th**e Computer Science** (CS) category, spanning from **2010 to 2023**.necessary dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "iMJmB6JvJjXy" + }, + "outputs": [], + "source": [ + "#@title Create dataset\n", + "\n", + "### All commented but you can uncomment if you want to create a dataset\n", + "\n", + "# # Extracting and Processing arXiv Data\n", + "# import arxivscraper\n", + "# import pandas as pd\n", + "\n", + "# def scrape_and_save(category, start_year, end_year):\n", + "# \"\"\"Scrape arXiv data for a given category and range of years.\"\"\"\n", + "# for year in range(start_year, end_year):\n", + "# scraper = arxivscraper.Scraper(category=category,\n", + "# date_from=f'{year}-01-01',\n", + "# date_until=f'{year+1}-01-01')\n", + "# df = pd.DataFrame(scraper.scrape())\n", + "# df.to_csv(f'arxiv_{category}_{year}.csv', index=False)\n", + "# print(f'Data for {year} saved.')\n", + "\n", + "# def combine_and_process(file_names):\n", + "# \"\"\"Combine multiple CSV files into a single DataFrame and process the data.\"\"\"\n", + "# df_list = []\n", + "# for file_name in file_names:\n", + "# df_temp = pd.read_csv(file_name, dtype={'id': str}, low_memory=False)\n", + "# # Convert text columns to strings\n", + "# text_columns = ['title', 'abstract', 'categories', 'doi', 'authors', 'url']\n", + "# df_temp[text_columns] = df_temp[text_columns].astype(str)\n", + "# # Convert date columns to datetime, with invalid dates set as NaT\n", + "# date_columns = ['created', 'updated']\n", + "# df_temp[date_columns] = pd.to_datetime(df_temp[date_columns], errors='coerce')\n", + "# df_list.append(df_temp)\n", + "# # Combine all DataFrames into one\n", + "# df_combined = pd.concat(df_list, ignore_index=True)\n", + "# # Convert NaNs to 'None' for text columns\n", + "# df_combined[text_columns] = df_combined[text_columns].fillna('None')\n", + "# return 
df_combined\n", + "\n", + "# # Scrape data for the 'cs' category from 2010 to 2023\n", + "# scrape_and_save(category='cs', start_year=2010, end_year=2024)\n", + "\n", + "# # Combine and process the scraped data\n", + "# file_names = [f'arxiv_cs_{year}.csv' for year in range(2010, 2024)]\n", + "# df_combined = combine_and_process(file_names)\n", + "\n", + "# # Save the combined DataFrame as a Parquet file\n", + "# try:\n", + "# df_combined.to_parquet('combined_data.parquet', index=False)\n", + "# print(\"File successfully saved in Parquet format.\")\n", + "# except Exception as e:\n", + "# print(f\"Error saving file: {e}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "rlMyF401mWuH" + }, + "outputs": [], + "source": [ + "#@title Downloading existing dataset\n", + "%%capture\n", + "!wget https://huggingface.co/datasets/CCRss/arxiv_papers_cs/resolve/main/arxiv_cs_from2010to2024-01-01.parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 310 + }, + "id": "57Xc92VzmQVU", + "outputId": "0fadd68f-dff1-4e39-8ce3-f03a77bcfa1c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(555563, 11)\n" + ] + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleidabstractcategoriesdoicreatedupdatedauthorsurlabstract_lengthid_n
0on-line viterbi algorithm and its relationship...0704.0062in this paper, we introduce the on-line viterb...cs.ds10.1007/978-3-540-74126-8_232007-03-31NaT['šrámek', 'brejová', 'vinař']https://arxiv.org/abs/0704.00627110
1capacity of a multiple-antenna fading channel ...0704.0217given a multiple-input multiple-output (mimo) ...cs.it math.it10.1109/tit.2008.20114372007-04-022009-02-16['santipach', 'honig']https://arxiv.org/abs/0704.021716411
2refuting the pseudo attack on the reesse1+ cry...0704.0492we illustrate through example 1 and 2 that the...cs.crnan2007-04-042010-02-04['su', 'lu']https://arxiv.org/abs/0704.049213452
3optimal routing for decode-and-forward based c...0704.0499we investigate cooperative wireless relay netw...cs.it math.it10.1109/sahcn.2007.42928452007-04-04NaT['ong', 'motani']https://arxiv.org/abs/0704.049911643
4on the kolmogorov-chaitin complexity for short...0704.1043a drawback of kolmogorov-chaitin complexity (k...cs.cc cs.it math.itnan2007-04-082010-12-16['delahaye', 'zenil']https://arxiv.org/abs/0704.10438614
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " title id \\\n", + "0 on-line viterbi algorithm and its relationship... 0704.0062 \n", + "1 capacity of a multiple-antenna fading channel ... 0704.0217 \n", + "2 refuting the pseudo attack on the reesse1+ cry... 0704.0492 \n", + "3 optimal routing for decode-and-forward based c... 0704.0499 \n", + "4 on the kolmogorov-chaitin complexity for short... 0704.1043 \n", + "\n", + " abstract categories \\\n", + "0 in this paper, we introduce the on-line viterb... cs.ds \n", + "1 given a multiple-input multiple-output (mimo) ... cs.it math.it \n", + "2 we illustrate through example 1 and 2 that the... cs.cr \n", + "3 we investigate cooperative wireless relay netw... cs.it math.it \n", + "4 a drawback of kolmogorov-chaitin complexity (k... cs.cc cs.it math.it \n", + "\n", + " doi created updated \\\n", + "0 10.1007/978-3-540-74126-8_23 2007-03-31 NaT \n", + "1 10.1109/tit.2008.2011437 2007-04-02 2009-02-16 \n", + "2 nan 2007-04-04 2010-02-04 \n", + "3 10.1109/sahcn.2007.4292845 2007-04-04 NaT \n", + "4 nan 2007-04-08 2010-12-16 \n", + "\n", + " authors url \\\n", + "0 ['šrámek', 'brejová', 'vinař'] https://arxiv.org/abs/0704.0062 \n", + "1 ['santipach', 'honig'] https://arxiv.org/abs/0704.0217 \n", + "2 ['su', 'lu'] https://arxiv.org/abs/0704.0492 \n", + "3 ['ong', 'motani'] https://arxiv.org/abs/0704.0499 \n", + "4 ['delahaye', 'zenil'] https://arxiv.org/abs/0704.1043 \n", + "\n", + " abstract_length id_n \n", + "0 711 0 \n", + "1 1641 1 \n", + "2 1345 2 \n", + "3 1164 3 \n", + "4 861 4 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#@title Read data\n", + "import pandas as pd\n", + "\n", + "file_name = '/content/arxiv_cs_from2010to2024-01-01.parquet' # specifying path\n", + "df = pd.read_parquet(file_name)\n", + "\n", + "print(df.shape)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "__snAaRtVVQI" + }, + "source": [ + "##OpenAI api TODO\n" + 
] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4q_FgRlVVc49" + }, + "source": [ + "https://platform.openai.com/docs/guides/text-generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fV7Bs3FQmR9N" + }, + "outputs": [], + "source": [ + "# %%capture\n", + "# !pip install openai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C0gJOBQPxVyf" + }, + "outputs": [], + "source": [ + "# # Read the CSV file into a DataFrame\n", + "# df_open = pd.read_csv('/content/UAVs&ecology_keywords - Sheet1.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "a8STl_1dTNjM" + }, + "outputs": [], + "source": [ + "# import pandas as pd\n", + "# from openai import OpenAI\n", + "\n", + "# # OpenAI API setup\n", + "# client = OpenAI(api_key=' ')\n", + "# #TODO make instruction more strict\n", + "# instruction_for_topics = \"\"\"\n", + "# Instruction for Generating Topic Keywords:\n", + "# Analyze the provided keywords from documents within the topic.\n", + "# Identify the most representative and recurring keywords.\n", + "# Exclude any variations of 'UAV', 'unmanned aerial vehicle', or 'drones'.\n", + "# For multi-word keywords, split them into single words that are commonly recognized in the field.\n", + "# Select the top five keywords that best capture the essence of the topic, ensuring they are single words.\n", + "# Format Keywords: List the selected keywords in this format: [\"keyword1\", \"keyword2\", \"keyword3\", \"keyword4\", \"keyword5\"].\n", + "# Provide Keywords Only: Respond with the formatted list of keywords, without any additional sentences or explanations.\n", + "# \"\"\"\n", + "# # Generate keywords for each topic\n", + "# topic_keywords = {}\n", + "# for topic in df_open['topic'].unique():\n", + "# combined_keywords = '; '.join(df_open[df_open['topic'] == topic]['keywords'])\n", + "# completion = client.chat.completions.create(\n", + 
"# model=\"gpt-3.5-turbo\",\n", + "# seed=5,\n", + "# temperature=0.1,\n", + "# max_tokens=100,\n", + "# messages=[\n", + "# {\n", + "# \"role\": \"user\",\n", + "# \"content\": instruction_for_topics + combined_keywords,\n", + "# },\n", + "# ],\n", + "# )\n", + "# topic_keywords[topic] = completion.choices[0].message.content\n", + "\n", + "# print(topic_keywords)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Qol5Fif4WCqQ" + }, + "outputs": [], + "source": [ + "# for topic, keywords in topic_keywords.items():\n", + "# df_open.loc[df_open['topic'] == topic, 'generated_keywords'] = keywords\n", + "\n", + "# # Now, 'df' will have a new column 'generated_keywords' with the generated keywords for each topic\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9CtxAZWNr_tu" + }, + "outputs": [], + "source": [ + "# df_open.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PymI90pNuW7Q" + }, + "outputs": [], + "source": [ + "# df_open.to_csv('/content/UAVs&ecology_keywords_updated.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7r4Bz9Zl2XNG" + }, + "source": [ + "## Top2Vec" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kxEOZR29sOv2" + }, + "source": [ + "### Model training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Disi6TvezLHD" + }, + "outputs": [], + "source": [ + "# ### All commented because model already trained and we can download it from huggingface\n", + "# #@title Specifying model parameters\n", + "# from top2vec import Top2Vec\n", + "# # Create a list of strings to pass it in Top2Vec\n", + "# docs = df.abstract.tolist()\n", + "\n", + "# model = Top2Vec(\n", + "# documents=docs,\n", + "# speed='learn',\n", + "# workers=80,\n", + "# embedding_model='universal-sentence-encoder',\n", + "# umap_args={'n_neighbors': 15,\n", + "# 'n_components': 
5,\n", + "# 'metric': 'cosine',\n", + "# 'min_dist': 0.0,\n", + "# 'random_state': 42},\n", + "# hdbscan_args={'min_cluster_size': 15,\n", + "# 'metric': 'euclidean',\n", + "# 'cluster_selection_method': 'eom'}\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dFqiiLgdq-hO" + }, + "outputs": [], + "source": [ + "#@title Save model\n", + "# model.save('arxiv_cs_from2010to2024-01-01')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F18TmBKfsm2E" + }, + "source": [ + "### Model initialization and assigning Document Topics\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BPsNKwIe7JUs" + }, + "outputs": [], + "source": [ + "#@title Downloading trained top2vec model from Hugging Face\n", + "%%capture\n", + "!wget https://huggingface.co/CCRss/topic_modeling_top2vec_scientific-texts/resolve/main/top2vec_model_arxiv_cs_from2010to2024-01-01" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VNP_coPQss4s" + }, + "outputs": [], + "source": [ + "#@title Load model\n", + "from top2vec import Top2Vec\n", + "model = Top2Vec.load(\"/content/top2vec_model_arxiv_cs_from2010to2024-01-01\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "j4p5L6XfZmaN" + }, + "outputs": [], + "source": [ + "#@title Assigning Document Topics and Creating a Sorted DataFrame\n", + "# Get topic sizes and numbers\n", + "topic_sizes, topic_nums = model.get_topic_sizes()\n", + "\n", + "# Initialize an empty list for results\n", + "data = []\n", + "\n", + "# Iterate over each topic\n", + "for topic_num in topic_nums:\n", + " # Get documents belonging to the topic\n", + " _, _, document_ids = model.search_documents_by_topic(topic_num=topic_num, num_docs=topic_sizes[topic_num])\n", + "\n", + " # Add document IDs and topic number to the list\n", + " for doc_id in document_ids:\n", + " data.append({'document_id': doc_id, 
'topic_num': topic_num})\n", + "\n", + "# Create a DataFrame from the list\n", + "df_new = pd.DataFrame(data)\n", + "\n", + "# Sort the new DataFrame by document_id\n", + "df_new = df_new.sort_values(by='document_id').reset_index(drop=True)\n", + "\n", + "# Assign topic numbers to the original DataFrame\n", + "df['topic_num'] = df_new['topic_num']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "dwGyOrnhn2DQ" + }, + "outputs": [], + "source": [ + "#@title Get topic representations.\n", + "topic_words, word_scores, topic_nums = model.get_topics()\n", + "\n", + "# Function to create a topic representation string\n", + "def create_topic_representation(words, scores):\n", + " return ', '.join(words[:5]) # Join the first 5 words with commas\n", + "\n", + "# Create a list of topic representations\n", + "topic_representations = [create_topic_representation(words, scores) for words, scores in zip(topic_words, word_scores)]\n", + "\n", + "# Convert the list to a pandas Series\n", + "topic_representation_dict = dict(zip(topic_nums, topic_representations))\n", + "\n", + "# Map topic numbers to representations in the DataFrame\n", + "df['topic_representation'] = df['topic_num'].map(topic_representation_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "165rLE-go0XV" + }, + "source": [ + "### Identification and evaluation of thematic groups" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "O6c8Xphm56QE", + "outputId": "da42451a-ad42-4141-88ed-f18b8c887f5e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The amount of docs in every topic:\n", + "------------------------------\n", + "Topic 0: 40891 docs\n", + "Topic 1: 9478 docs\n", + "Topic 2: 9250 docs\n", + "Topic 3: 9052 docs\n", + "Topic 4: 7547 docs\n", + "Topic 5: 7365 docs\n", + "Topic 
6: 6434 docs\n", + "Topic 7: 5701 docs\n", + "Topic 8: 5652 docs\n", + "Topic 9: 5367 docs\n", + "Topic 10: 5304 docs\n", + "Topic 11: 5127 docs\n", + "Topic 12: 4999 docs\n", + "Topic 13: 4830 docs\n", + "Topic 14: 4782 docs\n", + "Topic 15: 4269 docs\n", + "Topic 16: 3997 docs\n", + "Topic 17: 3994 docs\n", + "Topic 18: 3866 docs\n", + "Topic 19: 3817 docs\n", + "Topic 20: 3693 docs\n", + "Topic 21: 3637 docs\n", + "Topic 22: 3537 docs\n", + "Topic 23: 3430 docs\n", + "Topic 24: 3365 docs\n", + "Topic 25: 3150 docs\n", + "Topic 26: 3133 docs\n", + "Topic 27: 3115 docs\n", + "Topic 28: 3033 docs\n", + "Topic 29: 3026 docs\n", + "Topic 30: 3006 docs\n", + "Topic 31: 2990 docs\n", + "Topic 32: 2969 docs\n", + "Topic 33: 2939 docs\n", + "Topic 34: 2916 docs\n", + "Topic 35: 2879 docs\n", + "Topic 36: 2745 docs\n", + "Topic 37: 2677 docs\n", + "Topic 38: 2660 docs\n", + "Topic 39: 2645 docs\n", + "Topic 40: 2627 docs\n", + "Topic 41: 2591 docs\n", + "Topic 42: 2546 docs\n", + "Topic 43: 2508 docs\n", + "Topic 44: 2482 docs\n", + "Topic 45: 2472 docs\n", + "Topic 46: 2452 docs\n", + "Topic 47: 2429 docs\n", + "Topic 48: 2359 docs\n", + "Topic 49: 2325 docs\n", + "Topic 50: 2321 docs\n", + "Topic 51: 2296 docs\n", + "Topic 52: 2278 docs\n", + "Topic 53: 2262 docs\n", + "Topic 54: 2253 docs\n", + "Topic 55: 2217 docs\n", + "Topic 56: 2217 docs\n", + "Topic 57: 2191 docs\n", + "Topic 58: 2166 docs\n", + "Topic 59: 2153 docs\n", + "Topic 60: 2020 docs\n", + "Topic 61: 1980 docs\n", + "Topic 62: 1939 docs\n", + "Topic 63: 1900 docs\n", + "Topic 64: 1888 docs\n", + "Topic 65: 1859 docs\n", + "Topic 66: 1851 docs\n", + "Topic 67: 1832 docs\n", + "Topic 68: 1757 docs\n", + "Topic 69: 1746 docs\n", + "Topic 70: 1726 docs\n", + "Topic 71: 1726 docs\n", + "Topic 72: 1722 docs\n", + "Topic 73: 1691 docs\n", + "Topic 74: 1679 docs\n", + "Topic 75: 1657 docs\n", + "Topic 76: 1628 docs\n", + "Topic 77: 1591 docs\n", + "Topic 78: 1584 docs\n", + "Topic 79: 1563 docs\n", + "Topic 80: 
1554 docs\n", + "Topic 81: 1549 docs\n", + "Topic 82: 1533 docs\n", + "Topic 83: 1530 docs\n", + "Topic 84: 1527 docs\n", + "Topic 85: 1495 docs\n", + "Topic 86: 1464 docs\n", + "Topic 87: 1436 docs\n", + "Topic 88: 1428 docs\n", + "Topic 89: 1414 docs\n", + "Topic 90: 1404 docs\n", + "Topic 91: 1378 docs\n", + "Topic 92: 1378 docs\n", + "Topic 93: 1361 docs\n", + "Topic 94: 1355 docs\n", + "Topic 95: 1344 docs\n", + "Topic 96: 1342 docs\n", + "Topic 97: 1335 docs\n", + "Topic 98: 1329 docs\n", + "Topic 99: 1299 docs\n", + "Topic 100: 1297 docs\n", + "Topic 101: 1279 docs\n", + "Topic 102: 1261 docs\n", + "Topic 103: 1252 docs\n", + "Topic 104: 1251 docs\n", + "Topic 105: 1251 docs\n", + "Topic 106: 1241 docs\n", + "Topic 107: 1203 docs\n", + "Topic 108: 1202 docs\n", + "Topic 109: 1197 docs\n", + "Topic 110: 1186 docs\n", + "Topic 111: 1177 docs\n", + "Topic 112: 1171 docs\n", + "Topic 113: 1162 docs\n", + "Topic 114: 1154 docs\n", + "Topic 115: 1139 docs\n", + "Topic 116: 1135 docs\n", + "Topic 117: 1129 docs\n", + "Topic 118: 1117 docs\n", + "Topic 119: 1115 docs\n", + "Topic 120: 1111 docs\n", + "Topic 121: 1106 docs\n", + "Topic 122: 1098 docs\n", + "Topic 123: 1096 docs\n", + "Topic 124: 1094 docs\n", + "Topic 125: 1090 docs\n", + "Topic 126: 1083 docs\n", + "Topic 127: 1082 docs\n", + "Topic 128: 1070 docs\n", + "Topic 129: 1068 docs\n", + "Topic 130: 1067 docs\n", + "Topic 131: 1065 docs\n", + "Topic 132: 1061 docs\n", + "Topic 133: 1054 docs\n", + "Topic 134: 1051 docs\n", + "Topic 135: 1044 docs\n", + "Topic 136: 1033 docs\n", + "Topic 137: 1014 docs\n", + "Topic 138: 1010 docs\n", + "Topic 139: 1006 docs\n", + "Topic 140: 997 docs\n", + "Topic 141: 994 docs\n", + "Topic 142: 988 docs\n", + "Topic 143: 987 docs\n", + "Topic 144: 986 docs\n", + "Topic 145: 981 docs\n", + "Topic 146: 967 docs\n", + "Topic 147: 961 docs\n", + "Topic 148: 951 docs\n", + "Topic 149: 932 docs\n", + "Topic 150: 918 docs\n", + "Topic 151: 917 docs\n", + "Topic 152: 916 docs\n", + 
"Topic 153: 916 docs\n", + "Topic 154: 915 docs\n", + "Topic 155: 912 docs\n", + "Topic 156: 889 docs\n", + "Topic 157: 887 docs\n", + "Topic 158: 885 docs\n", + "Topic 159: 880 docs\n", + "Topic 160: 871 docs\n", + "Topic 161: 866 docs\n", + "Topic 162: 865 docs\n", + "Topic 163: 862 docs\n", + "Topic 164: 862 docs\n", + "Topic 165: 847 docs\n", + "Topic 166: 845 docs\n", + "Topic 167: 844 docs\n", + "Topic 168: 837 docs\n", + "Topic 169: 836 docs\n", + "Topic 170: 830 docs\n", + "Topic 171: 825 docs\n", + "Topic 172: 818 docs\n", + "Topic 173: 817 docs\n", + "Topic 174: 809 docs\n", + "Topic 175: 808 docs\n", + "Topic 176: 794 docs\n", + "Topic 177: 793 docs\n", + "Topic 178: 772 docs\n", + "Topic 179: 771 docs\n", + "Topic 180: 767 docs\n", + "Topic 181: 763 docs\n", + "Topic 182: 758 docs\n", + "Topic 183: 754 docs\n", + "Topic 184: 747 docs\n", + "Topic 185: 746 docs\n", + "Topic 186: 741 docs\n", + "Topic 187: 740 docs\n", + "Topic 188: 739 docs\n", + "Topic 189: 738 docs\n", + "Topic 190: 735 docs\n", + "Topic 191: 733 docs\n", + "Topic 192: 724 docs\n", + "Topic 193: 722 docs\n", + "Topic 194: 718 docs\n", + "Topic 195: 718 docs\n", + "Topic 196: 716 docs\n", + "Topic 197: 716 docs\n", + "Topic 198: 705 docs\n", + "Topic 199: 704 docs\n", + "Topic 200: 704 docs\n", + "Topic 201: 702 docs\n", + "Topic 202: 700 docs\n", + "Topic 203: 693 docs\n", + "Topic 204: 680 docs\n", + "Topic 205: 671 docs\n", + "Topic 206: 668 docs\n", + "Topic 207: 661 docs\n", + "Topic 208: 661 docs\n", + "Topic 209: 660 docs\n", + "Topic 210: 648 docs\n", + "Topic 211: 645 docs\n", + "Topic 212: 642 docs\n", + "Topic 213: 642 docs\n", + "Topic 214: 642 docs\n", + "Topic 215: 641 docs\n", + "Topic 216: 640 docs\n", + "Topic 217: 639 docs\n", + "Topic 218: 635 docs\n", + "Topic 219: 632 docs\n", + "Topic 220: 630 docs\n", + "Topic 221: 628 docs\n", + "Topic 222: 622 docs\n", + "Topic 223: 622 docs\n", + "Topic 224: 620 docs\n", + "Topic 225: 617 docs\n", + "Topic 226: 616 docs\n", + 
"Topic 227: 607 docs\n", + "Topic 228: 601 docs\n", + "Topic 229: 600 docs\n", + "Topic 230: 600 docs\n", + "Topic 231: 599 docs\n", + "Topic 232: 595 docs\n", + "Topic 233: 595 docs\n", + "Topic 234: 595 docs\n", + "Topic 235: 593 docs\n", + "Topic 236: 592 docs\n", + "Topic 237: 589 docs\n", + "Topic 238: 587 docs\n", + "Topic 239: 586 docs\n", + "Topic 240: 586 docs\n", + "Topic 241: 583 docs\n", + "Topic 242: 578 docs\n", + "Topic 243: 575 docs\n", + "Topic 244: 572 docs\n", + "Topic 245: 571 docs\n", + "Topic 246: 570 docs\n", + "Topic 247: 569 docs\n", + "Topic 248: 564 docs\n", + "Topic 249: 560 docs\n", + "Topic 250: 556 docs\n", + "Topic 251: 555 docs\n", + "Topic 252: 553 docs\n", + "Topic 253: 552 docs\n", + "Topic 254: 551 docs\n", + "Topic 255: 550 docs\n", + "Topic 256: 549 docs\n", + "Topic 257: 547 docs\n", + "Topic 258: 547 docs\n", + "Topic 259: 544 docs\n", + "Topic 260: 541 docs\n", + "Topic 261: 534 docs\n", + "Topic 262: 530 docs\n", + "Topic 263: 530 docs\n", + "Topic 264: 527 docs\n", + "Topic 265: 521 docs\n", + "Topic 266: 518 docs\n", + "Topic 267: 517 docs\n", + "Topic 268: 515 docs\n", + "Topic 269: 514 docs\n", + "Topic 270: 513 docs\n", + "Topic 271: 512 docs\n", + "Topic 272: 510 docs\n", + "Topic 273: 506 docs\n", + "Topic 274: 504 docs\n", + "Topic 275: 501 docs\n", + "Topic 276: 501 docs\n", + "Topic 277: 499 docs\n", + "Topic 278: 497 docs\n", + "Topic 279: 496 docs\n", + "Topic 280: 490 docs\n", + "Topic 281: 487 docs\n", + "Topic 282: 484 docs\n", + "Topic 283: 483 docs\n", + "Topic 284: 482 docs\n", + "Topic 285: 481 docs\n", + "Topic 286: 481 docs\n", + "Topic 287: 480 docs\n", + "Topic 288: 478 docs\n", + "Topic 289: 476 docs\n", + "Topic 290: 468 docs\n", + "Topic 291: 468 docs\n", + "Topic 292: 466 docs\n", + "Topic 293: 465 docs\n", + "Topic 294: 464 docs\n", + "Topic 295: 464 docs\n", + "Topic 296: 460 docs\n", + "Topic 297: 459 docs\n", + "Topic 298: 452 docs\n", + "Topic 299: 451 docs\n", + "Topic 300: 450 docs\n", + 
"Topic 301: 449 docs\n", + "Topic 302: 447 docs\n", + "Topic 303: 446 docs\n", + "Topic 304: 445 docs\n", + "Topic 305: 444 docs\n", + "Topic 306: 443 docs\n", + "Topic 307: 440 docs\n", + "Topic 308: 439 docs\n", + "Topic 309: 435 docs\n", + "Topic 310: 433 docs\n", + "Topic 311: 432 docs\n", + "Topic 312: 431 docs\n", + "Topic 313: 431 docs\n", + "Topic 314: 430 docs\n", + "Topic 315: 428 docs\n", + "Topic 316: 427 docs\n", + "Topic 317: 426 docs\n", + "Topic 318: 424 docs\n", + "Topic 319: 418 docs\n", + "Topic 320: 418 docs\n", + "Topic 321: 415 docs\n", + "Topic 322: 414 docs\n", + "Topic 323: 411 docs\n", + "Topic 324: 410 docs\n", + "Topic 325: 409 docs\n", + "Topic 326: 406 docs\n", + "Topic 327: 404 docs\n", + "Topic 328: 402 docs\n", + "Topic 329: 395 docs\n", + "Topic 330: 391 docs\n", + "Topic 331: 390 docs\n", + "Topic 332: 389 docs\n", + "Topic 333: 386 docs\n", + "Topic 334: 384 docs\n", + "Topic 335: 383 docs\n", + "Topic 336: 382 docs\n", + "Topic 337: 379 docs\n", + "Topic 338: 379 docs\n", + "Topic 339: 371 docs\n", + "Topic 340: 370 docs\n", + "Topic 341: 370 docs\n", + "Topic 342: 367 docs\n", + "Topic 343: 363 docs\n", + "Topic 344: 363 docs\n", + "Topic 345: 360 docs\n", + "Topic 346: 355 docs\n", + "Topic 347: 354 docs\n", + "Topic 348: 352 docs\n", + "Topic 349: 351 docs\n", + "Topic 350: 350 docs\n", + "Topic 351: 349 docs\n", + "Topic 352: 349 docs\n", + "Topic 353: 347 docs\n", + "Topic 354: 347 docs\n", + "Topic 355: 346 docs\n", + "Topic 356: 342 docs\n", + "Topic 357: 341 docs\n", + "Topic 358: 338 docs\n", + "Topic 359: 337 docs\n", + "Topic 360: 334 docs\n", + "Topic 361: 332 docs\n", + "Topic 362: 332 docs\n", + "Topic 363: 331 docs\n", + "Topic 364: 328 docs\n", + "Topic 365: 327 docs\n", + "Topic 366: 326 docs\n", + "Topic 367: 325 docs\n", + "Topic 368: 323 docs\n", + "Topic 369: 322 docs\n", + "Topic 370: 322 docs\n", + "Topic 371: 321 docs\n", + "Topic 372: 321 docs\n", + "Topic 373: 318 docs\n", + "Topic 374: 317 docs\n", + 
"Topic 375: 316 docs\n", + "Topic 376: 316 docs\n", + "Topic 377: 315 docs\n", + "Topic 378: 313 docs\n", + "Topic 379: 312 docs\n", + "Topic 380: 310 docs\n", + "Topic 381: 308 docs\n", + "Topic 382: 307 docs\n", + "Topic 383: 306 docs\n", + "Topic 384: 302 docs\n", + "Topic 385: 301 docs\n", + "Topic 386: 300 docs\n", + "Topic 387: 299 docs\n", + "Topic 388: 296 docs\n", + "Topic 389: 296 docs\n", + "Topic 390: 296 docs\n", + "Topic 391: 294 docs\n", + "Topic 392: 293 docs\n", + "Topic 393: 291 docs\n", + "Topic 394: 288 docs\n", + "Topic 395: 287 docs\n", + "Topic 396: 286 docs\n", + "Topic 397: 284 docs\n", + "Topic 398: 284 docs\n", + "Topic 399: 282 docs\n", + "Topic 400: 280 docs\n", + "Topic 401: 279 docs\n", + "Topic 402: 278 docs\n", + "Topic 403: 278 docs\n", + "Topic 404: 277 docs\n", + "Topic 405: 277 docs\n", + "Topic 406: 276 docs\n", + "Topic 407: 276 docs\n", + "Topic 408: 273 docs\n", + "Topic 409: 273 docs\n", + "Topic 410: 272 docs\n", + "Topic 411: 272 docs\n", + "Topic 412: 271 docs\n", + "Topic 413: 270 docs\n", + "Topic 414: 270 docs\n", + "Topic 415: 264 docs\n", + "Topic 416: 263 docs\n", + "Topic 417: 263 docs\n", + "Topic 418: 262 docs\n", + "Topic 419: 262 docs\n", + "Topic 420: 261 docs\n", + "Topic 421: 257 docs\n", + "Topic 422: 257 docs\n", + "Topic 423: 255 docs\n", + "Topic 424: 253 docs\n", + "Topic 425: 252 docs\n", + "Topic 426: 250 docs\n", + "Topic 427: 247 docs\n", + "Topic 428: 246 docs\n", + "Topic 429: 245 docs\n", + "Topic 430: 244 docs\n", + "Topic 431: 244 docs\n", + "Topic 432: 243 docs\n", + "Topic 433: 242 docs\n", + "Topic 434: 242 docs\n", + "Topic 435: 240 docs\n", + "Topic 436: 239 docs\n", + "Topic 437: 239 docs\n", + "Topic 438: 239 docs\n", + "Topic 439: 239 docs\n", + "Topic 440: 238 docs\n", + "Topic 441: 238 docs\n", + "Topic 442: 235 docs\n", + "Topic 443: 230 docs\n", + "Topic 444: 228 docs\n", + "Topic 445: 228 docs\n", + "Topic 446: 227 docs\n", + "Topic 447: 226 docs\n", + "Topic 448: 226 docs\n", + 
"Topic 449: 226 docs\n", + "Topic 450: 226 docs\n", + "Topic 451: 226 docs\n", + "Topic 452: 224 docs\n", + "Topic 453: 224 docs\n", + "Topic 454: 224 docs\n", + "Topic 455: 222 docs\n", + "Topic 456: 221 docs\n", + "Topic 457: 219 docs\n", + "Topic 458: 219 docs\n", + "Topic 459: 218 docs\n", + "Topic 460: 213 docs\n", + "Topic 461: 212 docs\n", + "Topic 462: 212 docs\n", + "Topic 463: 210 docs\n", + "Topic 464: 208 docs\n", + "Topic 465: 208 docs\n", + "Topic 466: 207 docs\n", + "Topic 467: 206 docs\n", + "Topic 468: 206 docs\n", + "Topic 469: 205 docs\n", + "Topic 470: 205 docs\n", + "Topic 471: 204 docs\n", + "Topic 472: 202 docs\n", + "Topic 473: 201 docs\n", + "Topic 474: 201 docs\n", + "Topic 475: 200 docs\n", + "Topic 476: 200 docs\n", + "Topic 477: 198 docs\n", + "Topic 478: 197 docs\n", + "Topic 479: 197 docs\n", + "Topic 480: 195 docs\n", + "Topic 481: 194 docs\n", + "Topic 482: 193 docs\n", + "Topic 483: 192 docs\n", + "Topic 484: 192 docs\n", + "Topic 485: 192 docs\n", + "Topic 486: 191 docs\n", + "Topic 487: 191 docs\n", + "Topic 488: 190 docs\n", + "Topic 489: 189 docs\n", + "Topic 490: 188 docs\n", + "Topic 491: 187 docs\n", + "Topic 492: 187 docs\n", + "Topic 493: 186 docs\n", + "Topic 494: 184 docs\n", + "Topic 495: 184 docs\n", + "Topic 496: 184 docs\n", + "Topic 497: 183 docs\n", + "Topic 498: 183 docs\n", + "Topic 499: 182 docs\n", + "Topic 500: 181 docs\n", + "Topic 501: 179 docs\n", + "Topic 502: 178 docs\n", + "Topic 503: 178 docs\n", + "Topic 504: 177 docs\n", + "Topic 505: 177 docs\n", + "Topic 506: 177 docs\n", + "Topic 507: 176 docs\n", + "Topic 508: 176 docs\n", + "Topic 509: 175 docs\n", + "Topic 510: 174 docs\n", + "Topic 511: 173 docs\n", + "Topic 512: 171 docs\n", + "Topic 513: 171 docs\n", + "Topic 514: 170 docs\n", + "Topic 515: 170 docs\n", + "Topic 516: 169 docs\n", + "Topic 517: 169 docs\n", + "Topic 518: 168 docs\n", + "Topic 519: 168 docs\n", + "Topic 520: 168 docs\n", + "Topic 521: 167 docs\n", + "Topic 522: 167 docs\n", + 
"Topic 523: 166 docs\n", + "Topic 524: 166 docs\n", + "Topic 525: 165 docs\n", + "Topic 526: 165 docs\n", + "Topic 527: 163 docs\n", + "Topic 528: 162 docs\n", + "Topic 529: 160 docs\n", + "Topic 530: 159 docs\n", + "Topic 531: 159 docs\n", + "Topic 532: 159 docs\n", + "Topic 533: 158 docs\n", + "Topic 534: 157 docs\n", + "Topic 535: 155 docs\n", + "Topic 536: 154 docs\n", + "Topic 537: 154 docs\n", + "Topic 538: 153 docs\n", + "Topic 539: 152 docs\n", + "Topic 540: 152 docs\n", + "Topic 541: 152 docs\n", + "Topic 542: 151 docs\n", + "Topic 543: 151 docs\n", + "Topic 544: 151 docs\n", + "Topic 545: 150 docs\n", + "Topic 546: 149 docs\n", + "Topic 547: 149 docs\n", + "Topic 548: 148 docs\n", + "Topic 549: 147 docs\n", + "Topic 550: 146 docs\n", + "Topic 551: 146 docs\n", + "Topic 552: 145 docs\n", + "Topic 553: 145 docs\n", + "Topic 554: 144 docs\n", + "Topic 555: 144 docs\n", + "Topic 556: 143 docs\n", + "Topic 557: 143 docs\n", + "Topic 558: 142 docs\n", + "Topic 559: 142 docs\n", + "Topic 560: 140 docs\n", + "Topic 561: 140 docs\n", + "Topic 562: 136 docs\n", + "Topic 563: 135 docs\n", + "Topic 564: 135 docs\n", + "Topic 565: 132 docs\n", + "Topic 566: 132 docs\n", + "Topic 567: 130 docs\n", + "Topic 568: 129 docs\n", + "Topic 569: 128 docs\n", + "Topic 570: 127 docs\n", + "Topic 571: 125 docs\n", + "Topic 572: 123 docs\n", + "Topic 573: 122 docs\n", + "Topic 574: 121 docs\n", + "Topic 575: 121 docs\n", + "Topic 576: 121 docs\n", + "Topic 577: 121 docs\n", + "Topic 578: 121 docs\n", + "Topic 579: 120 docs\n", + "Topic 580: 120 docs\n", + "Topic 581: 120 docs\n", + "Topic 582: 120 docs\n", + "Topic 583: 120 docs\n", + "Topic 584: 120 docs\n", + "Topic 585: 119 docs\n", + "Topic 586: 119 docs\n", + "Topic 587: 118 docs\n", + "Topic 588: 118 docs\n", + "Topic 589: 118 docs\n", + "Topic 590: 117 docs\n", + "Topic 591: 117 docs\n", + "Topic 592: 117 docs\n", + "Topic 593: 116 docs\n", + "Topic 594: 115 docs\n", + "Topic 595: 114 docs\n", + "Topic 596: 113 docs\n", + 
"Topic 597: 111 docs\n", + "Topic 598: 111 docs\n", + "Topic 599: 110 docs\n", + "Topic 600: 110 docs\n", + "Topic 601: 110 docs\n", + "Topic 602: 107 docs\n", + "Topic 603: 107 docs\n", + "Topic 604: 105 docs\n", + "Topic 605: 105 docs\n", + "Topic 606: 104 docs\n", + "Topic 607: 103 docs\n", + "Topic 608: 103 docs\n", + "Topic 609: 103 docs\n", + "Topic 610: 102 docs\n", + "Topic 611: 101 docs\n", + "Topic 612: 101 docs\n", + "Topic 613: 100 docs\n", + "Topic 614: 99 docs\n", + "Topic 615: 97 docs\n", + "Topic 616: 97 docs\n", + "Topic 617: 96 docs\n", + "Topic 618: 96 docs\n", + "Topic 619: 95 docs\n", + "Topic 620: 93 docs\n", + "Topic 621: 92 docs\n", + "Topic 622: 92 docs\n", + "Topic 623: 91 docs\n", + "Topic 624: 90 docs\n", + "Topic 625: 90 docs\n", + "Topic 626: 89 docs\n", + "Topic 627: 89 docs\n", + "Topic 628: 89 docs\n", + "Topic 629: 89 docs\n", + "Topic 630: 89 docs\n", + "Topic 631: 88 docs\n", + "Topic 632: 86 docs\n", + "Topic 633: 85 docs\n", + "Topic 634: 83 docs\n", + "Topic 635: 82 docs\n", + "Topic 636: 82 docs\n", + "Topic 637: 82 docs\n", + "Topic 638: 82 docs\n", + "Topic 639: 80 docs\n", + "Topic 640: 79 docs\n", + "Topic 641: 75 docs\n", + "Topic 642: 73 docs\n", + "Topic 643: 72 docs\n", + "Topic 644: 72 docs\n", + "Topic 645: 72 docs\n", + "Topic 646: 71 docs\n", + "Topic 647: 70 docs\n", + "Topic 648: 70 docs\n", + "Topic 649: 68 docs\n", + "Topic 650: 67 docs\n", + "Topic 651: 66 docs\n", + "Topic 652: 66 docs\n", + "Topic 653: 65 docs\n", + "Topic 654: 63 docs\n", + "Topic 655: 62 docs\n", + "Topic 656: 61 docs\n", + "Topic 657: 61 docs\n", + "Topic 658: 61 docs\n", + "Topic 659: 60 docs\n", + "Topic 660: 59 docs\n", + "Topic 661: 59 docs\n", + "Topic 662: 58 docs\n", + "Topic 663: 57 docs\n", + "Topic 664: 55 docs\n", + "Topic 665: 55 docs\n", + "Topic 666: 54 docs\n", + "Topic 667: 53 docs\n", + "Topic 668: 52 docs\n", + "Topic 669: 51 docs\n", + "Topic 670: 51 docs\n", + "Topic 671: 50 docs\n", + "Topic 672: 50 docs\n", + "Topic 
#@title Check the amount of documents
# Pull the per-topic document counts from the trained Top2Vec model
# (topics are returned sorted by size, largest first).
topic_sizes, topic_nums = model.get_topic_sizes()

print("The amount of docs in every topic:")
print("-" * 30)
for topic_num, doc_count in zip(topic_nums, topic_sizes):
    print(f"Topic {topic_num}: {doc_count} docs")
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IwQ7RxogYHif", + "outputId": "4e8f0d7d-ccd4-42a7-ba9e-8aa67079e7b6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic #245 (Score: 0.18):\n", + "--------------------------------------------------\n", + "clustering: 0.28\n", + "crops: 0.28\n", + "harvesting: 0.26\n", + "crop: 0.26\n", + "predictors: 0.25\n", + "--------------------------------------------------\n", + "Topic #201 (Score: 0.17):\n", + "--------------------------------------------------\n", + "landsat: 0.37\n", + "clustering: 0.37\n", + "dataset: 0.34\n", + "datasets: 0.33\n", + "triangulation: 0.33\n", + "--------------------------------------------------\n", + "Topic #285 (Score: 0.16):\n", + "--------------------------------------------------\n", + "drones: 0.52\n", + "uavs: 0.49\n", + "drone: 0.46\n", + "quadcopters: 0.45\n", + "quadcopter: 0.44\n", + "--------------------------------------------------\n", + "Topic #497 (Score: 0.16):\n", + "--------------------------------------------------\n", + "deforestation: 0.33\n", + "forests: 0.32\n", + "landsat: 0.32\n", + "trees: 0.29\n", + "clustering: 0.27\n", + "--------------------------------------------------\n", + "Topic #286 (Score: 0.15):\n", + "--------------------------------------------------\n", + "triangulation: 0.40\n", + "voronoi: 0.37\n", + "geospatial: 0.34\n", + "roadways: 0.34\n", + "polyline: 0.33\n", + "--------------------------------------------------\n", + "Topic #514 (Score: 0.14):\n", + "--------------------------------------------------\n", + "agricultural: 0.31\n", + "agriculture: 0.28\n", + "crops: 0.27\n", + "farming: 0.27\n", + "farmers: 0.27\n", + "--------------------------------------------------\n", + "Topic #666 (Score: 0.14):\n", + "--------------------------------------------------\n", + "seed: 0.46\n", + 
"seeded: 0.44\n", + "seeds: 0.42\n", + "seeding: 0.38\n", + "randomized: 0.35\n", + "--------------------------------------------------\n", + "Topic #639 (Score: 0.13):\n", + "--------------------------------------------------\n", + "robbers: 0.35\n", + "graphs: 0.33\n", + "graph: 0.32\n", + "heuristic: 0.31\n", + "asymptotically: 0.31\n", + "--------------------------------------------------\n", + "Topic #371 (Score: 0.13):\n", + "--------------------------------------------------\n", + "classifiers: 0.30\n", + "clustering: 0.30\n", + "capturing: 0.30\n", + "morphological: 0.28\n", + "bayesian: 0.28\n", + "--------------------------------------------------\n", + "Topic #373 (Score: 0.13):\n", + "--------------------------------------------------\n", + "simulators: 0.52\n", + "simulation: 0.50\n", + "simulations: 0.49\n", + "simulator: 0.48\n", + "simulating: 0.47\n", + "--------------------------------------------------\n", + "Topic #100 (Score: 0.13):\n", + "--------------------------------------------------\n", + "uavs: 0.43\n", + "uav: 0.37\n", + "quadcopters: 0.35\n", + "uplink: 0.33\n", + "unmanned: 0.33\n", + "--------------------------------------------------\n", + "Topic #120 (Score: 0.13):\n", + "--------------------------------------------------\n", + "forecasting: 0.41\n", + "extrapolation: 0.35\n", + "bayesian: 0.34\n", + "overfitting: 0.34\n", + "predictors: 0.34\n", + "--------------------------------------------------\n", + "Topic #453 (Score: 0.13):\n", + "--------------------------------------------------\n", + "solar: 0.41\n", + "photovoltaic: 0.35\n", + "kwh: 0.34\n", + "renewables: 0.34\n", + "forecasting: 0.32\n", + "--------------------------------------------------\n", + "Topic #348 (Score: 0.13):\n", + "--------------------------------------------------\n", + "hdr: 0.45\n", + "postprocessing: 0.36\n", + "downsampling: 0.34\n", + "landsat: 0.33\n", + "dithering: 0.32\n", + "--------------------------------------------------\n", + "Topic #336 
#@title Get topic Details

def print_search_topics_details(topic_words, word_scores, topic_scores, topic_nums):
    """
    Print details of topics found by keyword search and return their topic numbers.

    :param topic_words: List of word lists, one per topic.
    :param word_scores: List of word-to-topic similarity score lists, one per topic.
    :param topic_scores: Relevance score of each topic to the search keywords.
    :param topic_nums: Unique indexes of the topics.
    :return: List of unique indexes of the found topics.
    """
    found_topic_nums = []  # collect topic numbers in the order reported
    for words, scores, topic_score, topic_num in zip(
            topic_words, word_scores, topic_scores, topic_nums):
        print(f"Topic #{topic_num} (Score: {topic_score:.2f}):")
        print("-" * 50)
        # Show only the five strongest words for each topic
        for word, word_score in zip(words[:5], scores[:5]):
            print(f"{word}: {word_score:.2f}")
        print("-" * 50)
        found_topic_nums.append(topic_num)
    return found_topic_nums

# Keywords to search for topics
keywords = [
    "uav", "precision", "agriculture", "crop", "water",
    "farming", "landscapes", "land", "monitoring", "mapping",
]

# Search for topics by keywords
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(
    keywords=keywords, num_topics=20)

# Get and print the list of relevant topics
relevant_topics = print_search_topics_details(
    topic_words, word_scores, topic_scores, topic_nums)

# Print the list of numbers of relevant topics
print("List of 20 relevant topics:", relevant_topics)
\"pollution\", \"air\", \"sensors\", \"environmental\", \"monitoring\", \"optical\", \"atmospheric\",\"emissions\"\n", + "# Household waste = \"city\", \"dump\", \"emissions\", \"safety\", \"remote\", \"sensing\", \"garbage\",\"household\", \"waste\", \"recycling\"\n", + "# Infrared and thermal mapping = \"population\", \"aerial\", \"surveys\", \"thermal\", \"remote\",\"sensing\", \"infrared\",\"mapping\"\n", + "# Agricultural Mapping and Surveying = \"precision\", \"agriculture\", \"crop\", \"water\",\"farming\", \"landscapes\",\"land\",\"monitoring\",\"mapping\"\n", + "\n", + "# uav_disasters_emergency = [591, 385, 577, 120, 402, 292, 444, 285, 178, 497, 100, 369, 176, 211, 373, 355, 572, 685, 201, 453]\n", + "# uav_water_pollution = [285, 176, 258, 116, 603, 111, 143, 595, 226, 455, 485, 92, 457, 88, 100, 360, 172, 371, 358, 639]\n", + "# uav_air_pollution = [455, 285, 176, 448, 100, 258, 485, 402, 603, 111, 577, 358, 661, 286, 453, 678, 629, 497, 489, 572]\n", + "# uav_household_waste = [661, 285, 455, 100, 620, 176, 676, 448, 402, 348, 642, 595, 178, 371, 123, 387, 286, 66, 258, 201]\n", + "# uav_infrared_thermal = [285, 176, 100, 603, 116, 642, 342, 172, 358, 143, 54, 508, 457, 201, 286, 371, 485, 88, 111, 452]\n", + "# uav_agriculture_mapping = [245, 201, 285, 497, 286, 514, 666, 639, 371, 373, 100, 120, 453, 348, 336, 176, 480, 572, 402, 54]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Trb-oUtm9dhp", + "outputId": "72eacfaf-bd22-492e-aea6-e5711468e4d1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique topics\n", + "Disasters & Emergency Topics: [591, 385, 292, 444, 369, 211, 355, 685]\n", + "Air Pollution Topics: [678, 629, 489]\n", + "Water Pollution Topics: [226, 92, 360]\n", + "Household Waste Topics: [620, 676, 123, 387, 66]\n", + "Infrared & Thermal Topics: [342, 508, 452]\n", + 
#@title Unique topics
from collections import Counter

# Top-20 topic numbers returned by the keyword searches for each theme.
uav_disasters_emergency = [591, 385, 577, 120, 402, 292, 444, 285, 178, 497, 100, 369, 176, 211, 373, 355, 572, 685, 201, 453]
uav_water_pollution = [285, 176, 258, 116, 603, 111, 143, 595, 226, 455, 485, 92, 457, 88, 100, 360, 172, 371, 358, 639]
uav_air_pollution = [455, 285, 176, 448, 100, 258, 485, 402, 603, 111, 577, 358, 661, 286, 453, 678, 629, 497, 489, 572]
uav_household_waste = [661, 285, 455, 100, 620, 176, 676, 448, 402, 348, 642, 595, 178, 371, 123, 387, 286, 66, 258, 201]
uav_infrared_thermal = [285, 176, 100, 603, 116, 642, 342, 172, 358, 143, 54, 508, 457, 201, 286, 371, 485, 88, 111, 452]
uav_agriculture_mapping = [245, 201, 285, 497, 286, 514, 666, 639, 371, 373, 100, 120, 453, 348, 336, 176, 480, 572, 402, 54]

# Combining all lists into one for further analysis
all_topics = (uav_disasters_emergency + uav_air_pollution + uav_water_pollution
              + uav_household_waste + uav_infrared_thermal + uav_agriculture_mapping)

# Count every topic once (O(n)) instead of calling list.count() per element (O(n^2)).
_topic_counts = Counter(all_topics)

def _unique_topics(group):
    """Topics of `group` that occur in exactly one thematic list (order preserved)."""
    return [topic for topic in group if _topic_counts[topic] == 1]

# Creating a list of unique topics for each group
unique_disasters_emergency = _unique_topics(uav_disasters_emergency)
unique_air_pollution = _unique_topics(uav_air_pollution)
unique_water_pollution = _unique_topics(uav_water_pollution)
unique_household_waste = _unique_topics(uav_household_waste)
unique_infrared_thermal = _unique_topics(uav_infrared_thermal)
unique_agriculture_mapping = _unique_topics(uav_agriculture_mapping)

# Printing unique lists of topics
print("Unique topics")
print("Disasters & Emergency Topics:", unique_disasters_emergency)
print("Air Pollution Topics:", unique_air_pollution)
print("Water Pollution Topics:", unique_water_pollution)
print("Household Waste Topics:", unique_household_waste)
print("Infrared & Thermal Topics:", unique_infrared_thermal)
print("Agriculture Mapping Topics:", unique_agriculture_mapping)

#@title Intersecting topics
#TODO find a way to use this information
# Collecting all lists into a dictionary for easier processing
topics_groups_uav = {
    "disasters_emergency": uav_disasters_emergency,
    "air_pollution": uav_air_pollution,
    "water_pollution": uav_water_pollution,
    "household_waste": uav_household_waste,
    "infrared_thermal": uav_infrared_thermal,
    "agriculture_mapping": uav_agriculture_mapping
}

# Creating a list of all topics (same concatenation order as above)
all_topics = sum(topics_groups_uav.values(), [])

# Topics that occur more than once, i.e. intersect between thematic groups
intersecting_topics = {topic for topic, count in Counter(all_topics).items() if count > 1}

print("Intersecting topics:", intersecting_topics)
The function then assigns topic groups, filters the DataFrame based on the specified topic numbers and date range, and returns the filtered DataFrame. This makes it more flexible and reusable for different sets of topic groups and date ranges." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 487 + }, + "id": "59XZUxOf007H", + "outputId": "707bd8bc-4ed7-4c58-d695-2a231e792c01" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"df_filtered\",\n \"rows\": 10556,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10523,\n \"samples\": [\n \"predicting nuclear masses with product-unit networks\",\n \"exploring consequential robot sound: should we make robots quiet and kawaii-et?\",\n \"kalman's shrinkage for wavelet-based despeckling of sar images\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10527,\n \"samples\": [\n \"2304.11255\",\n \"2203.16756\",\n \"2010.07260\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"abstract\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10522,\n \"samples\": [\n \"camera sensor noise is one of the most reliable device characteristics indigital image forensics, enabling the unique linkage of images to digitalcameras. this so-called camera fingerprint gives rise to differentapplications, such as image forensics and authentication. however, if imagesare publicly available, an adversary can estimate the fingerprint from hervictim and plant it into spurious images. 
the concept of fragile camerafingerprints addresses this attack by exploiting asymmetries in data access:while the camera owner will always have access to a full fingerprint fromuncompressed images, the adversary has typically access to compressed imagesand thus only to a truncated fingerprint. the security of this defense,however, has not been systematically explored yet. this paper provides thefirst comprehensive analysis of fragile camera fingerprints under attack. aseries of theoretical and practical tests demonstrate that fragile camerafingerprints allow a reliable device identification for common compressionlevels in an adversarial environment.\",\n \"we present a system for generating inconspicuous-looking textures that, whendisplayed in the physical world as digital or printed posters, cause visualobject tracking systems to become confused. for instance, as a target beingtracked by a robot's camera moves in front of such a poster, our generatedtexture makes the tracker lock onto it and allows the target to evade. thiswork aims to fool seldom-targeted regression tasks, and in particular comparesdiverse optimization strategies: non-targeted, targeted, and a new family ofguided adversarial losses. while we use the expectation over transformation(eot) algorithm to generate physical adversaries that fool tracking models whenimaged under diverse conditions, we compare the impacts of differentconditioning variables, including viewpoint, lighting, and appearances, to findpractical attack setups with high resulting adversarial strength andconvergence speed. we further showcase textures optimized solely usingsimulated scenes can confuse real-world tracking systems.\",\n \"dynamic allocation of resources to the \\\\emph{best} link in large multiusernetworks offers considerable improvement in spectral efficiency. 
this gain,often referred to as \\\\emph{multiuser diversity gain}, can be cast asdouble-logarithmic growth of the network throughput with the number of users.in this paper we consider large cognitive networks granted concurrent spectrumaccess with license-holding users. the primary network affords to share itsunder-utilized spectrum bands with the secondary users. we assess the optimalmultiuser diversity gain in the cognitive networks by quantifying how thesum-rate throughput of the network scales with the number of secondary users.for this purpose we look at the optimal pairing of spectrum bands and secondaryusers, which is supervised by a central entity fully aware of the instantaneouschannel conditions, and show that the throughput of the cognitive networkscales double-logarithmically with the number of secondary users ($n$) andlinearly with the number of available spectrum bands ($m$), i.e., $m\\\\log\\\\logn$. we then propose a \\\\emph{distributed} spectrum allocation scheme, which doesnot necessitate a central controller or any information exchange betweendifferent secondary users and still obeys the optimal throughput scaling law.this scheme requires that \\\\emph{some} secondary transmitter-receiver pairsexchange $\\\\log m$ information bits among themselves. we also show that theaggregate amount of information exchange between secondary transmitter-receiverpairs is {\\\\em asymptotically} equal to $m\\\\log m$. 
finally, we show that ourdistributed scheme guarantees fairness among the secondary users, meaning thatthey are equally likely to get access to an available spectrum band.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"categories\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1762,\n \"samples\": [\n \"eess.sy cs.na cs.sy math.na\",\n \"cs.ce stat.ml\",\n \"stat.ml cs.it math.it\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"doi\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2310,\n \"samples\": [\n \"10.1016/j.cpc.2022.108406\",\n \"10.1007/s00500-017-2516-8\",\n \"10.1016/j.cmpb.2019.105004\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"created\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2010-01-02 00:00:00\",\n \"max\": \"2023-12-28 00:00:00\",\n \"num_unique_values\": 3606,\n \"samples\": [\n \"2023-03-28 00:00:00\",\n \"2016-02-11 00:00:00\",\n \"2019-10-26 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"updated\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2010-02-03 00:00:00\",\n \"max\": \"2023-12-29 00:00:00\",\n \"num_unique_values\": 2021,\n \"samples\": [\n \"2018-05-08 00:00:00\",\n \"2021-08-04 00:00:00\",\n \"2018-05-24 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"authors\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 9949,\n \"samples\": [\n \"['lin', 'terejanu']\",\n \"['schoder', 'roppert']\",\n \"['dey', 'hou']\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"url\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10527,\n \"samples\": [\n \"https://arxiv.org/abs/2304.11255\",\n \"https://arxiv.org/abs/2203.16756\",\n \"https://arxiv.org/abs/2010.07260\"\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"abstract_length\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 348,\n \"min\": 321,\n \"max\": 3051,\n \"num_unique_values\": 1518,\n \"samples\": [\n 1265,\n 638,\n 1676\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id_n\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 159407,\n \"min\": 712,\n \"max\": 555424,\n \"num_unique_values\": 10556,\n \"samples\": [\n 97863,\n 417628,\n 428602\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"topic_num\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 162,\n \"min\": 66,\n \"max\": 685,\n \"num_unique_values\": 27,\n \"samples\": [\n 226,\n 336,\n 342\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"topic_representation\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 27,\n \"samples\": [\n \"raytracing, downsampling, opencv, stm, computationally\",\n \"calibration, calibrated, calibrating, calibrate, opencv\",\n \"haptic, simulated, gyroscopes, vr, haptics\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"topic_group\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Household Waste\",\n \"Water Pollution\",\n \"Air Pollution\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"year\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 2010,\n \"max\": 2023,\n \"num_unique_values\": 14,\n \"samples\": [\n 2019,\n 2021,\n 2010\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "df_filtered" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleidabstractcategoriesdoicreatedupdatedauthorsurlabstract_lengthid_ntopic_numtopic_representationtopic_groupyear
712pseudorandomness in central force optimization1001.0317central force optimization is a deterministic ...cs.ohnan2010-01-022010-02-03['formato']https://arxiv.org/abs/1001.0317108971266bayesian, kalman, gaussian, filtering, laplacianHousehold Waste2010
772construction of wiretap codes from ordinary ch...1001.1197from an arbitrary given channel code over a di...cs.it cs.cr math.it10.1109/isit.2010.55137942010-01-08NaT['hayashi', 'matsumoto']https://arxiv.org/abs/1001.1197335772123cryptography, rssi, transceiver, ciphers, deco...Household Waste2010
834a new method to extract dorsal hand vein patte...1001.1966among all biometric, dorsal hand vein pattern ...cs.cv cs.crnan2010-01-12NaT['khan', 'khan']https://arxiv.org/abs/1001.1966656834360fingerprint, fingerprinting, fingerprints, bio...Water Pollution2010
855message detection and extraction of chaotic op...1001.2060the security of chaotic optical communication ...cs.crnan2010-01-122010-03-18['zhao', 'yin']https://arxiv.org/abs/1001.2060639855123cryptography, rssi, transceiver, ciphers, deco...Household Waste2010
869towards a generic framework to generate explan...1001.2188in this report, we show how to use the simple ...cs.plnan2010-01-13NaT['deransart', 'oliveira']https://arxiv.org/abs/1001.218875286992tracking, tracker, tracked, triangulation, com...Water Pollution2010
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " title id \\\n", + "712 pseudorandomness in central force optimization 1001.0317 \n", + "772 construction of wiretap codes from ordinary ch... 1001.1197 \n", + "834 a new method to extract dorsal hand vein patte... 1001.1966 \n", + "855 message detection and extraction of chaotic op... 1001.2060 \n", + "869 towards a generic framework to generate explan... 1001.2188 \n", + "\n", + " abstract categories \\\n", + "712 central force optimization is a deterministic ... cs.oh \n", + "772 from an arbitrary given channel code over a di... cs.it cs.cr math.it \n", + "834 among all biometric, dorsal hand vein pattern ... cs.cv cs.cr \n", + "855 the security of chaotic optical communication ... cs.cr \n", + "869 in this report, we show how to use the simple ... cs.pl \n", + "\n", + " doi created updated \\\n", + "712 nan 2010-01-02 2010-02-03 \n", + "772 10.1109/isit.2010.5513794 2010-01-08 NaT \n", + "834 nan 2010-01-12 NaT \n", + "855 nan 2010-01-12 2010-03-18 \n", + "869 nan 2010-01-13 NaT \n", + "\n", + " authors url \\\n", + "712 ['formato'] https://arxiv.org/abs/1001.0317 \n", + "772 ['hayashi', 'matsumoto'] https://arxiv.org/abs/1001.1197 \n", + "834 ['khan', 'khan'] https://arxiv.org/abs/1001.1966 \n", + "855 ['zhao', 'yin'] https://arxiv.org/abs/1001.2060 \n", + "869 ['deransart', 'oliveira'] https://arxiv.org/abs/1001.2188 \n", + "\n", + " abstract_length id_n topic_num \\\n", + "712 1089 712 66 \n", + "772 335 772 123 \n", + "834 656 834 360 \n", + "855 639 855 123 \n", + "869 752 869 92 \n", + "\n", + " topic_representation topic_group year \n", + "712 bayesian, kalman, gaussian, filtering, laplacian Household Waste 2010 \n", + "772 cryptography, rssi, transceiver, ciphers, deco... Household Waste 2010 \n", + "834 fingerprint, fingerprinting, fingerprints, bio... Water Pollution 2010 \n", + "855 cryptography, rssi, transceiver, ciphers, deco... Household Waste 2010 \n", + "869 tracking, tracker, tracked, triangulation, com... 
#@title Assigning Topic Groups to Filtered DataFrame

# Define topic groups (the per-theme unique topics selected above)
topic_groups = {
    'Disasters & Emergency': [591, 385, 292, 444, 369, 211, 355, 685],
    'Air Pollution': [678, 629, 489],
    'Water Pollution': [226, 92, 360],
    'Household Waste': [620, 676, 123, 387, 66],
    'Infrared & Thermal': [342, 508, 452],
    'Agriculture Mapping': [245, 514, 666, 336, 480]
}

# Reverse lookup built once: topic number -> group name.  The group lists are
# disjoint, so the mapping is unambiguous; this replaces a linear scan over
# every group for every row.
_topic_to_group = {topic: group for group, topics in topic_groups.items() for topic in topics}

def assign_topic_group(topic_num):
    """Return the thematic group name for a topic number, or 'Other' if unmapped."""
    return _topic_to_group.get(topic_num, 'Other')

# Convert 'created' column to datetime format (mutates the notebook-global df)
df['created'] = pd.to_datetime(df['created'])

# Combine all topics into a single list for filtering
all_topics = sum(topic_groups.values(), [])

# Filter the DataFrame based on topic numbers and date range, and create a copy
# so later column assignments cannot trigger SettingWithCopyWarning.
df_filtered = df[(df['topic_num'].isin(all_topics)) &
                 (df['created'] >= '2010-01-01') &
                 (df['created'] <= '2023-12-31')].copy()

# Assign topic groups (O(1) dict lookup per row) and extract the year
df_filtered['topic_group'] = df_filtered['topic_num'].map(assign_topic_group)
df_filtered['year'] = df_filtered['created'].dt.year

# Display the first few rows of the filtered DataFrame
df_filtered.head()
We calculate the yearly change in the number of publications for each topic, the acceleration of growth, and the relative growth.\n", + "\n", + "We then identify the top 5 topics with the greatest increase and decrease in interest, as well as low-volume but fast-growing topic groups.\n", + "\n", + "The results are stored in a dictionary for further analysis and visualizations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "M4nUW-yEt0mS", + "outputId": "397dfe1c-21eb-488b-ee69-d28cbb495968" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top 5 topics with the greatest increase in interest:\n", + "topic_num\n", + "92 179.0\n", + "66 157.0\n", + "211 143.0\n", + "245 137.0\n", + "226 130.0\n", + "dtype: float64\n", + "\n", + "Top 5 topics with the greatest decrease in interest:\n", + "topic_num\n", + "452 -10.0\n", + "685 1.0\n", + "676 4.0\n", + "620 8.0\n", + "666 8.0\n", + "dtype: float64\n", + "\n", + "Low-volume, fast-growing topic groups:\n", + "topic_num\n", + "678 9.0\n", + "676 4.0\n", + "685 1.0\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "#@title Analyzing Topic Growth and Decline\n", + "\n", + "# Count the number of publications for each topic by year\n", + "publications_per_topic_year = df_filtered.groupby(['topic_num', 'year']).size().unstack(fill_value=0)\n", + "\n", + "# Calculate the yearly change in the number of publications for each topic (growth rate)\n", + "growth_per_topic = publications_per_topic_year.diff(axis=1)\n", + "\n", + "# Calculate the acceleration of growth (change in growth rate)\n", + "acceleration_per_topic = growth_per_topic.diff(axis=1)\n", + "\n", + "# Calculate the relative growth for each year\n", + "relative_growth_per_topic = growth_per_topic / publications_per_topic_year.shift(1)\n", + "\n", + "# Sum the changes to assess the overall increase/decrease in interest in topics over 
the entire period\n", + "total_growth_per_topic = growth_per_topic.sum(axis=1)\n", + "\n", + "# Topics with the greatest increase in interest\n", + "top_growing_topics = total_growth_per_topic.nlargest(5)\n", + "\n", + "# Topics with the greatest decrease in interest\n", + "top_declining_topics = total_growth_per_topic.nsmallest(5)\n", + "\n", + "# Identify low-volume but fast-growing topic groups\n", + "volume_threshold = 50\n", + "low_volume_topics = publications_per_topic_year[publications_per_topic_year.sum(axis=1) < volume_threshold].index\n", + "low_volume_fast_growing_topics = total_growth_per_topic[low_volume_topics].nlargest(5)\n", + "\n", + "# Store the results in a dictionary or DataFrame for further analysis or visualization\n", + "analysis_results = {\n", + " 'top_growing_topics': top_growing_topics,\n", + " 'top_declining_topics': top_declining_topics,\n", + " 'low_volume_fast_growing_topics': low_volume_fast_growing_topics,\n", + " # Add other metrics here as needed\n", + "}\n", + "\n", + "# Display the results\n", + "print(\"Top 5 topics with the greatest increase in interest:\")\n", + "print(top_growing_topics)\n", + "print(\"\\nTop 5 topics with the greatest decrease in interest:\")\n", + "print(top_declining_topics)\n", + "print(\"\\nLow-volume, fast-growing topic groups:\")\n", + "print(low_volume_fast_growing_topics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qF4mnY2T0zHO", + "outputId": "423a6b19-c9d9-49d0-e790-259723635277" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Topic num: 66, Count: 1835, Group: Household Waste\n", + "Topic num: 123, Count: 1065, Group: Household Waste\n", + "Topic num: 360, Count: 334, Group: Water Pollution\n", + "Topic num: 92, Count: 1378, Group: Water Pollution\n", + "Topic num: 452, Count: 216, Group: Infrared & Thermal\n", + "Topic num: 369, Count: 320, Group: Disasters 
& Emergency\n", + "Topic num: 292, Count: 458, Group: Disasters & Emergency\n", + "Topic num: 245, Count: 571, Group: Agriculture Mapping\n", + "Topic num: 226, Count: 616, Group: Water Pollution\n", + "Topic num: 342, Count: 367, Group: Infrared & Thermal\n", + "Topic num: 591, Count: 117, Group: Disasters & Emergency\n", + "Topic num: 385, Count: 300, Group: Disasters & Emergency\n", + "Topic num: 387, Count: 298, Group: Household Waste\n", + "Topic num: 336, Count: 381, Group: Agriculture Mapping\n", + "Topic num: 678, Count: 42, Group: Air Pollution\n", + "Topic num: 508, Count: 176, Group: Infrared & Thermal\n", + "Topic num: 676, Count: 42, Group: Household Waste\n", + "Topic num: 489, Count: 186, Group: Air Pollution\n", + "Topic num: 211, Count: 643, Group: Disasters & Emergency\n", + "Topic num: 685, Count: 38, Group: Disasters & Emergency\n", + "Topic num: 514, Count: 170, Group: Agriculture Mapping\n", + "Topic num: 629, Count: 89, Group: Air Pollution\n", + "Topic num: 666, Count: 52, Group: Agriculture Mapping\n", + "Topic num: 355, Count: 346, Group: Disasters & Emergency\n", + "Topic num: 480, Count: 195, Group: Agriculture Mapping\n", + "Topic num: 620, Count: 93, Group: Household Waste\n", + "Topic num: 444, Count: 228, Group: Disasters & Emergency\n" + ] + } + ], + "source": [ + "# Create a dictionary to store the counts and groups\n", + "topic_info = {}\n", + "\n", + "# Loop through each row in the dataframe\n", + "for index, row in df_filtered.iterrows():\n", + " # Get the topic_num and topic_group values\n", + " topic_num = row[\"topic_num\"]\n", + " topic_group = row[\"topic_group\"]\n", + "\n", + " # If the topic_num is not already in the dictionary, add it with a count of 1 and the topic_group\n", + " if topic_num not in topic_info:\n", + " topic_info[topic_num] = {\"count\": 1, \"group\": topic_group}\n", + " # Otherwise, increment the count for that topic_num\n", + " else:\n", + " topic_info[topic_num][\"count\"] += 1\n", + "\n", + "# 
Print the topic_num counts and groups\n", + "for topic_num, info in topic_info.items():\n", + " print(f\"Topic num: {topic_num}, Count: {info['count']}, Group: {info['group']}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LFoWxKPl0a4_", + "outputId": "bf77d126-83dc-4c7d-9197-46c25d57ec66" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(10556, 15)" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_filtered.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gsbwDpotzUrY", + "outputId": "0f7aba53-fcf1-4e95-d6bf-fd95c7d59fcc" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Insights for Topic #92:\n", + "--------------------------------------------------\n", + "Top Keywords:\n", + "the: 11147\n", + "and: 6288\n", + "of: 5793\n", + "a: 5170\n", + "to: 5169\n", + "in: 4485\n", + "tracking: 3317\n", + "we: 2814\n", + "is: 2569\n", + "for: 2336\n", + "\n", + "Year Distribution:\n", + "2010 5\n", + "2011 8\n", + "2012 22\n", + "2013 14\n", + "2014 19\n", + "2015 44\n", + "2016 53\n", + "2017 110\n", + "2018 119\n", + "2019 169\n", + "2020 212\n", + "2021 212\n", + "2022 207\n", + "2023 184\n", + "Name: year, dtype: int64\n", + "\n", + "Top Authors:\n", + "'wang': 158\n", + "'li': 134\n", + "'zhang': 122\n", + "'liu': 80\n", + "'yang': 66\n", + "\n", + "Summary Statistics:\n", + "Total Papers: 1378\n", + "Publication Years: 2010 - 2023\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "import pandas as pd\n", + "\n", + "# Selecting a topic for analysis\n", + "topic_num_to_analyze = 92\n", + "\n", + "# Filtering the DataFrame for the selected topic\n", + "topic_info = 
df_filtered[df_filtered['topic_num'] == topic_num_to_analyze]\n", + "\n", + "# Displaying aggregated information about the selected topic\n", + "print(f\"Insights for Topic #{topic_num_to_analyze}:\")\n", + "print(\"-\" * 50)\n", + "\n", + "# Top Keywords\n", + "top_keywords = Counter(\" \".join(topic_info['abstract']).split()).most_common(10)\n", + "print(\"Top Keywords:\")\n", + "for keyword, count in top_keywords:\n", + " print(f\"{keyword}: {count}\")\n", + "\n", + "# Year Distribution\n", + "year_distribution = topic_info['year'].value_counts().sort_index()\n", + "print(\"\\nYear Distribution:\")\n", + "print(year_distribution)\n", + "\n", + "# Top Authors (assuming a 'authors' column exists)\n", + "top_authors = Counter(\", \".join(topic_info['authors']).split(\", \")).most_common(5)\n", + "print(\"\\nTop Authors:\")\n", + "for author, count in top_authors:\n", + " print(f\"{author}: {count}\")\n", + "\n", + "# Summary Statistics\n", + "print(\"\\nSummary Statistics:\")\n", + "print(f\"Total Papers: {len(topic_info)}\")\n", + "print(f\"Publication Years: {topic_info['year'].min()} - {topic_info['year'].max()}\")\n", + "\n", + "print(\"-\" * 50)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "P2gbc3JfaT6P", + "outputId": "f331c0d8-6bf5-4962-cbd5-433121c1aa45" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Analysis for Thematic Group: Disasters & Emergency\n", + "| Year | Number of Publications | Growth Acceleration | Change in Number of Publications | Relative Growth |\n", + "|-------:|-------------------------:|----------------------:|-----------------------------------:|:------------------|\n", + "| 2010 | 19 | 0 | 0 | 0.0% |\n", + "| 2011 | 15 | -4 | -4 | -21.05% |\n", + "| 2012 | 28 | 17 | 13 | 86.67% |\n", + "| 2013 | 38 | -3 | 10 | 35.71% |\n", + "| 2014 | 28 | -20 | -10 | -26.32% |\n", + "| 2015 | 47 | 29 | 19 | 
67.86% |\n", + "| 2016 | 63 | -3 | 16 | 34.04% |\n", + "| 2017 | 94 | 15 | 31 | 49.21% |\n", + "| 2018 | 173 | 48 | 79 | 84.04% |\n", + "| 2019 | 266 | 14 | 93 | 53.76% |\n", + "| 2020 | 337 | -22 | 71 | 26.69% |\n", + "| 2021 | 380 | -28 | 43 | 12.76% |\n", + "| 2022 | 453 | 30 | 73 | 19.21% |\n", + "| 2023 | 509 | -17 | 56 | 12.36% |\n" + ] + } + ], + "source": [ + "from tabulate import tabulate\n", + "\n", + "#@title Analyzing a specific thematic group\n", + "group_name_to_analyze = \"Disasters & Emergency\" # Replace with the name of the group you are interested in\n", + "\n", + "# Get the topic numbers for the selected thematic group\n", + "topic_nums = topic_groups[group_name_to_analyze]\n", + "\n", + "# Filter data for the selected group\n", + "group_data = df_filtered[df_filtered['topic_num'].isin(topic_nums)]\n", + "\n", + "# Aggregate data by year for the selected group\n", + "group_publications_per_year = group_data.groupby('year')['topic_num'].count()\n", + "group_growth = group_publications_per_year.diff().fillna(0)\n", + "group_relative_growth = (group_growth / group_publications_per_year.shift(1) * 100).fillna(0)\n", + "\n", + "# Create a DataFrame with the dynamics analysis for the selected group\n", + "group_analysis = pd.DataFrame({\n", + " 'Year': group_publications_per_year.index,\n", + " 'Number of Publications': group_publications_per_year.values,\n", + " 'Growth Acceleration': group_growth.diff().fillna(0).round(2), # Adding growth acceleration\n", + " 'Change in Number of Publications': group_growth.values,\n", + " 'Relative Growth': group_relative_growth.round(2).astype(str) + '%' # Rounding and formatting relative growth\n", + "}).set_index('Year')\n", + "\n", + "# Display the analysis for the selected thematic group\n", + "print(f\"Analysis for Thematic Group: {group_name_to_analyze}\")\n", + "print(tabulate(group_analysis, headers='keys', tablefmt='pipe', showindex=True))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "crZpBYoDsJZi", + "outputId": "e55b26c1-926c-4e56-e2a3-6e31ad0a92fc" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Household Waste' 'Water Pollution' 'Infrared & Thermal'\n", + " 'Disasters & Emergency' 'Agriculture Mapping' 'Air Pollution']\n" + ] + } + ], + "source": [ + "# Get unique values of the \"topic_group\" column\n", + "unique_values = df_filtered[\"topic_group\"].unique()\n", + "\n", + "print(unique_values)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kbtsUUvgLYPl", + "outputId": "aa35f38c-9797-4482-95a1-ca53d07590ff" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Thematic Group: Water Pollution\n", + "Total Publications: 2328\n", + "Total Growth: 339.0\n", + "Average Annual Growth: 26.08\n" + ] + } + ], + "source": [ + "# Selecting the thematic group for analysis\n", + "group_to_analyze = \"Water Pollution\"\n", + "\n", + "# Extracting data for the selected thematic group\n", + "group_data = df_filtered[df_filtered['topic_group'] == group_to_analyze]\n", + "\n", + "# Calculating the number of publications per year for the selected group\n", + "group_publications_per_year = group_data.groupby('year').size()\n", + "\n", + "# Calculating the total number of publications and the total growth over the entire period\n", + "total_publications = group_publications_per_year.sum()\n", + "total_growth = group_publications_per_year.diff().sum()\n", + "\n", + "# Calculating the average annual growth\n", + "average_annual_growth = total_growth / (group_publications_per_year.index.max() - group_publications_per_year.index.min())\n", + "\n", + "# Printing the summary statistics\n", + "print(f\"Thematic Group: {group_to_analyze}\")\n", + "print(f\"Total Publications: {total_publications}\")\n", + 
"print(f\"Total Growth: {total_growth}\")\n", + "print(f\"Average Annual Growth: {average_annual_growth:.2f}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 559 + }, + "id": "52_qVvXzbtwy", + "outputId": "9511ceb0-339a-4e8f-e82b-c47bc21395d8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trend Analysis for Topic #92 (Water Pollution):\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#@title Visualization of the topic trend analysis\n", + "import plotly.graph_objects as go\n", + "import numpy as np\n", + "\n", + "def visualize_topic_trend_plotly(topic_num, topic_group):\n", + " # Extract data for the specific topic\n", + " publications = publications_per_topic_year.loc[topic_num]\n", + " changes = growth_per_topic.loc[topic_num]\n", + " relative_growth = relative_growth_per_topic.loc[topic_num] * 100 # Convert to percentage\n", + "\n", + " # Replace NaN values with zeros\n", + " relative_growth = relative_growth.fillna(0)\n", + "\n", + " years = publications.index\n", + "\n", + " # Create a plot for the number of publications\n", + " fig = go.Figure()\n", + " fig.add_trace(go.Scatter(x=years, y=publications, mode='lines+markers', name='Number of Publications', marker=dict(size=8, color='blue')))\n", + "\n", + " # Create a plot for the change in the number of publications\n", + " fig.add_trace(go.Bar(x=years, y=changes, name='Change in Number of Publications', marker_color='orange', opacity=0.6))\n", + "\n", + " # Create a plot for the relative growth\n", + " fig.add_trace(go.Scatter(x=years, y=relative_growth, mode='lines', name='Relative Growth (%)', yaxis='y2', line=dict(color='green', width=2, dash='dash')))\n", + "\n", + " # Customize the layout\n", + " fig.update_layout(\n", + " title=f'Trend Analysis for Topic {topic_num} ({topic_group})',\n", + " xaxis_title='Year',\n", + " yaxis_title='Number of Publications',\n", + " yaxis2=dict(title='Relative Growth (%)', overlaying='y', side='right', range=[-100, 100]),\n", + " legend=dict(x=1.05, y=1, traceorder='reversed', font_size=16),\n", + " barmode='overlay',\n", + " template='plotly_white'\n", + " )\n", + "\n", + " # Show the plot\n", + " fig.show()\n", + "\n", + "# Example usage\n", + "topic_num_to_analyze = 92\n", + "topic_group = df_filtered[df_filtered['topic_num'] == 
topic_num_to_analyze]['topic_group'].iloc[0]\n", + "print(f\"Trend Analysis for Topic #{topic_num_to_analyze} ({topic_group}):\")\n", + "visualize_topic_trend_plotly(topic_num_to_analyze, topic_group)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "id": "_P3gD_opD4Eg", + "outputId": "94765d35-f1e8-42a5-91e5-d5558af0bc49" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#@title Visualization of the thematic group trend analysis\n", + "def visualize_topic_trend_plotly(group_name, df_filtered, topics_groups):\n", + " # Filter the dataframe for the selected topic group\n", + " topic_nums = topics_groups[group_name]\n", + " df_group = df_filtered[df_filtered['topic_num'].isin(topic_nums)]\n", + "\n", + " # Group by year and sum up publications\n", + " publications_per_year = df_group.groupby('year')['topic_num'].count()\n", + "\n", + " # Calculate changes and relative growth\n", + " changes = publications_per_year.diff().fillna(0)\n", + " relative_growth = (changes / publications_per_year.shift(1) * 100).fillna(0)\n", + "\n", + " years = publications_per_year.index\n", + "\n", + " # Create a plot for the number of publications\n", + " fig = go.Figure()\n", + " fig.add_trace(go.Scatter(x=years, y=publications_per_year, mode='lines+markers', name='Number of Publications', marker=dict(size=8, color='blue')))\n", + "\n", + " # Create a plot for the change in the number of publications\n", + " fig.add_trace(go.Bar(x=years, y=changes, name='Change in Number of Publications', marker_color='orange', opacity=0.6))\n", + "\n", + " # Create a plot for the relative growth\n", + " fig.add_trace(go.Scatter(x=years, y=relative_growth, mode='lines', name='Relative Growth (%)', yaxis='y2', line=dict(color='green', width=2, dash='dash')))\n", + "\n", + " # Customize the layout\n", + " fig.update_layout(\n", + " title=f'Trend Analysis for {group_name}',\n", + " xaxis_title='Year',\n", + " yaxis_title='Number of Publications',\n", + " yaxis2=dict(title='Relative Growth (%)', overlaying='y', side='right'),\n", + " legend=dict(x=1.05, y=1, traceorder='reversed', font_size=16),\n", + " barmode='overlay',\n", + " template='plotly_white'\n", + " )\n", + "\n", + " # Show the plot\n", + " fig.show()\n", + "\n", + "# Example usage\n", + "group_name = 'Household 
Waste' # Replace with the actual name of the group\n", + "visualize_topic_trend_plotly(group_name, df_filtered, topic_groups)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MH48s7AUTlV-" + }, + "source": [ + "#### Plots of median? # TODO" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k_qsdAErdZLh" + }, + "source": [ + "### Plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BVRut6lvKywp" + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "import plotly.express as px\n", + "\n", + "# Grouping by month and year of creation and thematic group\n", + "df_grouped = df_filtered.groupby([df_filtered['created'].dt.to_period(\"M\"), 'topic_group']).size().reset_index(name='counts')\n", + "df_grouped['created'] = df_grouped['created'].dt.to_timestamp()\n", + "\n", + "# For convenience, add a column with the number of months since the start of 2010\n", + "df_grouped['months_since_start'] = (df_grouped['created'].dt.year - 2010) * 12 + df_grouped['created'].dt.month - 1\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 617 + }, + "id": "KLANWHXfW4Lo", + "outputId": "8e157a6a-a480-4ab9-f11c-2f8858d010cd" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#@title Polynomial Trends\n", + "from sklearn.preprocessing import PolynomialFeatures\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "# Prepare colors for each thematic group\n", + "colors = {\n", + " 'Disasters & Emergency': 'rgba(255, 99, 132, 1)', # Red\n", + " 'Air Pollution': 'rgba(54, 162, 235, 1)', # Blue\n", + " 'Water Pollution': 'rgba(255, 206, 86, 1)', # Yellow\n", + " 'Household Waste': 'rgba(75, 192, 192, 1)', # Green\n", + " 'Infrared & Thermal': 'rgba(153, 102, 255, 1)', # Purple\n", + " 'Agriculture Mapping': 'rgba(10, 0, 99, 255)' # Cyan\n", + "}\n", + "\n", + "# Initialize the start year\n", + "start_year = 2010\n", + "\n", + "fig = go.Figure()\n", + "\n", + "# Convert 'months_since_start' to 'year-month' for tooltips\n", + "def convert_to_year_month(months_since_start):\n", + " year = start_year + months_since_start // 12\n", + " month = months_since_start % 12 + 1 # +1 because counting starts from 0\n", + " return f\"{year}-{month:02d}\"\n", + "\n", + "# Add data and trends for each group\n", + "for group_name in topic_groups.keys():\n", + " df_filtered_group = df_grouped[df_grouped['topic_group'] == group_name]\n", + " if not df_filtered_group.empty:\n", + " X = df_filtered_group['months_since_start'].values.reshape(-1, 1)\n", + " y = df_filtered_group['counts']\n", + " poly = PolynomialFeatures(degree=3)\n", + " X_poly = poly.fit_transform(X)\n", + " model = LinearRegression()\n", + " model.fit(X_poly, y)\n", + " X_pred = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)\n", + " X_pred_poly = poly.transform(X_pred)\n", + " y_pred = model.predict(X_pred_poly)\n", + "\n", + " tooltips = [f\"{convert_to_year_month(m)}, Publications: {c}\" for m, c in zip(df_filtered_group['months_since_start'], df_filtered_group['counts'])]\n", + "\n", + " # Determine color for the current group\n", + " color = colors.get(group_name, 
'rgba(0, 0, 0, 1)') # Black by default\n", + "\n", + " fig.add_trace(go.Scatter(\n", + " x=df_filtered_group['months_since_start'],\n", + " y=df_filtered_group['counts'],\n", + " mode='markers',\n", + " name=f'Real Data {group_name}',\n", + " text=tooltips,\n", + " hoverinfo='text',\n", + " marker=dict(color=color, opacity=0.5) # Point transparency\n", + " ))\n", + "\n", + " fig.add_trace(go.Scatter(\n", + " x=X_pred.flatten(),\n", + " y=y_pred,\n", + " mode='lines',\n", + " name=f'Trend {group_name}',\n", + " line=dict(color=color) # Line color\n", + " ))\n", + "\n", + "# Plot settings\n", + "fig.update_layout(\n", + " title='Polynomial Regression for All Thematic Groups',\n", + " xaxis_title='Year',\n", + " yaxis_title='Number of Publications',\n", + " legend_title='Thematic Group',\n", + " width=1200, height=600\n", + ")\n", + "\n", + "# Update the X-axis to display years\n", + "max_months_since_start = df_grouped['months_since_start'].max()\n", + "ticks_vals = np.arange(0, max_months_since_start + 1, 12) # Every 12 months\n", + "ticks_text = [str(start_year + int(months / 12)) for months in ticks_vals]\n", + "fig.update_xaxes(tickvals=ticks_vals, ticktext=ticks_text)\n", + "\n", + "# Show the plot\n", + "fig.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bztqJLM_jax0", + "outputId": "523cdb04-c0fb-4a49-91ce-c76d8d827bdd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average yearly change in the number of publications: 3175.9333333333334\n", + "Topics with yearly change above the overall average:\n" + ] + } + ], + "source": [ + "df['year'] = df['created'].dt.year\n", + "total_publications_per_year = df.groupby('year').size()\n", + "\n", + "yearly_change = total_publications_per_year.diff().dropna()\n", + "average_yearly_change = yearly_change.mean()\n", + "print(f\"Average yearly change in the number of 
publications: {average_yearly_change}\")\n", + "\n", + "# Initialize a dictionary to store the average change for each topic\n", + "topic_yearly_change = {}\n", + "\n", + "# Iterate over unique topics\n", + "for topic_num in df['topic_num'].unique():\n", + " # Filter the dataframe by the current topic\n", + " topic_df = df[df['topic_num'] == topic_num]\n", + "\n", + " # Count publications per year for the current topic\n", + " publications_per_year = topic_df.groupby('year').size()\n", + "\n", + " # Calculate the change in the number of publications per year and its mean\n", + " change = publications_per_year.diff().dropna()\n", + " topic_yearly_change[topic_num] = change.mean()\n", + "\n", + "# Determine topics whose average change exceeds the overall average\n", + "topics_above_average = {topic: change for topic, change in topic_yearly_change.items() if change > average_yearly_change}\n", + "\n", + "print(\"Topics with yearly change above the overall average:\")\n", + "for topic, change in topics_above_average.items():\n", + " print(f\"Topic #{topic} - Average yearly change: {change}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Y80xQvMnER3C", + "outputId": "de8d62d8-3c9a-434c-dbf4-1510ef29d726" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Thematic group 'Disasters & Emergency' - Total number of publications: 2450\n", + "Thematic group 'Air Pollution' - Total number of publications: 317\n", + "Thematic group 'Water Pollution' - Total number of publications: 2328\n", + "Thematic group 'Household Waste' - Total number of publications: 3333\n", + "Thematic group 'Infrared & Thermal' - Total number of publications: 759\n", + "Thematic group 'Agriculture Mapping' - Total number of publications: 1369\n", + "Total number of publications for all thematic groups: 10556\n" + ] + } + ], + "source": [ + "# Create a full list of 
all topics\n", + "all_topics = [topic for topics in topic_groups.values() for topic in topics]\n", + "\n", + "# Filter df_filtered to include only topics from all_topics\n", + "df_relevant = df_filtered[df_filtered['topic_num'].isin(all_topics)]\n", + "\n", + "# Count the total number of publications for each thematic group\n", + "group_counts = {group: df_relevant[df_relevant['topic_num'].isin(topics)]['topic_num'].count()\n", + " for group, topics in topic_groups.items()}\n", + "\n", + "# Output the total number of publications for each thematic group\n", + "for group, count in group_counts.items():\n", + " print(f\"Thematic group '{group}' - Total number of publications: {count}\")\n", + "\n", + "# Count the total number of publications for all topics\n", + "total_posts = df_relevant['topic_num'].count()\n", + "\n", + "# Output the total number of publications for all thematic groups\n", + "print(f\"Total number of publications for all thematic groups: {total_posts}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 660 + }, + "id": "ftwckpargdjZ", + "outputId": "ed5e6eae-68bf-47a4-bdc9-a5dac0c164fd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n", + "\n", + "Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n", + "Running on public URL: https://dcb7ea4c0d4b942782.gradio.live\n", + "\n", + "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" + ] + }, + { + "data": { + "text/html": [ + "
import gradio as gr

def _build_analysis_table(years, publications, changes, acceleration, relative_growth):
    """Assemble per-year trend metrics into a rounded DataFrame and render it as HTML.

    Shared by the topic-level and group-level analysis paths (previously duplicated).
    """
    analysis = pd.DataFrame({
        'Year': years,
        'Number of Publications': publications,
        'Change in Number of Publications': changes,
        'Growth Acceleration': acceleration,  # change of the change, i.e. growth acceleration
        'Relative Growth': relative_growth
    }).round(2)
    return analysis.to_html(classes="table table-striped", justify="left", border=0)

def get_info(input_value):
    """Dispatch the Gradio input: numeric -> topic analysis, otherwise group analysis.

    Returns (html_table, plotly_figure). Unknown group names now produce a readable
    message instead of an unhandled KeyError.
    """
    try:
        topic_num = int(input_value)
    except ValueError:
        # Not a number, so treat it as a thematic group name.
        if input_value not in topic_groups:
            return f"Unknown topic number or thematic group name: {input_value!r}", None
        return get_group_analysis(input_value)
    return get_topic_analysis(topic_num)

def get_topic_analysis(topic_num):
    """Build the HTML trend table and trend plot for a single topic number."""
    topic_group = df_filtered[df_filtered['topic_num'] == topic_num]['topic_group'].iloc[0]
    topic_data = publications_per_topic_year.loc[topic_num]
    topic_growth = growth_per_topic.loc[topic_num]
    topic_relative_growth = relative_growth_per_topic.loc[topic_num] * 100
    topic_growth_acceleration = topic_growth.diff().fillna(0)  # growth acceleration

    html_table = _build_analysis_table(
        topic_data.index, topic_data.values, topic_growth.values,
        topic_growth_acceleration.values, topic_relative_growth.values)

    plot = visualize_trend_analysis(
        topic_data.index, topic_data.values, topic_growth.values,
        topic_relative_growth.values, f'Topic {topic_num} ({topic_group})')

    return html_table, plot

def get_group_analysis(group_name):
    """Build the HTML trend table and trend plot for a whole thematic group."""
    topic_nums = topic_groups[group_name]
    df_group = df_filtered[df_filtered['topic_num'].isin(topic_nums)]
    group_publications_per_year = df_group.groupby('year')['topic_num'].count()

    changes = group_publications_per_year.diff().fillna(0)
    relative_growth = (changes / group_publications_per_year.shift(1) * 100).fillna(0)
    growth_acceleration = changes.diff().fillna(0)  # growth acceleration

    html_table = _build_analysis_table(
        group_publications_per_year.index, group_publications_per_year.values,
        changes.values, growth_acceleration.values, relative_growth.values)

    plot = visualize_trend_analysis(
        group_publications_per_year.index, group_publications_per_year.values,
        changes.values, relative_growth.values, f'Group: {group_name}')

    return html_table, plot


def visualize_trend_analysis(years, publications, changes, relative_growth, title):
    """Plot publications (line), yearly change (bars) and relative growth (% on a
    secondary axis) for a topic or thematic group; returns a plotly Figure."""
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=years, y=publications, mode='lines+markers',
                             name='Number of Publications', marker=dict(size=8, color='blue')))
    fig.add_trace(go.Bar(x=years, y=changes, name='Change in Number of Publications',
                         marker_color='orange', opacity=0.6))
    fig.add_trace(go.Scatter(x=years, y=relative_growth, mode='lines',
                             name='Relative Growth (%)', yaxis='y2',
                             line=dict(color='green', width=2, dash='dash')))

    fig.update_layout(
        title=f'Trend Analysis for {title}',
        xaxis_title='Year',
        yaxis_title='Number of Publications',
        yaxis2=dict(title='Relative Growth (%)', overlaying='y', side='right', range=[-100, 100]),
        legend=dict(x=1.05, y=1, traceorder='reversed', font_size=16),
        barmode='overlay',
        template='plotly_white'
    )

    return fig

def get_available_topics_and_groups():
    """Summarize, as plain text, every thematic group and topic with its
    publication count, for display in the interface description."""
    # Flatten all topic numbers that belong to some thematic group.
    all_topics = [topic for topics in topic_groups.values() for topic in topics]

    # Restrict the corpus to those topics only.
    df_relevant = df_filtered[df_filtered['topic_num'].isin(all_topics)]

    # Total number of publications per thematic group.
    group_counts = {group: df_relevant[df_relevant['topic_num'].isin(topics)]['topic_num'].count()
                    for group, topics in topic_groups.items()}

    # Total number of publications per individual topic.
    topic_counts = df_relevant['topic_num'].value_counts().sort_index()

    # Assemble the summary text.
    summary = "Available Topics and Thematic Groups:\n\n"
    summary += "Thematic Groups:\n"
    for group, count in group_counts.items():
        summary += f"- {group}: {count} publications\n"

    summary += "\nTopics:\n"
    for topic, count in topic_counts.items():
        summary += f"- Topic {topic}: {count} publications\n"

    return summary

# Get the available topics and groups information.
available_info = get_available_topics_and_groups()

# Include the available topics and groups in the interface description.
description = """
Enter a topic number or a thematic group name to get information.

""" + available_info

iface = gr.Interface(
    fn=get_info,
    inputs=gr.Textbox(label="Topic Number or Thematic Group Name"),
    outputs=[
        gr.HTML(label="Information"),
        gr.Plot(label="Trend Analysis")
    ],
    title="Topic and Thematic Group Analysis",
    description=description
)

iface.launch(debug=True)
to tackle this challenge, we propose adistributed sense-and-send protocol, where the uavs determine the trajectoriesby selecting from a discrete set of tasks and a continuous set of locations forsensing and transmission. based on this protocol, we formulate the trajectorydesign problem for aoi minimization and propose a compound-action actor-critic(ca2c) algorithm to solve it based on deep reinforcement learning. the ca2calgorithm can learn the optimal policies for actions involving both continuousand discrete variables and is suited for the trajectory design. {our simulationresults show that the ca2c algorithm outperforms four baseline algorithms}.also, we show that by dividing the tasks, cooperative uavs can achieve a loweraoi compared to non-cooperative uavs.\n", + "------------------------------\n", + "\n", + "Document ID: 231982, Similarity: 0.876\n", + "------------------------------\n", + "cellular-connected unmanned aerial vehicle (uav) is a promising technology tounlock the full potential of uavs in the future. however, how to achieveubiquitous three-dimensional (3d) communication coverage for the uavs in thesky is a new challenge. in this paper, we tackle this challenge by a newcoverage-aware navigation approach, which exploits the uav's controllablemobility to design its navigation/trajectory to avoid the cellular bss'coverage holes while accomplishing their missions. we formulate an uavtrajectory optimization problem to minimize the weighted sum of its missioncompletion time and expected communication outage duration, and propose a newsolution approach based on the technique of deep reinforcement learning (drl).to further improve the performance, we propose a new framework calledsimultaneous navigation and radio mapping (snarm), where the uav's signalmeasurement is used not only for training the deep q network (dqn) directly,but also to create a radio map that is able to predict the outage probabilitiesat all locations in the area of interest. 
this thus enables the generation ofsimulated uav trajectories and predicting their expected returns, which arethen used to further train the dqn via dyna technique, thus greatly improvingthe learning efficiency.\n", + "------------------------------\n", + "\n", + "Document ID: 338980, Similarity: 0.874\n", + "------------------------------\n", + "this paper studies the uav-enabled integrated sensing and communication(isac), in which uavs are dispatched as aerial dual-functional access points(aps) for efficient isac. in particular, we consider a scenario with one uav-apequipped with a vertically placed uniform linear array (ula), which sendscombined information and sensing signals to communicate with multiple users andsense potential targets at interested areas on the ground simultaneously. ourobjective is to jointly design the uav maneuver with the transmit beamformingfor optimizing the communication performance while ensuring the sensingrequirements. first, we consider the quasi-stationary uav scenario, in whichthe uav is deployed at an optimizable location over the whole isac missionperiod. in this case, we jointly optimize the uav deployment location, as wellas the transmit information and sensing beamforming to maximize the weightedsum-rate throughput, subject to the sensing beampattern gain requirements andtransmit power constraint. although the above problem is non-convex, we find ahigh-quality solution by using the techniques of sca and sdr, together with a2d location search. next, we consider the fully mobile uav scenario, in whichthe uav can fly over different locations during the isac mission period. inthis case, we optimize the uav flight trajectory, jointly with the transmitbeamforming over time, to maximize the average weighted sum-rate throughput,subject to the sensing beampattern gain requirements and transmit powerconstraints as well as practical flight constraints. 
#@title Search for documents on a specified topic

print("Searching for documents on the topic")
print("=" * 30)

# Query the trained model for the documents most similar to one topic.
# Here: the 3 best-matching documents for topic number 100.
TOPIC_NUM = 100
NUM_DOCS = 3

documents, document_scores, document_ids = model2.search_documents_by_topic(
    topic_num=TOPIC_NUM, num_docs=NUM_DOCS)

# Display each result: identifier, similarity score, then the full text.
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document ID: {doc_id}, Similarity: {score:.3f}")
    print("-" * 30)
    print(doc)
    print("-" * 30)
    print()

# Variable descriptions:
# documents: documents ordered from most to least similar to the topic.
# document_scores: cosine similarity between each document vector and the topic vector.
# document_ids: unique identifiers; falls back to corpus index if none were provided.
uavs generally haveshort flight duration and need to frequently get energy replenishment from thecontrol station. hence the returning uavs bring the data of the uavs along thereturning paths to the control station with a store-carry-and-forward (scf)mode. a critical range for the distance between the uav and the control stationis discovered. within the critical range, the per-node capacity of the scf modeis o(n/log n) times higher than that of the multi-hop mode. however, theper-node capacity of the scf mode outside the critical range decreases with thedistance between the uav and the control station. to eliminate the criticalrange, a mobility control scheme is proposed such that the capacity scalinglaws of the scf mode are the same for all uavs, which improves the capacityperformance of uav networks. moreover, the delay of the scf mode is derived.the impact of the size of the entire region, the velocity of uavs, the numberof uavs and the flight duration of uavs on the delay of scf mode is analyzed.this paper reveals that the mobility and short flight duration of uavs havebeneficial effects on the performance of uav networks, which may motivate thestudy of scf schemes for uav networks.\n", + "------------------------------\n", + "\n", + "Document ID: 552174, Similarity: 0.402\n", + "------------------------------\n", + "reliable deployment of unmanned aerial vehicles (uavs) in cluttered unknownenvironments requires accurate sensors for obstacle avoidance. such arequirement limits the usage of cheap and micro-scale vehicles with constrainedpayload capacity if industrial-grade reliability and precision are required.this paper investigates the possibility of offloading the necessity to carryheavy and expensive obstacle sensors to another member of the uav team whilepreserving the desired obstacle avoidance capability. 
a novel cooperativeguidance framework offloading the obstacle sensing requirements from aminimalistic secondary uav to a superior primary uav is proposed. the primaryuav constructs a dense occupancy map of the environment and planscollision-free paths for both uavs to ensure reaching the desired secondaryuav's goal. the primary uav guides the secondary uav to follow the planned pathwhile tracking the uav using light detection and ranging (lidar)-based relativelocalization. the proposed approach was verified in real-world experiments witha heterogeneous team of a 3d lidar-equipped primary uav and a camera-equippedsecondary uav moving autonomously through unknown cluttered global navigationsatellite system (gnss)-denied environments with the proposed framework runningcompletely on board the uavs.\n", + "------------------------------\n", + "\n", + "Document ID: 480894, Similarity: 0.401\n", + "------------------------------\n", + "drones have become essential tools in a wide range of industries, includingagriculture, surveying, and transportation. however, tracking unmanned aerialvehicles (uavs) in challenging environments, such cluttered or gnss-deniedenvironments, remains a critical issue. additionally, uavs are being deployedas part of multi-robot systems, where tracking their position can be essentialfor relative state estimation. in this paper, we evaluate the performance of amulti-scan integration method for tracking uavs in gnss-denied environmentsusing a solid-state lidar and a kalman filter (kf). 
we evaluate the algorithm'sability to track a uav in a large open area at various distances and speeds.our quantitative analysis shows that while \"tracking by detection\" using aconstant velocity model is the only method that consistently tracks the target,integrating multiple scan frequencies using a kf achieves lower position errorsand represents a viable option for tracking uavs in similar scenarios.\n", + "------------------------------\n", + "\n", + "Document ID: 240208, Similarity: 0.400\n", + "------------------------------\n", + "an innovative method of detecting unmanned aerial vehicles (uavs) ispresented. the goal of this study is to develop a robust setup for anautonomous multi-rotor hunter uav, capable of visually detecting and trackingthe intruder uavs for real-time motion planning. the system consists of twoparts: object detection using a stereo camera to generate 3d point cloud dataand video tracking applying a kalman filter for uav motion modeling. afterdetection, the hunter can aim and shoot a tethered net at the intruder toneutralize it. 
#@title Search across all documents
# Search for documents matching a set of keywords.
# NOTE: the heading is derived from the keyword list itself, so it can no longer
# drift out of sync with the query (it previously claimed 'cryptography'/'privacy'
# while the actual keywords were uav/ecology/radars).

# Keywords and number of documents to search for
keywords = ["uav", "ecology", "radars"]
num_docs_to_search = 5

print(f"Searching for documents using the keywords: {', '.join(keywords)}")
print("=" * 50)

# Performing the search
documents, document_scores, document_ids = model2.search_documents_by_keywords(
    keywords=keywords, num_docs=num_docs_to_search)

# Displaying the search results
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document ID: {doc_id}, Similarity: {score:.3f}")
    print("-" * 30)
    print(doc)
    print("-" * 30)
    print()
#@title Search for similar words
# Seed keywords whose semantic neighborhoods we want to inspect.
initial_keywords = ["natural", "disasters", "airquality", "hydrology",
                    "ecology", "monitoring", "radars", "environment"]

# How many candidate words to retrieve per keyword (only the top 5 are shown).
num_similar_words = 20

for keyword in initial_keywords:
    print(f"Searching for words similar to '{keyword}'")
    print("=" * 30)

    # Query the model's word-vector space for the nearest words.
    words, word_scores = model2.similar_words(
        keywords=[keyword], keywords_neg=[], num_words=num_similar_words)

    # Show only the 5 strongest matches to keep the output compact.
    top_matches = list(zip(words, word_scores))[:5]
    for word, score in top_matches:
        print(f"Word: {word}, Similarity: {score:.3f}")
    print()
"https://localhost:8080/", + "height": 331, + "referenced_widgets": [ + "c9d24cf6623048ccbd1730e07c710295", + "48bad35847b1421fb3528281d08dd762", + "85fe6445c6394427ab27463378ff1971", + "0159a7428b24482f9ab1b954f5957992", + "106455fb3aac447e8372445d2e20b58a", + "9c59f80fb3294f14ab7bef89aba09e33", + "16bb49be5cb84a82b456c1ed663b9eed", + "9f4da8c6776c4590930b487e129a4713", + "13f09c51d28349c5a1013b5f620d6e52", + "75d589f952a6415aafadeb707130696b", + "183053e79c9e4b409acc428851723bcf", + "e851eeec67be42e9847386ffe13ec2e3", + "2d03beddb9e041f5ba76f6d424f5215a", + "4512edbb6bf64ad6b6940fdded580d87", + "1ead1f6214b44e4ca095ed6e3fb478ee", + "48ab400cced34d32aa72f976d829b303", + "69de7260e99a4a1992ae904bc0348637" + ] + }, + "id": "CvVPYBVMCYpI", + "outputId": "c847da2c-302e-4d4a-b065-072008285d6e" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c9d24cf6623048ccbd1730e07c710295", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='
\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mhuggingface_hub\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mHfApi\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mapi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mHfApi\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m api.upload_file(\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mpath_or_fileobj\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"/content/drive/MyDrive/AI/top2vec/arxiv_cs_from2010to2024-01-01\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mpath_in_repo\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"top2vec_model_arxiv_cs_from2010to2024-01-01\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py\u001b[0m in \u001b[0;36m_inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msmoothly_deprecate_use_auth_token\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhas_token\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhas_token\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 118\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 119\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_inner_fn\u001b[0m \u001b[0;31m# type: ignore\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/hf_api.py\u001b[0m in \u001b[0;36m_inner\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1206\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1207\u001b[0m \u001b[0;31m# Otherwise, call the function normally\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1208\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1209\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1210\u001b[0m \u001b[0m_inner\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_future_compatible\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m \u001b[0;31m# type: ignore\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/hf_api.py\u001b[0m in \u001b[0;36mupload_file\u001b[0;34m(self, path_or_fileobj, path_in_repo, repo_id, token, repo_type, revision, commit_message, commit_description, create_pr, parent_commit, run_as_future)\u001b[0m\n\u001b[1;32m 4241\u001b[0m \u001b[0mcommit_message\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcommit_message\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34mf\"Upload {path_in_repo} with 
huggingface_hub\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4242\u001b[0m )\n\u001b[0;32m-> 4243\u001b[0;31m operation = CommitOperationAdd(\n\u001b[0m\u001b[1;32m 4244\u001b[0m \u001b[0mpath_or_fileobj\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpath_or_fileobj\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4245\u001b[0m \u001b[0mpath_in_repo\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpath_in_repo\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/_commit_api.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, path_in_repo, path_or_fileobj)\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/_commit_api.py\u001b[0m in \u001b[0;36m__post_init__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0mpath_or_fileobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnormpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpanduser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath_or_fileobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misfile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_or_fileobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 164\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Provided path: '{path_or_fileobj}' is not a file on the local file 
from huggingface_hub import HfApi
import os

api = HfApi()

# --- Upload a single saved model file ---------------------------------------
# The recorded run failed here with an opaque ValueError because the path was
# not a regular file (the earlier `cp` from /content had itself failed).
# Check up front and fail with an actionable message instead.
model_path = "/content/drive/MyDrive/AI/top2vec/arxiv_cs_from2010to2024-01-01"
if not os.path.isfile(model_path):
    raise FileNotFoundError(
        f"{model_path!r} does not exist or is not a regular file; "
        "upload_file requires a file path (use api.upload_folder for directories)."
    )
api.upload_file(
    path_or_fileobj=model_path,                                   # local file to push
    path_in_repo="top2vec_model_arxiv_cs_from2010to2024-01-01",   # destination name
    repo_id="CCRss/topic_modeling_temp_name",                     # target model repo
    repo_type="model",
)

# --- Upload every regular file from a local directory -----------------------
folder_path = "/content/my_model_dir"
repo_id = "CCRss/top2vec_science_abstracts"
folder_in_repo = "BERTopic_model/"

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path):  # skip sub-directories
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=folder_in_repo + filename,
            repo_id=repo_id,
            repo_type="model"
        )
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "16bb49be5cb84a82b456c1ed663b9eed": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "183053e79c9e4b409acc428851723bcf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"1ead1f6214b44e4ca095ed6e3fb478ee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "2d03beddb9e041f5ba76f6d424f5215a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4512edbb6bf64ad6b6940fdded580d87": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": 
null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "48ab400cced34d32aa72f976d829b303": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "48bad35847b1421fb3528281d08dd762": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9f4da8c6776c4590930b487e129a4713", + "placeholder": "​", + 
"style": "IPY_MODEL_13f09c51d28349c5a1013b5f620d6e52", + "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "69de7260e99a4a1992ae904bc0348637": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "75d589f952a6415aafadeb707130696b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "85fe6445c6394427ab27463378ff1971": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "PasswordModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_75d589f952a6415aafadeb707130696b", + "placeholder": "​", + "style": "IPY_MODEL_183053e79c9e4b409acc428851723bcf", + "value": "" + } + }, + "9c59f80fb3294f14ab7bef89aba09e33": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_48ab400cced34d32aa72f976d829b303", + "placeholder": "​", + "style": "IPY_MODEL_69de7260e99a4a1992ae904bc0348637", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks.
" + } + }, + "9f4da8c6776c4590930b487e129a4713": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c9d24cf6623048ccbd1730e07c710295": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_48bad35847b1421fb3528281d08dd762", + "IPY_MODEL_85fe6445c6394427ab27463378ff1971", + "IPY_MODEL_0159a7428b24482f9ab1b954f5957992", + "IPY_MODEL_106455fb3aac447e8372445d2e20b58a", + "IPY_MODEL_9c59f80fb3294f14ab7bef89aba09e33" + ], + "layout": 
"IPY_MODEL_16bb49be5cb84a82b456c1ed663b9eed" + } + }, + "e851eeec67be42e9847386ffe13ec2e3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}