diff --git "a/src/models/tf-idf.ipynb" "b/src/models/tf-idf.ipynb" new file mode 100644--- /dev/null +++ "b/src/models/tf-idf.ipynb" @@ -0,0 +1,1529 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-07-08 08:44:29.719980: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from sklearn.pipeline import Pipeline\n", + "import spacy\n", + "\n", + "from typing import Union\n", + "from collections import Counter\n", + "import sys" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | id | \n", + "submitter | \n", + "authors | \n", + "title | \n", + "comments | \n", + "journal-ref | \n", + "doi | \n", + "report-no | \n", + "categories | \n", + "license | \n", + "abstract | \n", + "versions | \n", + "update_date | \n", + "authors_parsed | \n", + "cleaned_abstracts | \n", + "len_abstract | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
908956 | \n", + "2301.08674 | \n", + "Sergio Pezzini | \n", + "Daniel Vaquero, Vito Cleric\\`o, Michael Schmit... | \n", + "Phonon-mediated room-temperature quantum Hall ... | \n", + "17 pages, 4 figures. Supplementary information... | \n", + "Nature Communications 14, 318 (2023) | \n", + "10.1038/s41467-023-35986-3 | \n", + "None | \n", + "cond-mat.mes-hall | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "The quantum Hall (QH) effect in two-dimensio... | \n", + "[{'created': 'Fri, 20 Jan 2023 16:53:31 GMT', ... | \n", + "2023-01-23 | \n", + "[[Vaquero, Daniel, ], [Clericò, Vito, ], [Schm... | \n", + "quantum hall qh effect dimensional electron... | \n", + "842 | \n", + "
\n", + " | id | \n", + "submitter | \n", + "authors | \n", + "title | \n", + "comments | \n", + "journal-ref | \n", + "doi | \n", + "report-no | \n", + "categories | \n", + "license | \n", + "abstract | \n", + "versions | \n", + "update_date | \n", + "authors_parsed | \n", + "cleaned_abstracts | \n", + "len_abstract | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "cond-mat/0509555 | \n", + "Ricardo Brito | \n", + "M.H.Ernst and R. Brito | \n", + "Generalized Green-Kubo formulas for fluids wit... | \n", + "14 pages, no figures. Version 2: expanded Intr... | \n", + "Phys. Rev. E 72 (2006) 061102 | \n", + "10.1103/PhysRevE.72.061102 | \n", + "None | \n", + "cond-mat.stat-mech | \n", + "None | \n", + "We present a generalization of the Green-Kub... | \n", + "[{'created': 'Wed, 21 Sep 2005 17:18:42 GMT', ... | \n", + "2007-05-23 | \n", + "[[Ernst, M. H., ], [Brito, R., ]] | \n", + "present generalization green kubo expressio... | \n", + "814 | \n", + "
1 | \n", + "cond-mat/0204489 | \n", + "Jizhong Lou | \n", + "Jizhong Lou, Shaojin Qin, Changfeng Chen, Zhao... | \n", + "Field-induced gap in the spin-1/2 antiferromag... | \n", + "8 pages, 6 figures | \n", + "Phys. Rev. B 65, 064420 (2002) | \n", + "10.1103/PhysRevB.65.064420 | \n", + "None | \n", + "cond-mat.str-el | \n", + "None | \n", + "We study the spin-1/2 antiferromagnetic Heis... | \n", + "[{'created': 'Tue, 23 Apr 2002 14:28:46 GMT', ... | \n", + "2009-11-07 | \n", + "[[Lou, Jizhong, ], [Qin, Shaojin, ], [Chen, Ch... | \n", + "study spin-1/2 antiferromagnetic heisenberg... | \n", + "856 | \n", + "
2 | \n", + "0708.0723 | \n", + "Salvatore Capozziello | \n", + "S. Capozziello, A. Stabile, A. Troisi | \n", + "The Newtonian Limit of F(R) gravity | \n", + "16 pages | \n", + "Phys.Rev.D76:104019,2007 | \n", + "10.1103/PhysRevD.76.104019 | \n", + "None | \n", + "gr-qc astro-ph | \n", + "None | \n", + "A general analytic procedure is developed to... | \n", + "[{'created': 'Mon, 6 Aug 2007 08:46:02 GMT', '... | \n", + "2008-11-26 | \n", + "[[Capozziello, S., ], [Stabile, A., ], [Troisi... | \n", + "general analytic procedure develop deal new... | \n", + "821 | \n", + "
3 | \n", + "1312.3113 | \n", + "Dmitry Shcherbakov M.Sc. | \n", + "Dmitry Shcherbakov, Matthias Ehrhardt, Michael... | \n", + "Force-Gradient Nested Multirate Methods for Ha... | \n", + "16 pages 3 figures, submitted to Applied Numer... | \n", + "None | \n", + "None | \n", + "None | \n", + "math.NA math-ph math.MP quant-ph | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "Force-gradient decomposition methods are use... | \n", + "[{'created': 'Wed, 11 Dec 2013 11:08:03 GMT', ... | \n", + "2013-12-12 | \n", + "[[Shcherbakov, Dmitry, ], [Ehrhardt, Matthias,... | \n", + "force gradient decomposition method improve... | \n", + "535 | \n", + "
4 | \n", + "1607.01234 | \n", + "Yadao Solanke | \n", + "Y. S. Solanke, D. D.Pawar and V. J. Dagwal | \n", + "Role of the constant deceleration parameter in... | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "gr-qc | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "The main purpose of the present paper is to ... | \n", + "[{'created': 'Tue, 5 Jul 2016 12:58:00 GMT', '... | \n", + "2020-03-06 | \n", + "[[Solanke, Y. S., ], [Pawar, D. D., ], [Dagwal... | \n", + "main purpose present paper investigate lrs ... | \n", + "557 | \n", + "
\n", + " | 00 | \n", + "000 | \n", + "000001 | \n", + "000002 | \n", + "00002 | \n", + "000025 | \n", + "00007 | \n", + "00009 | \n", + "0001 | \n", + "0001106 | \n", + "... | \n", + "zy | \n", + "zygmund | \n", + "zygoma | \n", + "zygomatic | \n", + "zygote | \n", + "zykin | \n", + "zytkow | \n", + "zz | \n", + "zzgamma | \n", + "zzz | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
1 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
2 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
3 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
4 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
29995 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
29996 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
29997 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
29998 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
29999 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
30000 rows × 56862 columns
\n", + "\n", + " | id | \n", + "submitter | \n", + "authors | \n", + "title | \n", + "comments | \n", + "journal-ref | \n", + "doi | \n", + "report-no | \n", + "categories | \n", + "license | \n", + "... | \n", + "zy | \n", + "zygmund | \n", + "zygoma | \n", + "zygomatic | \n", + "zygote | \n", + "zykin | \n", + "zytkow | \n", + "zz | \n", + "zzgamma | \n", + "zzz | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "1109.5512 | \n", + "Keita Owari | \n", + "Keita Owari | \n", + "On Admissible Strategies in Robust Utility Max... | \n", + "Forthcoming in Mathematics and Financial Econo... | \n", + "Mathematics and Financial Economics, Vol. 6, N... | \n", + "10.1007/s11579-012-0068-3 | \n", + "None | \n", + "q-fin.PM math.PR | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
1 | \n", + "cs/0403015 | \n", + "Ichiro Adachi | \n", + "Ichiro Adachi, Taisuke Hibino, Luc Hinz, Ryosu... | \n", + "Belle Computing System | \n", + "6 pages, 6 figures, talk given at ACAT03, Tsuk... | \n", + "Nucl.Instrum.Meth. A534 (2004) 53-58 | \n", + "10.1016/j.nima.2004.07.058 | \n", + "None | \n", + "cs.DC | \n", + "None | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
2 | \n", + "1706.05858 | \n", + "Zheng Xie | \n", + "Zheng Xie, Miao Li, Jianping Li, Xiaojun Duan,... | \n", + "Feature analysis of multidisciplinary scientif... | \n", + "None | \n", + "Xie, Z., Li, M., Li, J., Duan, X., & Ouyang, Z... | \n", + "10.1140/epjds/s13688-018-0134-z | \n", + "None | \n", + "physics.soc-ph cs.DL | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
3 | \n", + "1605.09027 | \n", + "Roland Duduchava Prof. | \n", + "Tengiz Buchukuri, Roland Duduchava, George Tep... | \n", + "Laplace-Beltrami equation on hypersurfaces and... | \n", + "38 pages, 2 figures | \n", + "None | \n", + "None | \n", + "None | \n", + "math-ph math.MP | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
4 | \n", + "1110.4381 | \n", + "Jian-Xin Zhu Dr. | \n", + "Jian-Xin Zhu, Jean-Pierre Julien, Y. Dubi, A. ... | \n", + "Local Electronic Structure and Fano Interferen... | \n", + "4+ pages, 3 eps figures, revised version with ... | \n", + "None | \n", + "10.1103/PhysRevLett.108.186401 | \n", + "LA-UR 11-05833 | \n", + "cond-mat.str-el | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
29995 | \n", + "1411.7041 | \n", + "Xi Dong | \n", + "Ahmed Almheiri, Xi Dong, Daniel Harlow | \n", + "Bulk Locality and Quantum Error Correction in ... | \n", + "30 pages plus appendices, 10 figures. v2: sect... | \n", + "JHEP 1504:163,2015 | \n", + "10.1007/JHEP04(2015)163 | \n", + "SU-ITP-14/30 | \n", + "hep-th gr-qc quant-ph | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
29996 | \n", + "1408.4854 | \n", + "Vitaly Vanchurin | \n", + "Vitaly Vanchurin | \n", + "The Continuum of Discrete Trajectories in Eter... | \n", + "12 pages | \n", + "Phys.Rev. D91 (2015) no.2, 02351 | \n", + "10.1103/PhysRevD.91.023511 | \n", + "None | \n", + "hep-th gr-qc hep-ph | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
29997 | \n", + "1008.2215 | \n", + "Andrey Katz | \n", + "Daniel Green, Andrey Katz, and Zohar Komargodski | \n", + "Direct Gaugino Mediation | \n", + "26 pages, 1 figure. v2: minor corrections, ref... | \n", + "Phys.Rev.Lett.106:061801,2011 | \n", + "10.1103/PhysRevLett.106.061801 | \n", + "UMD-PP-10-013 | \n", + "hep-th hep-ph | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
29998 | \n", + "1605.05516 | \n", + "Gabriel Riviere | \n", + "Nguyen Viet Dang (ICJ), Gabriel Riviere (LPP) | \n", + "Spectral analysis of morse-smale gradient flows | \n", + "Shortened version (56 p.), to appear in Annale... | \n", + "None | \n", + "None | \n", + "None | \n", + "math.DS math.GT math.SP | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
29999 | \n", + "2109.02110 | \n", + "Dingshun Lv | \n", + "Changsu Cao, Jiaqi Hu, Wengang Zhang, Xusheng ... | \n", + "Towards a Larger Molecular Simulation on the Q... | \n", + "12 pages, 9 figures, 2 tables | \n", + "None | \n", + "10.1103/PhysRevA.105.062452 | \n", + "None | \n", + "quant-ph | \n", + "http://arxiv.org/licenses/nonexclusive-distrib... | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "
30000 rows × 56878 columns
\n", + "