diff --git "a/Tutorials/1_Fill-Mask.html" "b/Tutorials/1_Fill-Mask.html" deleted file mode 100644--- "a/Tutorials/1_Fill-Mask.html" +++ /dev/null @@ -1,14496 +0,0 @@ - - -
- - -# Use the trained astroBERT model with the fill-mask pipeline
-
# 1 - load models and tokenizer
-
from transformers import AutoTokenizer, BertForMaskedLM
-
2022-10-14 15:27:35.809315: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0 --
# Two ways to point at the model: the name of the Hugging Face repository ...
remote_model_path = 'adsabs/astroBERT'
# ... or a local directory that contains the model weights and tokenizer vocab.
local_model_path = '../'

# Make sure the tokenizer is loaded with do_lower_case=False.
# NOTE(review): use_auth_token=True presumably requires a prior Hugging Face
# login on this machine — confirm before running.
astroBERT_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=remote_model_path,
    use_auth_token=True,
    add_special_tokens=False,
    do_lower_case=False,
)

# Load the masked-language-model head from the same repository.
astroBERT_automodel_for_mlm = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path=remote_model_path,
    use_auth_token=True,
)
-
Some weights of the model checkpoint at adsabs/astroBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight'] -- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --
# The pipeline only works when the model returns a dict, so force that here.
astroBERT_automodel_for_mlm.config.return_dict = True

from transformers import FillMaskPipeline

# Wrap the model and tokenizer loaded above into a fill-mask pipeline.
astroBERT_pipeline = FillMaskPipeline(
    task='fill-mask',
    model=astroBERT_automodel_for_mlm,
    tokenizer=astroBERT_tokenizer,
)
-
# Reference sentences; each one has exactly one word hidden in the
# corresponding entry of `masked_sentences` below.
clean_sentences = [
    'M67 is one of the most studied open clusters.',
    'A solar twin is a star with atmospheric parameters and chemical composition very similar to our Sun.',
    'The dynamical evolution of planets close to their star is affected by tidal effects',
    'The Kepler satellite collected high-precision long-term and continuous light curves for more than 100,000 solar-type stars',
    'The Local Group is composed of the Milky Way, the Andromeda Galaxy, and numerous smaller satellite galaxies.',
    'Cepheid variables are used to determine the distances to galaxies in the local universe.',
    'Jets are created and sustained by accretion of matter onto a compact massive object.',
    'A single star of one solar mass will evolve into a white dwarf.',
    'The Very Large Array observes the sky at radio wavelengths.',
    'Elements heavier than iron are generated in supernovae explosions.',
    'Spitzer was the first spacecraft to fly in an Earth-trailing orbit.',
    'Galaxy mergers can occur when two (or more) galaxies collide',
    'Dark matter is a hypothetical form of matter thought to account for approximately 85% of the matter in the universe.',
    'The cosmic microwave background (CMB, CMBR), in Big Bang cosmology, is electromagnetic radiation which is a remnant from an early stage of the universe.',
    'The Local Group of galaxies is pulled toward The Great Attractor.',
    'The Moon is the only satellite of the Earth.',
    'Galaxies are categorized according to their visual morphology as elliptical, spiral, or irregular.',
    'Stars are made mostly of hydrogen.',
    'Comet tails are created as comets approach the Sun.',
    'Pluto is a dwarf planet in the Kuiper Belt.',
    'The Large and Small Magellanic Clouds are irregular dwarf galaxies and are two satellite galaxies of the Milky Way.',
    'The Milky Way has a supermassive black hole, Sagittarius A*, at its center.',
    'Andromeda is the nearest large galaxy to the Milky Way and is roughly its equal in mass.',
    'The interstellar medium is the gas and dust between stars.',
]

# Same sentences, in the same order, with one word replaced by [MASK].
masked_sentences = [
    'M67 is one of the most studied [MASK] clusters.',
    'A solar twin is a star with [MASK] parameters and chemical composition very similar to our Sun.',
    'The dynamical evolution of planets close to their star is affected by [MASK] effects',
    'The Kepler satellite collected high-precision long-term and continuous light [MASK] for more than 100,000 solar-type stars',
    'The Local Group is composed of the Milky Way, the [MASK] Galaxy, and numerous smaller satellite galaxies.',
    'Cepheid variables are used to determine the [MASK] to galaxies in the local universe.',
    'Jets are created and sustained by [MASK] of matter onto a compact massive object.',
    'A single star of one solar mass will evolve into a [MASK] dwarf.',
    'The Very Large Array observes the sky at [MASK] wavelengths.',
    'Elements heavier than [MASK] are generated in supernovae explosions.',
    'Spitzer was the first [MASK] to fly in an Earth-trailing orbit.',
    'Galaxy [MASK] can occur when two (or more) galaxies collide',
    'Dark [MASK] is a hypothetical form of matter thought to account for approximately 85% of the matter in the universe.',
    'The cosmic microwave background (CMB, CMBR), in Big Bang cosmology, is electromagnetic radiation which is a remnant from an early stage of the [MASK].',
    'The Local Group of galaxies is pulled toward The Great [MASK].',
    'The Moon is the only [MASK] of the Earth.',
    'Galaxies are categorized according to their visual morphology as [MASK], spiral, or irregular.',
    'Stars are made mostly of [MASK].',
    'Comet tails are created as comets approach the [MASK].',
    'Pluto is a dwarf [MASK] in the Kuiper Belt.',
    'The Large and Small Magellanic Clouds are irregular [MASK] galaxies and are two satellite galaxies of the Milky Way.',
    'The Milky Way has a [MASK] black hole, Sagittarius A*, at its center.',
    'Andromeda is the nearest large [MASK] to the Milky Way and is roughly its equal in mass.',
    'The [MASK] medium is the gas and dust between stars.',
]


def _unmask(clean_token, masked_token):
    """Return the word that ``[MASK]`` replaces inside *masked_token*.

    *clean_token* is the whitespace-delimited token found at the same
    position in the clean sentence. Anything glued to the mask in
    *masked_token* (for example the trailing ``.`` in ``'[MASK].'``) is
    trimmed from *clean_token*, so only the hidden word is returned.
    """
    prefix, _, suffix = masked_token.partition('[MASK]')
    return clean_token[len(prefix):len(clean_token) - len(suffix)]


# One hidden word per sentence, in sentence order.
# The mask is detected as a substring rather than with `y == '[MASK]'`:
# str.split() keeps punctuation attached to the token, so an exact comparison
# misses masks such as '[MASK].' or '[MASK],' and leaves this list shorter
# than (and misaligned with) `masked_sentences`.
masked_words = [
    _unmask(x, y)
    for s1, s2 in zip(clean_sentences, masked_sentences)
    for x, y in zip(s1.split(), s2.split())
    if '[MASK]' in y
]
-
masked_words
-
['open', - 'atmospheric', - 'tidal', - 'curves', - 'Andromeda', - 'distances', - 'accretion', - 'white', - 'radio', - 'iron', - 'spacecraft', - 'mergers', - 'matter', - 'satellite', - 'planet', - 'dwarf', - 'supermassive', - 'galaxy', - 'interstellar']-
# Ask the pipeline for the three best candidates for every masked sentence.
results = astroBERT_pipeline(inputs=masked_sentences, top_k=3)

# Print each masked sentence, the word that was hidden, and the candidates
# with their scores, followed by a blank separator line.
for original_word, sentence, candidates in zip(masked_words, masked_sentences, results):
    report = [sentence, f'original: {original_word}']
    report.extend(
        f"\t {candidate['token_str']} {candidate['score']:0.2f}"
        for candidate in candidates
    )
    report.append('')  # reproduces the trailing empty print()
    print('\n'.join(report))
-
M67 is one of the most studied [MASK] clusters. -original: open - open 0.87 - globular 0.07 - star 0.03 - -A solar twin is a star with [MASK] parameters and chemical composition very similar to our Sun. -original: atmospheric - fundamental 0.56 - physical 0.25 - stellar 0.05 - -The dynamical evolution of planets close to their star is affected by [MASK] effects -original: tidal - tidal 0.07 - electromagnetic 0.05 - electrostatic 0.04 - -The Kepler satellite collected high-precision long-term and continuous light [MASK] for more than 100,000 solar-type stars -original: curves - curves 0.43 - ##s 0.04 - conditions 0.04 - -The Local Group is composed of the Milky Way, the [MASK] Galaxy, and numerous smaller satellite galaxies. -original: Andromeda - Andromeda 0.99 - M31 0.00 - Sagittarius 0.00 - -Cepheid variables are used to determine the [MASK] to galaxies in the local universe. -original: distances - distances 0.79 - distance 0.21 - redshifts 0.00 - -Jets are created and sustained by [MASK] of matter onto a compact massive object. -original: accretion - accretion 0.79 - infall 0.13 - fall 0.02 - -A single star of one solar mass will evolve into a [MASK] dwarf. -original: white - white 0.77 - brown 0.19 - red 0.02 - -The Very Large Array observes the sky at [MASK] wavelengths. -original: radio - radio 0.29 - centimeter 0.10 - all 0.09 - -Elements heavier than [MASK] are generated in supernovae explosions. -original: iron - iron 0.34 - helium 0.16 - oxygen 0.07 - -Spitzer was the first [MASK] to fly in an Earth-trailing orbit. -original: spacecraft - satellite 0.42 - spacecraft 0.20 - observatory 0.16 - -Galaxy [MASK] can occur when two (or more) galaxies collide -original: mergers - . 0.26 - A 0.05 - 1 0.04 - -Dark [MASK] is a hypothetical form of matter thought to account for approximately 85% of the matter in the universe. 
-original: matter - energy 0.64 - Energy 0.24 - matter 0.10 - -The cosmic microwave background (CMB, CMBR), in Big Bang cosmology, is electromagnetic radiation which is a remnant from an early stage of the [MASK]. -original: satellite - universe 0.45 - Universe 0.26 - expansion 0.09 - -The Local Group of galaxies is pulled toward The Great [MASK]. -original: planet - Wall 0.96 - East 0.01 - Planet 0.00 - -The Moon is the only [MASK] of the Earth. -original: dwarf - satellite 0.38 - moon 0.31 - constituent 0.07 - -Galaxies are categorized according to their visual morphology as [MASK], spiral, or irregular. -original: supermassive - elliptical 0.92 - spheroidal 0.02 - irregular 0.01 - -Stars are made mostly of [MASK]. -original: galaxy - hydrogen 0.20 - helium 0.14 - carbon 0.12 - -Comet tails are created as comets approach the [MASK]. -original: interstellar - Sun 0.45 - sun 0.23 - Earth 0.19 - --
-