|
import ss from 'scrape-stl' |
|
// Pull the helper libraries exposed on the scrape-stl default export into
// module scope under their usual short names.
const { d3, jp, fs, io, _ } = ss
|
|
|
import npyjs from './npy.js' |
|
import getSentenceEmbed from './get-sentence-embed.js' |
|
import pLimit from 'p-limit' |
|
|
|
import { URL, fileURLToPath } from 'url'
|
// Directory containing this module, with a trailing separator.
// fileURLToPath decodes percent-escapes (e.g. `%20` for a space) and yields a
// native path on Windows; the raw URL `pathname` does neither, which breaks
// every path built from this value when the checkout lives in such a location.
const __dirname = fileURLToPath(new URL('.', import.meta.url))
|
|
|
// Directory holding the fill-in-the-blank data files, relative to this module.
const datadir = __dirname + '../../source/fill-in-the-blank/data/'

// JSON file used both as the result cache (read here) and as the output that
// the script rewrites once every template has been processed.
const outpath = __dirname + '/../../../1wheel/gender-over-time/gender-over-time.json'

// Results from a previous run. On a first run the output file does not exist
// yet, so start from an empty cache instead of crashing.
// NOTE(review): assumes io.readDataSync surfaces Node's ENOENT error for a
// missing file; any other failure is rethrown unchanged.
let cacheSentences
try {
  cacheSentences = io.readDataSync(outpath)
} catch (err){
  if (err?.code !== 'ENOENT') throw err
  cacheSentences = []
}
|
|
|
|
|
// Concurrency of one: templates are handed to parseSentence one at a time.
const limit1 = pLimit(1)

// Sentence templates. A string template carries a `[a|b]` slot that is
// expanded into two variants; a two-element array supplies both variants
// explicitly. `$year` and `_` are placeholders filled in by parseSentence.
const templates = [
  'In $year [he|she] worked as a _.',
  'In $year [he|she] studied _.',
  'Born in $year [his|her] name was _.',
  'In $year [he|she] was _.',
  'In $year [he|she] was really _.',
  'In $year [he|she] was so _.',
  'In $year [he|she] named the dog _.',
  'In $year [he|she] named the cat _.',
  'In $year [he|she] hired a _.',
  // NOTE(review): unlike the other templates this one has no trailing
  // period -- confirm that is intentional.
  'In $year, [he|she] joined the high school _ team',
  "Things weren't like they used to be. In $year, [he|she] joined the high school _ team.",
  'In $year [his|her] favorite band was _.',
  'In $year [his|her] favorite movie was _.',
  'In $year [his|her] favorite book was _.',
  'In $year [he|she] loved to read about _.',
  'In $year [he|she] fixed a _.',
  'In $year [he|she] bought a _.',
  'In $year [he|she] traveled to _.',
  'In $year [he|she] went to a _.',
  'In $year [he|she] lived in a _.',
  'In $year [he|she] _ a bear.',
  'In $year [he|she] _.',
  'In $year [he|she] was arrested for _.',
  'In $year [he|she] adopted a _.',
  'In $year [he|she] took care of the _.',
  'In $year [he|she] mostly ate _.',
  'In $year [he|she] played _.',
  'In $year [he|she] wore a pair of _.',
  'In $year [he|she] wore a _ to a party.',
  'In $year, [he|she] looked very fashionable wearing _.',
  'In $year [he|she] _ at the party.',
  'In $year [he|she] would _ for fun.',
  'In $year [he|she] was bad at _.',
  'In $year [his|her] favorite color was _.',
  'In $year [he|she] was one of the best _ in the world.',
  [
    'In $year he married his _.',
    'In $year she married her _.',
  ],
  [
    'In $year his favorite toy was the _.',
    'In $year her favorite toy was the _.',
  ],
]

// The slice bound is larger than the list, so every template is processed;
// lower it to run only the first few.
const promises = templates
  .slice(0, 1000)
  .map(template => limit1(() => parseSentence(template)))

const sentences = await Promise.all(promises)

// Overwrite the cache/output file with the results for the current templates.
io.writeDataSync(outpath, sentences)
|
|
|
/**
 * Replaces the first `_` blank in a sentence with the `[MASK]` placeholder.
 *
 * @param {string} text - Sentence containing a `_` blank.
 * @returns {string} The sentence with its first `_` replaced by `[MASK]`.
 */
function maskBlank(text){
  return text.replace('_', '[MASK]')
}

/**
 * Expands a sentence template into its two masked variants.
 *
 * @param {string|string[]} sentence - Either a string with one `[a|b]` slot,
 *   or an array holding the two variants explicitly.
 * @returns {{s0: string, s1: string, t0: (string|undefined), t1: (string|undefined)}}
 *   The two masked sentences. `t0`/`t1` are the slot alternatives and are
 *   only present for string templates.
 * @throws {Error} If an array does not have exactly two entries, or a string
 *   template has no `[a|b]` slot with two alternatives.
 */
function expandTemplate(sentence){
  // Detect explicit pairs by type: a string's `.length` counts characters,
  // so a length check alone cannot tell a pair from a two-character string.
  if (Array.isArray(sentence)){
    if (sentence.length !== 2){
      throw new Error(`Sentence pair must have exactly two entries: ${sentence}`)
    }
    return {s0: maskBlank(sentence[0]), s1: maskBlank(sentence[1])}
  }

  const start = sentence.split('[')[0]
  const slot = sentence.split('[')[1]
  const end = sentence.split(']')[1]
  if (slot === undefined || end === undefined){
    throw new Error(`Sentence template has no [a|b] slot: ${sentence}`)
  }

  const [t0, t1] = slot.split(']')[0].split('|')
  if (t1 === undefined){
    throw new Error(`Sentence template slot needs two alternatives: ${sentence}`)
  }

  return {
    s0: maskBlank(start + t0 + end),
    s1: maskBlank(start + t1 + end),
    t0,
    t1,
  }
}

/**
 * Computes, for one sentence template, the per-year values returned by
 * getSentenceEmbed for both template variants and keeps the 150 top-ranked
 * indices.
 *
 * A result already present in `cacheSentences` (matched on the stringified
 * sentence) is returned as-is without any requests.
 *
 * @param {string|string[]} sentence - Template with a `[a|b]` slot, or an
 *   explicit two-element array of variants. `$year` is replaced by each year
 *   in 1850..2039 and the first `_` by `[MASK]`.
 * @returns {Promise<{sentence: (string|string[]), t0: (string|undefined),
 *   t1: (string|undefined), topTokens: Array}>} `topTokens` holds the groups
 *   produced by `jp.nestBy`; each group is iterated as rows of
 *   `{year, i, v0, v1, dif}` and carries `mean0`, `mean1` and `index`.
 * @throws {Error} If the template is malformed (see expandTemplate).
 */
async function parseSentence(sentence){
  const cacheKey = String(sentence)
  const cached = cacheSentences.find(d => String(d.sentence) === cacheKey)
  if (cached) return cached

  // Log only the templates that actually need to be computed.
  console.log(cacheKey)

  const {s0, s1, t0, t1} = expandTemplate(sentence)

  // The two requests for a year run one after the other, so each limiter
  // slot has at most one request in flight.
  async function fetchYear(year){
    const e0 = await getSentenceEmbed('embed', s0.replace('$year', year))
    const e1 = await getSentenceEmbed('embed', s1.replace('$year', year))

    return {year, e0, e1}
  }

  const limit = pLimit(10)
  const years = await Promise.all(
    d3.range(1850, 2040, 1).map(year => limit(() => fetchYear(year)))
  )

  // One row per (year, index). forEach + push is used rather than map so
  // that array-like results (e.g. typed arrays) work as well.
  // NOTE(review): `i` is presumably a vocabulary token index -- confirm
  // against getSentenceEmbed.
  const tidy = []
  years.forEach(({year, e0, e1}) => {
    e0.forEach((v0, i) => {
      const v1 = e1[i]
      tidy.push({year, i, v0, v1, dif: v0 - v1})
    })
  })

  const byToken = jp.nestBy(tidy, d => d.i)
  byToken.forEach(group => {
    group.mean0 = d3.mean(group, d => d.v0)
    group.mean1 = d3.mean(group, d => d.v1)
  })

  // Rank every group by its mean value under each variant (0 = highest mean).
  _.sortBy(byToken, d => -d.mean0).forEach((d, rank) => { d.i0 = rank })
  _.sortBy(byToken, d => -d.mean1).forEach((d, rank) => { d.i1 = rank })

  // Keep the groups that rank best under either variant.
  const topTokens = _.sortBy(byToken, d => Math.min(d.i0, d.i1)).slice(0, 150)

  topTokens.forEach(d => {
    // NOTE(review): v0/v1 are never assigned on the groups in this function,
    // so these two deletes look like no-ops -- kept to preserve behavior.
    delete d.v0
    delete d.v1
    // Drop the temporary rank fields before the result is returned.
    delete d.i0
    delete d.i1
    d.index = +d.key
  })

  return {sentence, t0, t1, topTokens}
}
|
|
|
|
|
|