|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Command-line options handed to ${mkcls} by mkcls_cmd when the word
# classes for a vocabulary are computed.
# NOTE(review): presumably -n is the number of optimisation runs and -c the
# number of classes -- confirm against the mkcls usage message.
mkcls_args = -n10 -c50
|
|
|
|
|
# Default mgiza settings, attached as pattern-specific variables to every
# target matching %/mgiza.cfg.  The recipe that writes the config file copies
# each value into it; m1, m2, mh, m3 and m4 are written twice, once under
# their own name and once as t1, t2, th, t3 and t4.
# NOTE(review): m1/m2/mh/m3/m4 are presumably the iteration counts for IBM
# Model 1, Model 2, HMM, Model 3 and Model 4 -- confirm with the mgiza docs.
%/mgiza.cfg: m1                 = 5
%/mgiza.cfg: m2                 = 0
%/mgiza.cfg: mh                 = 5
%/mgiza.cfg: m3                 = 3
%/mgiza.cfg: m4                 = 3
%/mgiza.cfg: nodumps            = 0
%/mgiza.cfg: onlyaldumps        = 0
%/mgiza.cfg: model4smoothfactor = 0.4
%/mgiza.cfg: nsmooth            = 4
%/mgiza.cfg: NCPUS              = 8
|
|
|
|
|
# Option set for $(symal) corresponding to the "grow-diag-final-and"
# symmetrisation heuristic (per the variable name).
symal_grow_diag_final_and = -a=g -d=yes -f=yes -b=yes

# Options actually passed to $(symal); point this at another option set to
# switch heuristics.
symal_args = $(symal_grow_diag_final_and)
|
|
|
|
|
|
|
|
|
# Directory layout, all relative to $(WDIR) (set outside this section).
#   gizaln.in  directory holding the cleaned parallel training text (input)
#   gizaln     base directory for everything the alignment step produces
#   gizout     where the final alignment products are written (same as gizaln)
#   giztmp     scratch area for vocabularies, sentence files and mgiza runs
gizaln.in = $(WDIR)/crp/trn/pll/clean/
gizaln    = $(WDIR)/crp/trn/aln/giza
gizout    = $(gizaln)
giztmp    = $(gizaln)/tmp
|
|
|
# 'giza' is the entry point of this module.  It requests the two extracted
# text sides and the symmetrised alignment as order-only prerequisites and
# then reports completion.
# ('giza-prep' is declared phony here; its rule is not part of this section.)
.PHONY: giza giza-prep

giza: | $(gizout)/${L1}.txt.gz \
        $(gizout)/${L2}.txt.gz \
        $(gizout)/${L1}-${L2}.symal.gz
	@echo "GIZA WORD ALIGNMENT COMPLETE!"
|
|
|
# Helpers for naming alignment directions.  Each takes one language tag that
# is expected to be either ${L1} or ${L2}.
#
#   $(call other,X)  the language of the pair that is not X
#                    (any X that is not exactly ${L1} yields ${L1})
#   $(call fwd,X)    "X-<other>"
#   $(call bwd,X)    "<other>-X"
#
# The comparison uses $(filter), i.e. an exact whole-word match.  A substring
# test ($(findstring)) would misclassify pairs in which one tag is contained
# in the other, e.g. L1=zh_CN, L2=zh.
other = $(if $(filter $(1),${L1}),${L2},${L1})
fwd   = $(1)-$(call other,$(1))
bwd   = $(call other,$(1))-$(1)
|
|
|
# ${L1} side of the training text, pulled out of the ${L2}-${L1} alignment
# dump: of every group of three lines in the dump, the second one is kept.
# The result is written to $@_ first and renamed only if the pipeline's last
# stage succeeded.
# $(lock) / $(unlock) are defined outside this section.
$(gizout)/${L1}.txt.gz: a3file = $(gizout)/${L2}-${L1}.A3.final.gz
$(gizout)/${L1}.txt.gz: | $(gizout)/${L2}-${L1}.A3.final.gz
	$(lock)
	zcat $(a3file) \
	  | perl -ne 'print if $$. % 3 == 2;' \
	  | gzip > $@_ \
	  && mv $@_ $@
	$(unlock)
|
|
|
# ${L2} side of the training text, pulled out of the ${L1}-${L2} alignment
# dump: of every group of three lines in the dump, the second one is kept.
# The result is written to $@_ first and renamed only if the pipeline's last
# stage succeeded.
# $(lock) / $(unlock) are defined outside this section.
$(gizout)/${L2}.txt.gz: a3file = $(gizout)/${L1}-${L2}.A3.final.gz
$(gizout)/${L2}.txt.gz: | $(gizout)/${L1}-${L2}.A3.final.gz
	$(lock)
	zcat $(a3file) \
	  | perl -ne 'print if $$. % 3 == 2;' \
	  | gzip > $@_ \
	  && mv $@_ $@
	$(unlock)
|
|
|
# Symmetrised word alignment.  $(giza2bal.pl) reads the two directional
# alignment dumps (-d: ${L1}-${L2}, -i: ${L2}-${L1}), its output is piped
# through $(symal) with $(symal_args), everything up to and including the
# "{##}" marker is stripped from each line, and the result is gzipped.
# The output goes to $@_ and is renamed only when gzip succeeded.
# $(lock) / $(unlock) are defined outside this section.
$(gizout)/${L1}-${L2}.symal.gz: A3fwd = $(gizout)/${L1}-${L2}.A3.final.gz
$(gizout)/${L1}-${L2}.symal.gz: A3bwd = $(gizout)/${L2}-${L1}.A3.final.gz
$(gizout)/${L1}-${L2}.symal.gz: | $(gizout)/${L1}-${L2}.A3.final.gz \
                                  $(gizout)/${L2}-${L1}.A3.final.gz \
                                  $(giza2bal.pl)
	$(lock)
	$(giza2bal.pl) -d 'gunzip -c $(A3fwd)' -i 'gunzip -c $(A3bwd)' \
	  | $(symal) $(symal_args) \
	  | perl -pe 's/^.*{##}\s+//' \
	  | gzip > $@_ \
	  && mv $@_ $@
	$(unlock)
|
|
|
|
|
# Merge the partial alignment dumps of one direction (stem $* = "from-to")
# into a single gzipped file once the mgiza run for that direction is marked
# DONE.
# The list of part files comes from $(shell ls ...), so it is produced by
# make while it expands the recipe, not by the recipe's own shell; a missing
# match yields an empty list because ls' error output is discarded.
# $(lock) / $(unlock) / $(mgiza.merge) are defined outside this section.
$(gizout)/%.A3.final.gz: | $(giztmp)/%/mgiza.DONE
	mkdir -p $(@D)
	$(lock)
	$(mgiza.merge) $(shell ls $(giztmp)/$*/$*.A3.final.part* 2>/dev/null) \
	  | gzip > $@_ \
	  && mv $@_ $@
	$(unlock)
|
|
|
|
|
# Run $(mgiza) on the configuration file in the same directory and leave a
# DONE marker behind on success.  $| expands to the order-only prerequisites,
# i.e. the mgiza.cfg file; the marker is only touched if mgiza exits cleanly.
# $(lock) / $(unlock) are defined outside this section.
%/mgiza.DONE: | %/mgiza.cfg
	$(lock)
	$(mgiza) $|
	touch $@
	$(unlock)
|
|
|
# Create the scratch and output directories on demand (used as order-only
# prerequisites by rules that write into them).
$(giztmp) $(gizout):
	mkdir -p $@
|
|
|
|
|
|
|
|
|
|
|
# $(call stream,DIRS,EXT)
#   Shell pipeline that writes to stdout the concatenated (and, where needed,
#   decompressed) contents of all regular files below DIRS whose name ends in
#   ".EXT" or ".EXT.gz", in sorted path order.  Symbolic links are followed.
#   The two -name tests are grouped explicitly: without the parentheses,
#   "-type f" would bind only to the first -name, and non-regular entries
#   called *.EXT.gz would be handed to zcat as well.
stream = find -L $(1) -type f \( -name "*.$(2)" -o -name "*.$(2).gz" \) | sort | xargs zcat -f

# $(call mkcls_cmd,DIRS,EXT,OUT)
#   Feed the text selected by $(call stream,DIRS,EXT) to ${mkcls} on stdin,
#   using the options in $(mkcls_args), and write the classes to OUT.
mkcls_cmd = $(call stream,$(1),$(2)) | ${mkcls} $(mkcls_args) -p/dev/stdin -V$(3) opt
|
|
|
|
|
|
|
|
|
|
|
# Word classes for one language (stem $* = language tag).  All files below
# $(gizaln.in) named *.$* or *.$*.gz are streamed into ${mkcls}; the classes
# are written to $@_ and renamed on success, after which the companion
# $@_.cats file is renamed to $@.cats.
# $(lock) / $(unlock) are defined outside this section.
#
# NOTE(review): the first line below is a pattern rule without a recipe.  GNU
# make does not merge prerequisites across pattern rules, so it does not make
# the classes depend on ${mkcls} -- confirm whether that was the intent.
$(giztmp)/%.vcb.classes: ${mkcls}
$(giztmp)/%.vcb.classes: | $(gizaln.in)
	@echo CREATING $@
	$(lock)
	mkdir -p $(@D)
	@$(call mkcls_cmd,$|,$*,$@_) \
	  && mv $@_ $@
	@mv $@_.cats $@.cats
	$(unlock)
|
|
|
|
|
# The two vocabulary files and the reverse-direction sentence file have no
# recipe of their own: they are written by the recipe for
# $(giztmp)/${L1}-${L2}.snt, so they merely wait (order-only) for that target.
$(giztmp)/${L1}.vcb \
$(giztmp)/${L2}.vcb \
$(giztmp)/${L2}-${L1}.snt: | $(giztmp)/${L1}-${L2}.snt
|
|
|
# Convert the cleaned parallel text into the numeric sentence format and
# vocabularies that mgiza reads.  A single $(plain2snt) run produces four
# files: both vocabularies and the sentence files for both directions.
#
# Inputs are the files $(pll-clean).<lang>.gz ($(pll-clean) is defined outside
# this section); they are decompressed on the fly through bash process
# substitution "<(...)".  The recipe therefore needs bash, which is requested
# here explicitly so that this target also builds when it is made directly
# and not only as a prerequisite of a target that already sets SHELL.
#
# All four outputs are written under a temporary "_" suffix and renamed
# afterwards.  The rule's own target is renamed last: the other three files
# have no recipe and only wait for this target, so $@ must never exist before
# they are in place.
# $(lock) / $(unlock) are defined outside this section.
$(giztmp)/${L1}-${L2}.snt: SHELL = bash
$(giztmp)/${L1}-${L2}.snt: L1files = $(addsuffix .${L1}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: L2files = $(addsuffix .${L2}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: | $(giztmp)
$(giztmp)/${L1}-${L2}.snt: | $(addsuffix .${L1}.gz, $(pll-clean))
$(giztmp)/${L1}-${L2}.snt: | $(addsuffix .${L2}.gz, $(pll-clean))
	$(lock)
	$(plain2snt) \
	  <(ls $(L1files) | xargs zcat -f) \
	  <(ls $(L2files) | xargs zcat -f) \
	  -vcb1 $(giztmp)/${L1}.vcb_ -vcb2 $(giztmp)/${L2}.vcb_ \
	  -snt1 $(giztmp)/${L1}-${L2}.snt_ -snt2 $(giztmp)/${L2}-${L1}.snt_
	mv $(giztmp)/${L1}.vcb_ $(giztmp)/${L1}.vcb
	mv $(giztmp)/${L2}.vcb_ $(giztmp)/${L2}.vcb
	mv $(giztmp)/${L2}-${L1}.snt_ $(giztmp)/${L2}-${L1}.snt
	mv $(giztmp)/${L1}-${L2}.snt_ $(giztmp)/${L1}-${L2}.snt
	$(unlock)
|
|
|
|
|
# Co-occurrence file for one alignment direction (stem $* = "from-to").
# V1 / V2 name the source and target vocabulary of the respective direction;
# $| is the sentence file $(giztmp)/$*.snt.  $(snt2cooc) writes to $@_, which
# is renamed to $@ on success.
# $(lock) / $(unlock) are defined outside this section.
$(giztmp)/${L1}-${L2}.cooc: V1 = $(giztmp)/${L1}.vcb
$(giztmp)/${L1}-${L2}.cooc: V2 = $(giztmp)/${L2}.vcb

$(giztmp)/${L2}-${L1}.cooc: V1 = $(giztmp)/${L2}.vcb
$(giztmp)/${L2}-${L1}.cooc: V2 = $(giztmp)/${L1}.vcb

$(giztmp)/%.cooc: | $(giztmp)/%.snt
	@echo CREATING $@
	$(lock)
	$(snt2cooc) $@_ $(V1) $(V2) $| \
	  && mv $@_ $@
	$(unlock)
|
|
|
|
|
|
|
|
|
# Write the mgiza configuration file for one alignment direction.
#
# FROM / TO are set per direction below; the resource paths (V1, V2, SNT,
# COOC, ODIR) are derived from them.  The numeric settings (m1 ... NCPUS)
# come from the pattern-specific defaults attached to %/mgiza.cfg.
#
# SHELL=bash is inherited by the prerequisites built on behalf of this
# target, including the .snt recipe, which uses bash process substitution.
$(giztmp)/%/mgiza.cfg: SHELL=bash
# --- CORPUS RESOURCES ---------------------------------------------------------
$(giztmp)/%/mgiza.cfg: V1   = $(giztmp)/${FROM}.vcb
$(giztmp)/%/mgiza.cfg: V2   = $(giztmp)/${TO}.vcb
$(giztmp)/%/mgiza.cfg: SNT  = $(giztmp)/${FROM}-${TO}.snt
$(giztmp)/%/mgiza.cfg: COOC = $(giztmp)/${FROM}-${TO}.cooc
$(giztmp)/%/mgiza.cfg: ODIR = $(giztmp)/${FROM}-${TO}/${FROM}-${TO}

$(giztmp)/${L1}-${L2}/mgiza.cfg: FROM = ${L1}
$(giztmp)/${L1}-${L2}/mgiza.cfg: TO   = ${L2}
$(giztmp)/${L2}-${L1}/mgiza.cfg: FROM = ${L2}
$(giztmp)/${L2}-${L1}/mgiza.cfg: TO   = ${L1}

$(giztmp)/${L1}-${L2}/mgiza.cfg: | $(giztmp)/${L1}-${L2}.cooc
$(giztmp)/${L1}-${L2}/mgiza.cfg: | $(giztmp)/${L1}-${L2}.snt
$(giztmp)/${L2}-${L1}/mgiza.cfg: | $(giztmp)/${L2}-${L1}.cooc
$(giztmp)/${L2}-${L1}/mgiza.cfg: | $(giztmp)/${L2}-${L1}.snt

# The file is assembled in $@_ and renamed at the end.  All prerequisites are
# order-only, so an existing config is never rebuilt on its own; a partially
# written file would be mistaken for a complete one.  Redirecting with ">"
# also guarantees a fresh file, so a forced rebuild cannot append a second
# copy of the settings.
$(giztmp)/${L1}-${L2}/mgiza.cfg \
$(giztmp)/${L2}-${L1}/mgiza.cfg: | \
$(giztmp)/${L1}.vcb $(giztmp)/${L1}.vcb.classes \
$(giztmp)/${L2}.vcb $(giztmp)/${L2}.vcb.classes
	mkdir -p $(dir $@)
	@{ echo "s ${V1}" \
	&& echo "t ${V2}" \
	&& echo "c ${SNT}" \
	&& echo "cooc ${COOC}" \
	&& echo "m1 ${m1}" \
	&& echo "m2 ${m2}" \
	&& echo "mh ${mh}" \
	&& echo "m3 ${m3}" \
	&& echo "m4 ${m4}" \
	&& echo "t1 ${m1}" \
	&& echo "t2 ${m2}" \
	&& echo "th ${mh}" \
	&& echo "t3 ${m3}" \
	&& echo "t4 ${m4}" \
	&& echo "o ${ODIR}" \
	&& echo "model4smoothfactor ${model4smoothfactor}" \
	&& echo "onlyaldumps ${onlyaldumps}" \
	&& echo "nodumps ${nodumps}" \
	&& echo "nsmooth ${nsmooth}" \
	&& echo "NCPUS ${NCPUS}" ; } > $@_
	mv $@_ $@
|
|
|
|
|
# Parse-time sanity checks: warn (without aborting) when one of the two
# directory variables expands to the empty string.
# NOTE(review): with the definitions in this file both values always contain
# a literal path suffix, so these warnings can only fire if the variables are
# overridden; an empty $(WDIR) is not detected -- confirm whether it should be.
ifeq "$(gizaln)" ""
$(warning Giza base directory not defined)
endif

ifeq "$(gizaln.in)" ""
$(warning No directory for Giza++ training data specified!)
endif
|
|