Spaces:

mohdelgaar
/

LingConv

Sleeping

App Files Files Community

LingConv / lng /L2SCA /stanford-parser-full-2014-01-04 /bin /makeSerialized.csh

mohdelgaar

upload lng

b028d48 11 months ago

raw

history blame contribute delete

17.6 kB

	#!/bin/csh -f

	# This is the file we use to make the serialized grammars for the parser.
	# If you are on the Stanford NLP machines, you can use it to remake the
	# serialized parsers (such as when there have been incompatible software
	# changes). Don't forget to klog first so you can access the AFS corpora.
	#
	# If you are not on the Stanford NLP machines, then the script won't work
	# for you as is, since it contains hard-coded paths to various treebanks.
	# But it may still be useful to inspect it to see what options we used to
	# generate the various supplied grammars.
	#
	# NOTE: Output files in this script should ALWAYS use relative paths, so
	# that you can copy this script and run it in a different directory and
	# it will write output files there.
	#
	# usage:
	# cd /u/nlp/data/lexparser # to have files output in "usual" location
	# ./makeSerialized.csh
	#
	## Uncomment this bit to run it with older parser version
	# setenv CLASSPATH /u/nlp/distrib/lexparser-2004-03-24/javanlp.jar:

	# Abort early when JAVANLP_HOME is missing: the TreebankPreprocessor-based
	# sections further down build their helper-script and configuration paths
	# from $JAVANLP_HOME.
	if ( ! $?JAVANLP_HOME ) then
		echo 'JAVANLP_HOME is not set'
		echo 'Add a line like setenv JAVANLP_HOME $HOME/javanlp to your environment'
		# Exit non-zero so a caller can tell that nothing was built.
		exit 1
	endif

	# Treebank locations. These are hard-coded paths, so they only resolve on
	# machines that mount the same AFS / /u/nlp file systems.
	set wsjptb=/afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj
	# now ctb6
	set ctb=/afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed
	# now ctb7!
	set ctb7train=/u/nlp/data/chinese/ctb7/train.mrg
	set ctb7test=/u/nlp/data/chinese/ctb7/test.mrg
	set negra=/afs/ir/data/linguistic-data/NEGRA/penn-format-train-dev-test

	# Short host name (the part before the first dot); used in the log banners.
	# The pipe must be unescaped, otherwise "|", "cut", ... are handed to
	# hostname as arguments instead of forming a pipeline.
	set host=`hostname | cut -d. -f1`

	# Every English run needs the WSJ treebank, so stop now if it is unreadable.
	if ( ! -r $wsjptb ) then
		echo "Can't read WSJ PTB. Maybe you forgot to klog??"
		# Exit non-zero so a caller can tell that nothing was built.
		exit 1
	endif

	# Keep the previous log as a backup, but only when there is one to keep.
	if ( -e serializedParsers.log ) mv -f serializedParsers.log serializedParsers.bak
	# Start a fresh log with the current machine load.
	uptime > serializedParsers.log
	# csh aborts a script that expands an undefined variable, so only expand
	# CLASSPATH when it is actually set.
	if ( $?CLASSPATH ) then
		echo "Classpath is $CLASSPATH" >> serializedParsers.log
	else
		echo "Classpath is not set" >> serializedParsers.log
	endif

	# English WSJ 2-21 PCFG binary and text grammars
	#
	# Every run in this section trains on $wsjptb files 200-2199 and tests on
	# files 2200-2219. Each command is wrapped in a subshell together with a
	# banner echo and "time"; ">>&" appends both stdout and stderr of the whole
	# group to ./serializedParsers.log.

	( echo "Running wsjPCFG (goodPCFG) on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -goodPCFG -saveToSerializedFile wsjPCFG.ser.gz -saveToTextFile wsjPCFG.txt -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log

	# English noTagSplit no rule compaction PCFG text grammar
	# (only a text grammar, wsjPCFG-noTagSplit.txt, is written here)
	( echo "Running wsjPCFG-noTagSplit-noCompact on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -goodPCFG -noTagSplit -saveToTextFile wsjPCFG-noTagSplit.txt -compactGrammar 0 -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log

	# English WSJ 2-21 Factored binary

	## Not yet clear that goodFactored is better than -ijcai03 -- not on dev set
	## NOTE(review): the -evals value in the disabled command below lacks the
	## comma used by every active command ("factDA,tsv"); check before re-enabling.
	# ( echo "Running wsjFactored (goodFactored) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDAtsv" -goodFactored -saveToSerializedFile wsjFactored.ser.gz -saveToTextFile wsjFactored.txt -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
	# The -correctTags run is the one that writes wsjFactored.ser.gz and
	# wsjFactored.txt; the "replication" run after it has no -saveTo* option.
	( echo "Running wsjFactored (ijcai03 correctTags) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -ijcai03 -v -printStates -compactGrammar 0 -correctTags -saveToSerializedFile wsjFactored.ser.gz -saveToTextFile wsjFactored.txt -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
	( echo "Running wsjFactored (ijcai03 replication) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -ijcai03 -v -printStates -compactGrammar 0 -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log


	## "General English" models

	# english{Factored|PCFG} is currently trained on:
	# - WSJ sections 1-21
	# - Genia as reformatted by Andrew Clegg, his training split
	# - 2 English Chinese Translation Treebank and 3 English Arabic Translation
	# Treebank files backported to the original treebank annotation standards
	# (by us)
	# - 95 sentences parsed by us (mainly questions and imperatives; a few from
	# recent newswire).

	# /u/nlp/data/genia/sentences_cleaned.tree

	# The runs in this section that use the extra data share the same data
	# arguments: -train on the Treebank3Stanford WSJ files 100-2199,9000-9099,
	# -train2 on /u/nlp/data/lexparser/extraTrain 1-4000 with the value 0.5,
	# and -taggedFiles pointing at the tagger training file with "_" as the
	# tag separator. Output of each group is appended to ./serializedParsers.log.

	# "General English" Factored binary



	( echo "Running englishFactored (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -ijcai03 -saveToSerializedFile englishFactored.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log

	# "General English" PCFG binary

	( echo "Running englishPCFG (from treebank) on $host server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -saveToSerializedFile englishPCFG.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log


	# "General English" PCFG, case insensitive, binary
	# (same data as englishPCFG, but with LowercaseAndAmericanizeFunction as the
	# word function; this run does not pass -server)

	( echo "Running caseless englishPCFG (from treebank) on $host server" ; time java -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.LowercaseAndAmericanizeFunction -evals factDA,tsv -goodPCFG -saveToSerializedFile englishPCFG.caseless.ser.gz -maxLength 40 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 ) >>& ./serializedParsers.log


	# English WSJ 2-21 PCFG simplified grammar
	# This dumbed down parser is used by the RNN parser.
	# See /scr/nlp/data/dvparser for more details.
	# NOTE: the banner says "-server" but the java command below does not pass it.
	( echo "Running wsj pcfg (simplified for use in the RNN parser) on $host -server" ; time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 -saveToSerializedFile wsjPCFG.nocompact.simple.ser.gz -maxLength 40 -compactGrammar 0 -train /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219 ) >>& ./serializedParsers.log

	# English with extras PCFG simplified grammar
	# This dumbed down parser is used by the RNN parser.
	# See /scr/nlp/data/dvparser for more details.
	# NOTE: the banner says "-server" but the java command below does not pass it.
	( echo "Running english pcfg (simplified for use in the RNN parser) on $host -server" ; time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -wordFunction edu.stanford.nlp.process.AmericanizeFunction -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 -saveToSerializedFile englishPCFG.nocompact.simple.ser.gz -maxLength 40 -compactGrammar 0 -train /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj 100-2199,9000-9099 -train2 /u/nlp/data/lexparser/extraTrain 1-4000 0.5 -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english -testTreebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 2200-2219 ) >>& ./serializedParsers.log


	# Xinhua Mainland Chinese PCFG binary
	#
	# The Xinhua runs train on $ctb files 026-270,301-499,600-999 and test on
	# files 001-025; the "chinese*" runs use the $ctb7train / $ctb7test files.
	# As elsewhere, each group appends stdout and stderr to
	# ./serializedParsers.log.

	( echo "Running xinhuaPCFG on $host -server" ; time java -server -mx800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chinesePCFG -saveToSerializedFile xinhuaPCFG.ser.gz -maxLength 40 -train $ctb 026-270,301-499,600-999 -test $ctb 001-025 ) >>& ./serializedParsers.log
	# new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
	# newer train list (Galen and Huihsin): 026-270,301-499,600-999
	# this is all Xinhua minus Stanford devel and Bikel test

	# Xinhua Mainland Chinese Factored binary

	( echo "Running xinhuaFactored on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -acl03chinese -scTags -saveToSerializedFile xinhuaFactored.ser.gz -maxLength 40 -train $ctb 026-270,301-499,600-999 -test $ctb 001-025 ) >>& ./serializedParsers.log

	# Mixed dialect Chinese on lots of data (with chineseFactored)

	( echo "Running chineseFactored on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chineseFactored -saveToSerializedFile chineseFactored.ser.gz -maxLength 40 -train $ctb7train -test $ctb7test ) >>& ./serializedParsers.log
	# new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
	# newer train list (Galen and Huihsin): 026-270,301-499,600-999
	# this is all Xinhua minus Stanford devel and Bikel test
	# CTB files 001-499, 555-589,597-1000 are from newswire of
	# XinHua.
	# Files 500-554 are Information Services Department of HKSAR.
	# Files 590-596 and 1001-1151 are Sinorama articles, more of literature
	# nature and from Taiwan.
	# Files 2000-3145 are ACE broadcast news (from where?). We only use a few for now.

	# Mixed dialect Chinese PCFG on lots of data

	( echo "Running chinesePCFG on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chinesePCFG -useUnicodeType -saveToSerializedFile chinesePCFG.ser.gz -maxLength 40 -train $ctb7train -test $ctb7test ) >>& ./serializedParsers.log
	# new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
	# newer train list (Galen and Huihsin): 026-270,301-499,600-999
	# this is all Xinhua minus Stanford devel and Bikel test


	# Chinese parser for unsegmented Chinese
	# The first command trains and saves xinhuaFactoredSegmenting.ser.gz (no
	# test set is given); the second loads that file and parses the one-sentence
	# unsegmented sample as a smoke test, outside of "time" and without a banner.
	# NOTE(review): this command spells the option "-sctags" while the
	# xinhuaFactored command above uses "-scTags" -- confirm that the parser
	# accepts both spellings.

	( echo "Running xinhuaFactoredSegmenting on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -segmentMarkov -train $ctb 26-270,301-499,600-999 -sctags -acl03chinese -saveToSerializedFile xinhuaFactoredSegmenting.ser.gz ) >>& ./serializedParsers.log
	java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -encoding utf-8 xinhuaFactoredSegmenting.ser.gz /u/nlp/data/lexparser/chinese-onesent-unseg-utf8.txt >>& ./serializedParsers.log


	# It used to be the case that explicitly saying tLPP on command line was
	# needed for file encoding. But it has been fixed.
	# ( echo "Running xinhuaFactored from serialized check on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -maxLength 40 -loadFromSerializedFile xinhuaFactored.ser.gz -test $ctb 001-025 ) >>& ./serializedParsers.log
	# This now works
	# (reloads the xinhuaFactored.ser.gz written above and re-tests it on
	# $ctb 001-025 without passing -tLPP)
	( echo "Running xinhuaFactored from serialized (check without specifying -tLPP) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -maxLength 40 -loadFromSerializedFile xinhuaFactored.ser.gz -test $ctb 001-025 ) >>& ./serializedParsers.log

	# Simplified Chinese PCFG for the RNN parser: -chineseFactored combined with
	# -PCFG and -compactGrammar 0, saved as chinesePCFG-simple.ser.gz.
	( echo "Running chinesePCFG (simplified for use in the RNN parser) on $host -server" ; time java -server -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -chineseFactored -PCFG -compactGrammar 0 -saveToSerializedFile chinesePCFG-simple.ser.gz -maxLength 40 -train $ctb7train -test $ctb7test ) >>& ./serializedParsers.log

	# German Factored binary from Negra (version 2)
	# $negra 3 is the dev set
	# Both active German runs train on "$negra 1" and test on "$negra 3", and
	# append their output to ./serializedParsers.log.

	( echo "Running germanFactored on $host -server" ; time java -server -mx5g edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 2 -maxLength 40 -nodeCleanup 2 -saveToSerializedFile germanFactored.ser.gz -train $negra 1 -test $negra 3 ) >>& ./serializedParsers.log

	# German PCFG from Negra (version 2)
	# (differs from the factored run by -PCFG, -v, -evals tsv and
	# -unknownSuffixSize 1, and by the smaller -mx2g heap)

	( echo "Running germanPCFG on $host -server" ; time java -server -mx2g edu.stanford.nlp.parser.lexparser.LexicalizedParser -v -evals tsv -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -PCFG -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 1 -maxLength 40 -nodeCleanup 2 -saveToSerializedFile germanPCFG.ser.gz -train $negra 1 -test $negra 3 ) >>& ./serializedParsers.log

	# German Dependency parser
	# This requires normalizing the dependency output to strip boundary symbol.
	# (disabled; kept for reference)
	# ( echo "Running germanDep on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -dep -hMarkov 1 -maxLength 40 -saveToSerializedFile germanDep.ser.gz -train $negra 1 -test $negra 3 ) >>& ./serializedParsers.log



	########
	# The languages below this line use TreebankPreprocessor for pre-processing prior to training
	########
	# Shared settings for the pre-processed languages: where the generated tree
	# files go, and the two helper scripts under $JAVANLP_HOME that build the
	# trees and run training/testing.
	set mydir=`pwd`
	set data_dir=/u/nlp/data/lexparser/trees
	set tree_pipe=$JAVANLP_HOME/projects/core/scripts/run-tb-preproc
	set train_sh=$JAVANLP_HOME/projects/core/scripts/lexparser-lang-train-test.sh

	# -p creates any missing parent directories as well, so a fresh checkout of
	# the data area does not make this step fail.
	if ( ! -e $data_dir ) then
		mkdir -p $data_dir
	endif

	########
	# ARABIC
	########
	# Output directory for the pre-processed trees, the preprocessor
	# configuration file, and the argument list handed to the training script.
	set ar_data_dir=$data_dir/Arabic
	set ar_conf_file=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/arabic/pipeline/configurations/atb-latest.conf
	set ar_train_args="Arabic 40 $ar_data_dir/2-Unvoc-All.utf8.txt $ar_data_dir/2-Unvoc-Dev.utf8.txt BASELINE_ar -saveToSerializedFile arabicFactored.ser.gz"

	# -p also creates $data_dir when it is missing, so this section does not
	# depend on the parent directory having been created earlier.
	if ( ! -e $ar_data_dir ) then
		mkdir -p $ar_data_dir
	endif

	# Record the preprocessing command in the main log; the preprocessor's own
	# output (stdout and stderr) goes to build.log inside the data directory.
	echo Running $tree_pipe -p $ar_data_dir -v $ar_conf_file >>& ./serializedParsers.log
	$tree_pipe -p $ar_data_dir -v $ar_conf_file >& $ar_data_dir/build.log

	echo "" >>& ./serializedParsers.log
	# $ar_train_args is deliberately unquoted so that it splits into separate
	# arguments for the training script.
	( echo "Training Arabic Factored grammar using baseline feature set" ; time $train_sh $ar_train_args ) >>& ./serializedParsers.log


	########
	# FRENCH
	########
	# Output directory for the pre-processed trees, the preprocessor
	# configuration file, and the argument list handed to the training script.
	set fr_data_dir=$data_dir/French
	set fr_conf_file=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/french/pipeline/configurations/ftb-latest.conf
	set fr_train_args="French 40 $fr_data_dir/FTB-All.utf8.txt $fr_data_dir/FTB-Dev.utf8.txt BASELINE_fr -saveToSerializedFile frenchFactored.ser.gz"

	# -p also creates $data_dir when it is missing, so this section does not
	# depend on the parent directory having been created earlier.
	if ( ! -e $fr_data_dir ) then
		mkdir -p $fr_data_dir
	endif

	# Record the preprocessing command in the main log; the preprocessor's own
	# output (stdout and stderr) goes to build.log inside the data directory.
	echo Running $tree_pipe -p $fr_data_dir -v $fr_conf_file >>& ./serializedParsers.log
	$tree_pipe -p $fr_data_dir -v $fr_conf_file >& $fr_data_dir/build.log

	echo "" >>& ./serializedParsers.log
	# Log the exact training command before running it.
	echo time $train_sh $fr_train_args >>& ./serializedParsers.log
	# $fr_train_args is deliberately unquoted so that it splits into separate
	# arguments for the training script.
	( echo "Training French Factored grammar using baseline feature set" ; time $train_sh $fr_train_args ) >>& ./serializedParsers.log




	## English just to check parser code regression (not saved)
	## None of the commands in this section passes a -saveTo* option; they only
	## add evaluation output to ./serializedParsers.log.

	## Just for reference
	( echo "Running wsjPCFG (acl03pcfg replication) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -acl03pcfg -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log

	## See if same results from serialized parser
	## (loads wsjFactored.ser.gz from the current directory and tests it on
	## $wsjptb 2200-2219)
	( echo "Running wsjFactored (ijcai03 from serialized) on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -v -maxLength 40 -loadFromSerializedFile wsjFactored.ser.gz -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
	# ( echo "Running wsjFactored (ijcai03 with nodeprune) on $host -server" ; time java -server -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -ijcai03 -v -compactGrammar 0 -nodePrune true -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log

	## See if same results from text grammar parser
	## (same test, but loading the text grammar wsjFactored.txt instead)
	( echo "Running wsjFactored (ijcai03 from textGrammar) on $host -server" ; time java -server -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -v -maxLength 40 -loadFromTextFile wsjFactored.txt -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log

	# Close the log with the machine load at the end of the run.
	uptime >> serializedParsers.log

	# Rotate the performance summaries: current -> last -> 2ndlast.
	mv -f serializedParsersPerformance.last serializedParsersPerformance.2ndlast
	mv -f serializedParsersPerformance.current serializedParsersPerformance.last
	# Header line: who ran the parsers and when (echo -n keeps the date on the
	# same line).
	echo -n "Parser run by $USER on " > serializedParsersPerformance.current
	date >> serializedParsersPerformance.current
	# Pull the banner and result lines out of the full log. Extended regular
	# expressions are used so that a plain "|" means alternation; a doubled
	# backslash before "|" would only match a literal backslash and leave the
	# summary empty.
	grep -E 'N: 253|N: 393|Done testing on treebank|Running | summary ' serializedParsers.log >> serializedParsersPerformance.current
	# Two blank lines separate consecutive runs in the cumulative file.
	echo >> serializedParsersPerformance.current
	echo >> serializedParsersPerformance.current

	# Append this run to the cumulative history.
	cat serializedParsersPerformance.current >> serializedParsersPerformance.txt

	# Rotate copies of the full logs the same way: current -> last -> 2ndlast.
	cp -f serializedParsers.last serializedParsers.2ndlast
	cp -f serializedParsers.current serializedParsers.last
	cp -f serializedParsers.log serializedParsers.current