mohdelgaar committed on
Commit
251ecbb
·
1 Parent(s): 26ab567

lng indices

Browse files
Files changed (2) hide show
  1. const.py +1 -0
  2. lng/L2SCA/analyzeText.py +87 -35
const.py CHANGED
@@ -1030,6 +1030,7 @@ used_indices = [
1030
  63, 64, 65, 66, 67, 68, 73, 121, 124, 129, 134, 136, 254,
1031
  257, 258, 261, 263, 272, 274
1032
  ]
 
1033
 
1034
  eval_indices = [4,5,6,18,257,272]
1035
  eval_indices = [used_indices.index(idx) for idx in eval_indices]
 
1030
  63, 64, 65, 66, 67, 68, 73, 121, 124, 129, 134, 136, 254,
1031
  257, 258, 261, 263, 272, 274
1032
  ]
1033
+ lftk_used_indices = [1, 7, 8, 9, 10, 11, 12, 17, 65, 68, 73, 78, 80, 198, 201, 202, 205, 207, 216, 218]
1034
 
1035
  eval_indices = [4,5,6,18,257,272]
1036
  eval_indices = [used_indices.index(idx) for idx in eval_indices]
lng/L2SCA/analyzeText.py CHANGED
@@ -62,40 +62,94 @@ patternlist=[s,vp,c,t,dc,ct,cp,cn1,cn2,cn3,fc,ft,vp_q]
62
  pre_path = 'lng/L2SCA'
63
 
64
  #location of the Stanford parser
65
- parserPath= os.path.join(pre_path, "stanford-parser-full-2014-01-04/lexparser.sh")
66
 
67
  def sca(input_text):
68
- inputFile = '/tmp/%s.txt'%next(tempfile._get_candidate_names())
69
- with open(inputFile, 'w') as f:
70
- f.write(input_text + '\n')
71
-
72
- #extract the name of the file being processed
73
  output = []
74
-
75
- #name a temporary file to hold the parse trees of the input file
76
- parsedFile=inputFile+".parsed"
77
-
78
- #parse the input file
79
- command=[parserPath, inputFile]
80
- with open(parsedFile, 'w') as f:
81
- subprocess.run(command, stdout = f,
82
- stderr = subprocess.DEVNULL
83
- )
84
-
85
-
86
- #list of counts of the patterns
87
- patterncount=[]
88
-
89
- #query the parse trees using the tregex patterns
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  for pattern in patternlist:
91
- command = [os.path.join(pre_path, "tregex.sh"), pattern, parsedFile, "-C", "-o"]
92
- out = subprocess.run(command, check = True, stdout = subprocess.PIPE, stderr = subprocess.DEVNULL)
93
- if len(out.stdout) > 0:
94
- count = int(out.stdout)
95
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  count = 0
97
  patterncount.append(count)
98
 
 
99
  #update frequencies of complex nominals, clauses, and T-units
100
  patterncount[7]=patterncount[-4]+patterncount[-5]+patterncount[-6]
101
  patterncount[2]=patterncount[2]+patterncount[-3]
@@ -103,10 +157,12 @@ def sca(input_text):
103
  patterncount[1]=patterncount[1]+patterncount[-1]
104
 
105
  #word count
106
- infile=open(parsedFile,"r")
107
- content=infile.read()
108
- w=len(re.findall("\([A-Z]+\$? [^\)\(]+\)",content))
109
- infile.close()
 
 
110
 
111
  #add frequencies of words and other structures to output string
112
  output.append(int(w))
@@ -139,8 +195,4 @@ def sca(input_text):
139
  #list of 24 comma-delimited fields
140
  # fields="Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C"
141
 
142
- #delete the temporary file holding the parse trees
143
- os.remove(inputFile)
144
- os.remove(parsedFile)
145
-
146
  return output
 
62
  pre_path = 'lng/L2SCA'
63
 
64
  #location of the Stanford parser
65
+ parserPath= os.path.join(pre_path, "stanford-parser-full-2014-01-04")
66
 
67
  def sca(input_text):
 
 
 
 
 
68
  output = []
69
+ # inputFile = '/tmp/%s.txt'%next(tempfile._get_candidate_names())
70
+ # with open(inputFile, 'w') as f:
71
+ # f.write(input_text + '\n')
72
+
73
+ # #extract the name of the file being processed
74
+ # output = []
75
+
76
+ # #name a temporary file to hold the parse trees of the input file
77
+ # parsedFile=inputFile+".parsed"
78
+
79
+ # #parse the input file
80
+ # command=[parserPath, inputFile]
81
+ # with open(parsedFile, 'w') as f:
82
+ # subprocess.run(command, stdout = f,
83
+ # stderr = subprocess.DEVNULL
84
+ # )
85
+
86
+
87
+ # #list of counts of the patterns
88
+ # patterncount=[]
89
+
90
+ # #query the parse trees using the tregex patterns
91
+ # for pattern in patternlist:
92
+ # command = [os.path.join(pre_path, "tregex.sh"), pattern, parsedFile, "-C", "-o"]
93
+ # out = subprocess.run(command, check = True, stdout = subprocess.PIPE, stderr = subprocess.DEVNULL)
94
+ # if len(out.stdout) > 0:
95
+ # count = int(out.stdout)
96
+ # else:
97
+ # count = 0
98
+ # patterncount.append(count)
99
+
100
+ # Parse directly into memory
101
+ stanford_parser_jar = os.path.join(parserPath, "stanford-parser.jar")
102
+ stanford_models_jar = os.path.join(parserPath, "stanford-parser-3.3.1-models.jar") #Correct the version
103
+
104
+ command = [
105
+ "java",
106
+ "-mx1500m",
107
+ "-cp",
108
+ f"{stanford_parser_jar}:{stanford_models_jar}:",
109
+ "edu.stanford.nlp.parser.lexparser.LexicalizedParser",
110
+ "-outputFormat", "penn", # Changed output format to penn
111
+ "-sentences", "newline",
112
+ "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
113
+ "-", # Read from standard input
114
+ ]
115
+
116
+ # Parse the input text.
117
+ process = subprocess.run(command, input=input_text.encode(), stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
118
+ parsed_output = process.stdout.decode()
119
+
120
+ # Word count
121
+ w = len(re.findall(r"\([A-Z]+\$? [^\)\(]+\)", parsed_output))
122
+
123
+ # For each tregex pattern in the list, run tregex in filter mode
124
+ # so that it reads the parsed trees from standard input.
125
+ patterncount = []
126
  for pattern in patternlist:
127
+ tregex_command = [
128
+ "java",
129
+ "-mx100m",
130
+ "-cp",
131
+ f"{pre_path}/stanford-tregex.jar",
132
+ "edu.stanford.nlp.trees.tregex.TregexPattern",
133
+ "-filter", # Use filter mode: read trees from stdin
134
+ "-C", # Suppress printing matches; only outputs the count
135
+ "-o", # Report each tree node only once as the root of a match
136
+ pattern # Supply the pattern as a command-line argument
137
+ ]
138
+ tregex_process = subprocess.run(
139
+ tregex_command,
140
+ input=parsed_output.encode(),
141
+ stdout=subprocess.PIPE,
142
+ stderr=subprocess.DEVNULL,
143
+ )
144
+ tregex_output = tregex_process.stdout.decode().strip()
145
+
146
+ try:
147
+ count = int(tregex_output)
148
+ except ValueError:
149
  count = 0
150
  patterncount.append(count)
151
 
152
+
153
  #update frequencies of complex nominals, clauses, and T-units
154
  patterncount[7]=patterncount[-4]+patterncount[-5]+patterncount[-6]
155
  patterncount[2]=patterncount[2]+patterncount[-3]
 
157
  patterncount[1]=patterncount[1]+patterncount[-1]
158
 
159
  #word count
160
+ # infile=open(parsedFile,"r")
161
+ # content=infile.read()
162
+ # w=len(re.findall("\([A-Z]+\$? [^\)\(]+\)",content))
163
+ # infile.close()
164
+ w = len(re.findall(r"\([A-Z]+\$? [^\)\(]+\)", parsed_output))
165
+
166
 
167
  #add frequencies of words and other structures to output string
168
  output.append(int(w))
 
195
  #list of 24 comma-delimited fields
196
  # fields="Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C"
197
 
 
 
 
 
198
  return output