Spaces:
Sleeping
Sleeping
Commit
·
251ecbb
1
Parent(s):
26ab567
lng indices
Browse files- const.py +1 -0
- lng/L2SCA/analyzeText.py +87 -35
const.py
CHANGED
@@ -1030,6 +1030,7 @@ used_indices = [
|
|
1030 |
63, 64, 65, 66, 67, 68, 73, 121, 124, 129, 134, 136, 254,
|
1031 |
257, 258, 261, 263, 272, 274
|
1032 |
]
|
|
|
1033 |
|
1034 |
eval_indices = [4,5,6,18,257,272]
|
1035 |
eval_indices = [used_indices.index(idx) for idx in eval_indices]
|
|
|
1030 |
63, 64, 65, 66, 67, 68, 73, 121, 124, 129, 134, 136, 254,
|
1031 |
257, 258, 261, 263, 272, 274
|
1032 |
]
|
1033 |
+
lftk_used_indices = [1, 7, 8, 9, 10, 11, 12, 17, 65, 68, 73, 78, 80, 198, 201, 202, 205, 207, 216, 218]
|
1034 |
|
1035 |
eval_indices = [4,5,6,18,257,272]
|
1036 |
eval_indices = [used_indices.index(idx) for idx in eval_indices]
|
lng/L2SCA/analyzeText.py
CHANGED
@@ -62,40 +62,94 @@ patternlist=[s,vp,c,t,dc,ct,cp,cn1,cn2,cn3,fc,ft,vp_q]
|
|
62 |
pre_path = 'lng/L2SCA'
|
63 |
|
64 |
#location of the Stanford parser
|
65 |
-
parserPath= os.path.join(pre_path, "stanford-parser-full-2014-01-04
|
66 |
|
67 |
def sca(input_text):
|
68 |
-
inputFile = '/tmp/%s.txt'%next(tempfile._get_candidate_names())
|
69 |
-
with open(inputFile, 'w') as f:
|
70 |
-
f.write(input_text + '\n')
|
71 |
-
|
72 |
-
#extract the name of the file being processed
|
73 |
output = []
|
74 |
-
|
75 |
-
#
|
76 |
-
|
77 |
-
|
78 |
-
#
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
#
|
87 |
-
|
88 |
-
|
89 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
for pattern in patternlist:
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
count = 0
|
97 |
patterncount.append(count)
|
98 |
|
|
|
99 |
#update frequencies of complex nominals, clauses, and T-units
|
100 |
patterncount[7]=patterncount[-4]+patterncount[-5]+patterncount[-6]
|
101 |
patterncount[2]=patterncount[2]+patterncount[-3]
|
@@ -103,10 +157,12 @@ def sca(input_text):
|
|
103 |
patterncount[1]=patterncount[1]+patterncount[-1]
|
104 |
|
105 |
#word count
|
106 |
-
infile=open(parsedFile,"r")
|
107 |
-
content=infile.read()
|
108 |
-
w=len(re.findall("\([A-Z]+\$? [^\)\(]+\)",content))
|
109 |
-
infile.close()
|
|
|
|
|
110 |
|
111 |
#add frequencies of words and other structures to output string
|
112 |
output.append(int(w))
|
@@ -139,8 +195,4 @@ def sca(input_text):
|
|
139 |
#list of 24 comma-delimited fields
|
140 |
# fields="Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C"
|
141 |
|
142 |
-
#delete the temporary file holding the parse trees
|
143 |
-
os.remove(inputFile)
|
144 |
-
os.remove(parsedFile)
|
145 |
-
|
146 |
return output
|
|
|
62 |
pre_path = 'lng/L2SCA'
|
63 |
|
64 |
#location of the Stanford parser
|
65 |
+
parserPath= os.path.join(pre_path, "stanford-parser-full-2014-01-04")
|
66 |
|
67 |
def sca(input_text):
|
|
|
|
|
|
|
|
|
|
|
68 |
output = []
|
69 |
+
# inputFile = '/tmp/%s.txt'%next(tempfile._get_candidate_names())
|
70 |
+
# with open(inputFile, 'w') as f:
|
71 |
+
# f.write(input_text + '\n')
|
72 |
+
|
73 |
+
# #extract the name of the file being processed
|
74 |
+
# output = []
|
75 |
+
|
76 |
+
# #name a temporary file to hold the parse trees of the input file
|
77 |
+
# parsedFile=inputFile+".parsed"
|
78 |
+
|
79 |
+
# #parse the input file
|
80 |
+
# command=[parserPath, inputFile]
|
81 |
+
# with open(parsedFile, 'w') as f:
|
82 |
+
# subprocess.run(command, stdout = f,
|
83 |
+
# stderr = subprocess.DEVNULL
|
84 |
+
# )
|
85 |
+
|
86 |
+
|
87 |
+
# #list of counts of the patterns
|
88 |
+
# patterncount=[]
|
89 |
+
|
90 |
+
# #query the parse trees using the tregex patterns
|
91 |
+
# for pattern in patternlist:
|
92 |
+
# command = [os.path.join(pre_path, "tregex.sh"), pattern, parsedFile, "-C", "-o"]
|
93 |
+
# out = subprocess.run(command, check = True, stdout = subprocess.PIPE, stderr = subprocess.DEVNULL)
|
94 |
+
# if len(out.stdout) > 0:
|
95 |
+
# count = int(out.stdout)
|
96 |
+
# else:
|
97 |
+
# count = 0
|
98 |
+
# patterncount.append(count)
|
99 |
+
|
100 |
+
# Parse directly into memory
|
101 |
+
stanford_parser_jar = os.path.join(parserPath, "stanford-parser.jar")
|
102 |
+
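stanford_models_jar = os.path.join(parserPath, "stanford-parser-3.3.1-models.jar")  # NOTE: the version in this filename must match the models jar shipped in the parser directory; update it if the parser is upgraded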
|
103 |
+
|
104 |
+
command = [
|
105 |
+
"java",
|
106 |
+
"-mx1500m",
|
107 |
+
"-cp",
|
108 |
+
f"{stanford_parser_jar}:{stanford_models_jar}:",
|
109 |
+
"edu.stanford.nlp.parser.lexparser.LexicalizedParser",
|
110 |
+
"-outputFormat", "penn", # Emit Penn Treebank bracketed trees, which the word-count regex and Tregex below consume
|
111 |
+
"-sentences", "newline",
|
112 |
+
"edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
|
113 |
+
"-", # Read from standard input
|
114 |
+
]
|
115 |
+
|
116 |
+
# Parse the input text.
|
117 |
+
process = subprocess.run(command, input=input_text.encode(), stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
|
118 |
+
parsed_output = process.stdout.decode()
|
119 |
+
|
120 |
+
# Word count
|
121 |
+
w = len(re.findall(r"\([A-Z]+\$? [^\)\(]+\)", parsed_output))
|
122 |
+
|
123 |
+
# For each tregex pattern in the list, run tregex in filter mode
|
124 |
+
# so that it reads the parsed trees from standard input.
|
125 |
+
patterncount = []
|
126 |
for pattern in patternlist:
|
127 |
+
tregex_command = [
|
128 |
+
"java",
|
129 |
+
"-mx100m",
|
130 |
+
"-cp",
|
131 |
+
f"{pre_path}/stanford-tregex.jar",
|
132 |
+
"edu.stanford.nlp.trees.tregex.TregexPattern",
|
133 |
+
"-filter", # Use filter mode: read trees from stdin
|
134 |
+
"-C", # Suppress printing of matches; output only the match count
|
135 |
+
"-o", # Report each tree node only once as the root of a match
|
136 |
+
pattern # Supply the pattern as a command-line argument
|
137 |
+
]
|
138 |
+
tregex_process = subprocess.run(
|
139 |
+
tregex_command,
|
140 |
+
input=parsed_output.encode(),
|
141 |
+
stdout=subprocess.PIPE,
|
142 |
+
stderr=subprocess.DEVNULL,
|
143 |
+
)
|
144 |
+
tregex_output = tregex_process.stdout.decode().strip()
|
145 |
+
|
146 |
+
try:
|
147 |
+
count = int(tregex_output)
|
148 |
+
except ValueError:
|
149 |
count = 0
|
150 |
patterncount.append(count)
|
151 |
|
152 |
+
|
153 |
#update frequencies of complex nominals, clauses, and T-units
|
154 |
patterncount[7]=patterncount[-4]+patterncount[-5]+patterncount[-6]
|
155 |
patterncount[2]=patterncount[2]+patterncount[-3]
|
|
|
157 |
patterncount[1]=patterncount[1]+patterncount[-1]
|
158 |
|
159 |
#word count
|
160 |
+
# infile=open(parsedFile,"r")
|
161 |
+
# content=infile.read()
|
162 |
+
# w=len(re.findall("\([A-Z]+\$? [^\)\(]+\)",content))
|
163 |
+
# infile.close()
|
164 |
+
w = len(re.findall(r"\([A-Z]+\$? [^\)\(]+\)", parsed_output))
|
165 |
+
|
166 |
|
167 |
#add frequencies of words and other structures to output string
|
168 |
output.append(int(w))
|
|
|
195 |
#list of 24 comma-delimited fields
|
196 |
# fields="Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C"
|
197 |
|
|
|
|
|
|
|
|
|
198 |
return output
|