Spaces:
Runtime error
Runtime error
Delete src/distinct_n
Browse files- src/distinct_n/.gitignore +0 -58
- src/distinct_n/.idea/Distinct-N.iml +0 -11
- src/distinct_n/.idea/encodings.xml +0 -4
- src/distinct_n/.idea/misc.xml +0 -7
- src/distinct_n/.idea/modules.xml +0 -8
- src/distinct_n/.idea/other.xml +0 -6
- src/distinct_n/.idea/vcs.xml +0 -6
- src/distinct_n/.idea/webResources.xml +0 -14
- src/distinct_n/A Diversity-Promoting Objective Function for Neural Conversation Models.pdf +0 -0
- src/distinct_n/LICENSE.txt +0 -202
- src/distinct_n/README.md +0 -30
- src/distinct_n/bin/distinct_metric.py +0 -29
- src/distinct_n/bin/score.sh +0 -6
- src/distinct_n/distinct_n/metrics.py +0 -33
- src/distinct_n/distinct_n/test.py +0 -32
- src/distinct_n/distinct_n/utils.py +0 -90
- src/distinct_n/setup.py +0 -29
- src/distinct_n/testdata/bigram.txt +0 -1
- src/distinct_n/testdata/unigram.txt +0 -1
src/distinct_n/.gitignore
DELETED
@@ -1,58 +0,0 @@
|
|
1 |
-
# Byte-compiled / optimized / DLL files
|
2 |
-
__pycache__/
|
3 |
-
*.py[cod]
|
4 |
-
state.py
|
5 |
-
|
6 |
-
# C extensions
|
7 |
-
*.so
|
8 |
-
|
9 |
-
# Distribution / packaging
|
10 |
-
.Python
|
11 |
-
env/
|
12 |
-
build/
|
13 |
-
develop-eggs/
|
14 |
-
dist/
|
15 |
-
downloads/
|
16 |
-
eggs/
|
17 |
-
.eggs/
|
18 |
-
lib/
|
19 |
-
lib64/
|
20 |
-
parts/
|
21 |
-
sdist/
|
22 |
-
var/
|
23 |
-
*.egg-info/
|
24 |
-
.installed.cfg
|
25 |
-
*.egg
|
26 |
-
|
27 |
-
# PyInstaller
|
28 |
-
# Usually these files are written by a python script from a template
|
29 |
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
30 |
-
*.manifest
|
31 |
-
*.spec
|
32 |
-
|
33 |
-
# Installer logs
|
34 |
-
pip-log.txt
|
35 |
-
pip-delete-this-directory.txt
|
36 |
-
|
37 |
-
# Unit test / coverage reports
|
38 |
-
htmlcov/
|
39 |
-
.tox/
|
40 |
-
.coverage
|
41 |
-
.coverage.*
|
42 |
-
.cache
|
43 |
-
nosetests.xml
|
44 |
-
coverage.xml
|
45 |
-
*,cover
|
46 |
-
|
47 |
-
# Translations
|
48 |
-
*.mo
|
49 |
-
*.pot
|
50 |
-
|
51 |
-
# Django stuff:
|
52 |
-
*.log
|
53 |
-
|
54 |
-
# Sphinx documentation
|
55 |
-
docs/_build/
|
56 |
-
|
57 |
-
# PyBuilder
|
58 |
-
target/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/.idea/Distinct-N.iml
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
-
<module type="PYTHON_MODULE" version="4">
|
3 |
-
<component name="NewModuleRootManager">
|
4 |
-
<content url="file://$MODULE_DIR$">
|
5 |
-
<sourceFolder url="file://$MODULE_DIR$/distinct_n" isTestSource="false" />
|
6 |
-
<excludeFolder url="file://$MODULE_DIR$/docs" />
|
7 |
-
</content>
|
8 |
-
<orderEntry type="jdk" jdkName="Python 3.6 (Metrics)" jdkType="Python SDK" />
|
9 |
-
<orderEntry type="sourceFolder" forTests="false" />
|
10 |
-
</component>
|
11 |
-
</module>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/.idea/encodings.xml
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
-
<project version="4">
|
3 |
-
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
|
4 |
-
</project>
|
|
|
|
|
|
|
|
|
|
src/distinct_n/.idea/misc.xml
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
-
<project version="4">
|
3 |
-
<component name="JavaScriptSettings">
|
4 |
-
<option name="languageLevel" value="ES6" />
|
5 |
-
</component>
|
6 |
-
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (tensorflow)" project-jdk-type="Python SDK" />
|
7 |
-
</project>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/.idea/modules.xml
DELETED
@@ -1,8 +0,0 @@
|
|
1 |
-
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
-
<project version="4">
|
3 |
-
<component name="ProjectModuleManager">
|
4 |
-
<modules>
|
5 |
-
<module fileurl="file://$PROJECT_DIR$/.idea/Distinct-N.iml" filepath="$PROJECT_DIR$/.idea/Distinct-N.iml" />
|
6 |
-
</modules>
|
7 |
-
</component>
|
8 |
-
</project>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/.idea/other.xml
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
-
<project version="4">
|
3 |
-
<component name="PySciProjectComponent">
|
4 |
-
<option name="PY_SCI_VIEW_SUGGESTED" value="true" />
|
5 |
-
</component>
|
6 |
-
</project>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/.idea/vcs.xml
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
-
<project version="4">
|
3 |
-
<component name="VcsDirectoryMappings">
|
4 |
-
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
5 |
-
</component>
|
6 |
-
</project>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/.idea/webResources.xml
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
-
<project version="4">
|
3 |
-
<component name="WebResourcesPaths">
|
4 |
-
<contentEntries>
|
5 |
-
<entry url="file://$PROJECT_DIR$">
|
6 |
-
<entryData>
|
7 |
-
<resourceRoots>
|
8 |
-
<path value="file://$PROJECT_DIR$/testdata" />
|
9 |
-
</resourceRoots>
|
10 |
-
</entryData>
|
11 |
-
</entry>
|
12 |
-
</contentEntries>
|
13 |
-
</component>
|
14 |
-
</project>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/A Diversity-Promoting Objective Function for Neural Conversation Models.pdf
DELETED
Binary file (200 kB)
|
|
src/distinct_n/LICENSE.txt
DELETED
@@ -1,202 +0,0 @@
|
|
1 |
-
|
2 |
-
Apache License
|
3 |
-
Version 2.0, January 2004
|
4 |
-
http://www.apache.org/licenses/
|
5 |
-
|
6 |
-
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
7 |
-
|
8 |
-
1. Definitions.
|
9 |
-
|
10 |
-
"License" shall mean the terms and conditions for use, reproduction,
|
11 |
-
and distribution as defined by Sections 1 through 9 of this document.
|
12 |
-
|
13 |
-
"Licensor" shall mean the copyright owner or entity authorized by
|
14 |
-
the copyright owner that is granting the License.
|
15 |
-
|
16 |
-
"Legal Entity" shall mean the union of the acting entity and all
|
17 |
-
other entities that control, are controlled by, or are under common
|
18 |
-
control with that entity. For the purposes of this definition,
|
19 |
-
"control" means (i) the power, direct or indirect, to cause the
|
20 |
-
direction or management of such entity, whether by contract or
|
21 |
-
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
22 |
-
outstanding shares, or (iii) beneficial ownership of such entity.
|
23 |
-
|
24 |
-
"You" (or "Your") shall mean an individual or Legal Entity
|
25 |
-
exercising permissions granted by this License.
|
26 |
-
|
27 |
-
"Source" form shall mean the preferred form for making modifications,
|
28 |
-
including but not limited to software source code, documentation
|
29 |
-
source, and configuration files.
|
30 |
-
|
31 |
-
"Object" form shall mean any form resulting from mechanical
|
32 |
-
transformation or translation of a Source form, including but
|
33 |
-
not limited to compiled object code, generated documentation,
|
34 |
-
and conversions to other media types.
|
35 |
-
|
36 |
-
"Work" shall mean the work of authorship, whether in Source or
|
37 |
-
Object form, made available under the License, as indicated by a
|
38 |
-
copyright notice that is included in or attached to the work
|
39 |
-
(an example is provided in the Appendix below).
|
40 |
-
|
41 |
-
"Derivative Works" shall mean any work, whether in Source or Object
|
42 |
-
form, that is based on (or derived from) the Work and for which the
|
43 |
-
editorial revisions, annotations, elaborations, or other modifications
|
44 |
-
represent, as a whole, an original work of authorship. For the purposes
|
45 |
-
of this License, Derivative Works shall not include works that remain
|
46 |
-
separable from, or merely link (or bind by name) to the interfaces of,
|
47 |
-
the Work and Derivative Works thereof.
|
48 |
-
|
49 |
-
"Contribution" shall mean any work of authorship, including
|
50 |
-
the original version of the Work and any modifications or additions
|
51 |
-
to that Work or Derivative Works thereof, that is intentionally
|
52 |
-
submitted to Licensor for inclusion in the Work by the copyright owner
|
53 |
-
or by an individual or Legal Entity authorized to submit on behalf of
|
54 |
-
the copyright owner. For the purposes of this definition, "submitted"
|
55 |
-
means any form of electronic, verbal, or written communication sent
|
56 |
-
to the Licensor or its representatives, including but not limited to
|
57 |
-
communication on electronic mailing lists, source code control systems,
|
58 |
-
and issue tracking systems that are managed by, or on behalf of, the
|
59 |
-
Licensor for the purpose of discussing and improving the Work, but
|
60 |
-
excluding communication that is conspicuously marked or otherwise
|
61 |
-
designated in writing by the copyright owner as "Not a Contribution."
|
62 |
-
|
63 |
-
"Contributor" shall mean Licensor and any individual or Legal Entity
|
64 |
-
on behalf of whom a Contribution has been received by Licensor and
|
65 |
-
subsequently incorporated within the Work.
|
66 |
-
|
67 |
-
2. Grant of Copyright License. Subject to the terms and conditions of
|
68 |
-
this License, each Contributor hereby grants to You a perpetual,
|
69 |
-
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
70 |
-
copyright license to reproduce, prepare Derivative Works of,
|
71 |
-
publicly display, publicly perform, sublicense, and distribute the
|
72 |
-
Work and such Derivative Works in Source or Object form.
|
73 |
-
|
74 |
-
3. Grant of Patent License. Subject to the terms and conditions of
|
75 |
-
this License, each Contributor hereby grants to You a perpetual,
|
76 |
-
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
77 |
-
(except as stated in this section) patent license to make, have made,
|
78 |
-
use, offer to sell, sell, import, and otherwise transfer the Work,
|
79 |
-
where such license applies only to those patent claims licensable
|
80 |
-
by such Contributor that are necessarily infringed by their
|
81 |
-
Contribution(s) alone or by combination of their Contribution(s)
|
82 |
-
with the Work to which such Contribution(s) was submitted. If You
|
83 |
-
institute patent litigation against any entity (including a
|
84 |
-
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
85 |
-
or a Contribution incorporated within the Work constitutes direct
|
86 |
-
or contributory patent infringement, then any patent licenses
|
87 |
-
granted to You under this License for that Work shall terminate
|
88 |
-
as of the date such litigation is filed.
|
89 |
-
|
90 |
-
4. Redistribution. You may reproduce and distribute copies of the
|
91 |
-
Work or Derivative Works thereof in any medium, with or without
|
92 |
-
modifications, and in Source or Object form, provided that You
|
93 |
-
meet the following conditions:
|
94 |
-
|
95 |
-
(a) You must give any other recipients of the Work or
|
96 |
-
Derivative Works a copy of this License; and
|
97 |
-
|
98 |
-
(b) You must cause any modified files to carry prominent notices
|
99 |
-
stating that You changed the files; and
|
100 |
-
|
101 |
-
(c) You must retain, in the Source form of any Derivative Works
|
102 |
-
that You distribute, all copyright, patent, trademark, and
|
103 |
-
attribution notices from the Source form of the Work,
|
104 |
-
excluding those notices that do not pertain to any part of
|
105 |
-
the Derivative Works; and
|
106 |
-
|
107 |
-
(d) If the Work includes a "NOTICE" text file as part of its
|
108 |
-
distribution, then any Derivative Works that You distribute must
|
109 |
-
include a readable copy of the attribution notices contained
|
110 |
-
within such NOTICE file, excluding those notices that do not
|
111 |
-
pertain to any part of the Derivative Works, in at least one
|
112 |
-
of the following places: within a NOTICE text file distributed
|
113 |
-
as part of the Derivative Works; within the Source form or
|
114 |
-
documentation, if provided along with the Derivative Works; or,
|
115 |
-
within a display generated by the Derivative Works, if and
|
116 |
-
wherever such third-party notices normally appear. The contents
|
117 |
-
of the NOTICE file are for informational purposes only and
|
118 |
-
do not modify the License. You may add Your own attribution
|
119 |
-
notices within Derivative Works that You distribute, alongside
|
120 |
-
or as an addendum to the NOTICE text from the Work, provided
|
121 |
-
that such additional attribution notices cannot be construed
|
122 |
-
as modifying the License.
|
123 |
-
|
124 |
-
You may add Your own copyright statement to Your modifications and
|
125 |
-
may provide additional or different license terms and conditions
|
126 |
-
for use, reproduction, or distribution of Your modifications, or
|
127 |
-
for any such Derivative Works as a whole, provided Your use,
|
128 |
-
reproduction, and distribution of the Work otherwise complies with
|
129 |
-
the conditions stated in this License.
|
130 |
-
|
131 |
-
5. Submission of Contributions. Unless You explicitly state otherwise,
|
132 |
-
any Contribution intentionally submitted for inclusion in the Work
|
133 |
-
by You to the Licensor shall be under the terms and conditions of
|
134 |
-
this License, without any additional terms or conditions.
|
135 |
-
Notwithstanding the above, nothing herein shall supersede or modify
|
136 |
-
the terms of any separate license agreement you may have executed
|
137 |
-
with Licensor regarding such Contributions.
|
138 |
-
|
139 |
-
6. Trademarks. This License does not grant permission to use the trade
|
140 |
-
names, trademarks, service marks, or product names of the Licensor,
|
141 |
-
except as required for reasonable and customary use in describing the
|
142 |
-
origin of the Work and reproducing the content of the NOTICE file.
|
143 |
-
|
144 |
-
7. Disclaimer of Warranty. Unless required by applicable law or
|
145 |
-
agreed to in writing, Licensor provides the Work (and each
|
146 |
-
Contributor provides its Contributions) on an "AS IS" BASIS,
|
147 |
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
148 |
-
implied, including, without limitation, any warranties or conditions
|
149 |
-
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
150 |
-
PARTICULAR PURPOSE. You are solely responsible for determining the
|
151 |
-
appropriateness of using or redistributing the Work and assume any
|
152 |
-
risks associated with Your exercise of permissions under this License.
|
153 |
-
|
154 |
-
8. Limitation of Liability. In no event and under no legal theory,
|
155 |
-
whether in tort (including negligence), contract, or otherwise,
|
156 |
-
unless required by applicable law (such as deliberate and grossly
|
157 |
-
negligent acts) or agreed to in writing, shall any Contributor be
|
158 |
-
liable to You for damages, including any direct, indirect, special,
|
159 |
-
incidental, or consequential damages of any character arising as a
|
160 |
-
result of this License or out of the use or inability to use the
|
161 |
-
Work (including but not limited to damages for loss of goodwill,
|
162 |
-
work stoppage, computer failure or malfunction, or any and all
|
163 |
-
other commercial damages or losses), even if such Contributor
|
164 |
-
has been advised of the possibility of such damages.
|
165 |
-
|
166 |
-
9. Accepting Warranty or Additional Liability. While redistributing
|
167 |
-
the Work or Derivative Works thereof, You may choose to offer,
|
168 |
-
and charge a fee for, acceptance of support, warranty, indemnity,
|
169 |
-
or other liability obligations and/or rights consistent with this
|
170 |
-
License. However, in accepting such obligations, You may act only
|
171 |
-
on Your own behalf and on Your sole responsibility, not on behalf
|
172 |
-
of any other Contributor, and only if You agree to indemnify,
|
173 |
-
defend, and hold each Contributor harmless for any liability
|
174 |
-
incurred by, or claims asserted against, such Contributor by reason
|
175 |
-
of your accepting any such warranty or additional liability.
|
176 |
-
|
177 |
-
END OF TERMS AND CONDITIONS
|
178 |
-
|
179 |
-
APPENDIX: How to apply the Apache License to your work.
|
180 |
-
|
181 |
-
To apply the Apache License to your work, attach the following
|
182 |
-
boilerplate notice, with the fields enclosed by brackets "[]"
|
183 |
-
replaced with your own identifying information. (Don't include
|
184 |
-
the brackets!) The text should be enclosed in the appropriate
|
185 |
-
comment syntax for the file format. We also recommend that a
|
186 |
-
file or class name and description of purpose be included on the
|
187 |
-
same "printed page" as the copyright notice for easier
|
188 |
-
identification within third-party archives.
|
189 |
-
|
190 |
-
Copyright [yyyy] [name of copyright owner]
|
191 |
-
|
192 |
-
Licensed under the Apache License, Version 2.0 (the "License");
|
193 |
-
you may not use this file except in compliance with the License.
|
194 |
-
You may obtain a copy of the License at
|
195 |
-
|
196 |
-
http://www.apache.org/licenses/LICENSE-2.0
|
197 |
-
|
198 |
-
Unless required by applicable law or agreed to in writing, software
|
199 |
-
distributed under the License is distributed on an "AS IS" BASIS,
|
200 |
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
201 |
-
See the License for the specific language governing permissions and
|
202 |
-
limitations under the License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/README.md
DELETED
@@ -1,30 +0,0 @@
|
|
1 |
-
# Distinct-N
|
2 |
-
Distinct-N, most notably distinct-1 and distinct-2, is a metric that measures the
|
3 |
-
diversity of a sentence. It focuses on the number of *distinct* n-grams in a sentence and thus
|
4 |
-
penalizes sentences with lots of repeated words. The metric is free of any *reference* or *ground truth*
|
5 |
-
sentence and depends entirely on the properties of the sentence itself (as generated by the system).
|
6 |
-
It was proposed by Jiwei Li et al. in the paper *A Diversity-Promoting Objective Function for Neural Conversation Models*.
|
7 |
-
|
8 |
-
# Definitions
|
9 |
-
The original paper coined *Distinct-N* as:
|
10 |
-
|
11 |
-
We report degree of diversity by calculating the number of distinct unigrams and bigrams in generated responses.
|
12 |
-
The value is scaled by total number of generated tokens to avoid favoring long sentences
|
13 |
-
|
14 |
-
which is exactly what we have mentioned before.
|
15 |
-
|
16 |
-
# Usage
|
17 |
-
```bash
|
18 |
-
$ python distinct_metric.py -n N_NGRAMS PREDICTION
|
19 |
-
```
|
20 |
-
|
21 |
-
|
22 |
-
where `N_NGRAMS` is the length of the token sequence to count as unique within one sentence.
|
23 |
-
`PREDICTION` is the prediction or response your model generates with one utterance (sentence) per line.
|
24 |
-
|
25 |
-
|
26 |
-
# Dependencies
|
27 |
-
`python>=3.6.1`
|
28 |
-
|
29 |
-
# References
|
30 |
-
[1] A Diversity-Promoting Objective Function for Neural Conversation Models
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/bin/distinct_metric.py
DELETED
@@ -1,29 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
import logging
|
3 |
-
|
4 |
-
from distinct_n import distinct_n_sentence_level
|
5 |
-
from pathlib import Path
|
6 |
-
from agenda.metric_helper import write_score
|
7 |
-
|
8 |
-
NAME = 'distinct_n'

if __name__ == '__main__':
    # Command-line entry point: read a tokenised hypothesis file and hand the
    # per-sentence distinct-N results to write_score, once for every requested n.
    parser = argparse.ArgumentParser()
    parser.add_argument('hypothesis', help="predicted text file, one example per line")
    # -n and --output_dir are required: without them the loop below would
    # iterate over None and Path(None) would raise a TypeError.
    parser.add_argument('-n', dest='n_range', type=int, nargs='+', required=True,
                        help="n to use as in distinct-N")
    parser.add_argument('--output_dir', required=True,
                        help="directory used to build the output path of each score file")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logging.info('loading hypothesis file...')
    with open(args.hypothesis) as f:
        # One whitespace-tokenised sentence per line of the input file.
        hypothesis = [sentence.split() for sentence in f]

    output_dir = Path(args.output_dir)
    for n in args.n_range:
        write_score(
            name=NAME,
            # Output path is <output_dir>/distinct_n_<n>.json
            output=output_dir.joinpath(f'{NAME}_{n}').with_suffix('.json'),
            params={'n': n},
            scores=[distinct_n_sentence_level(s, n) for s in hypothesis],
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/bin/score.sh
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
#!/usr/bin/env bash
# Runs bin/distinct_metric.py with n=3 on a fixed hypothesis file.
# NOTE: both paths below are machine-specific; adjust them before running.

HYPO=/home/cgsdfc/UbuntuDialogueCorpus/ResponseContextPairs/ModelPredictions/VHRED/First_VHRED_BeamSearch_5_GeneratedTestResponses.txt_First.txt
DIR=/home/cgsdfc/Result/Test

# Quote the expansions so paths containing spaces or glob characters stay intact.
python bin/distinct_metric.py --output_dir "$DIR" "$HYPO" -n 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/distinct_n/metrics.py
DELETED
@@ -1,33 +0,0 @@
|
|
1 |
-
from src.distinct_n.distinct_n.utils import ngrams
|
2 |
-
|
3 |
-
__all__ = ["distinct_n_sentence_level", "distinct_n_corpus_level"]
|
4 |
-
|
5 |
-
|
6 |
-
def distinct_n_sentence_level(sentence, n):
    """
    Collect the distinct n-grams of a single sentence.

    :param sentence: a list of words.
    :param n: int, ngram.
    :return: list of the distinct n-grams found in the sentence (order is
        unspecified because they pass through a set); an empty list when the
        sentence is empty or has fewer than ``n`` words.
    """
    # Return an empty list (not a float) so callers can always treat the
    # result as a collection, and never ask ngrams() for more items than the
    # sentence holds.
    if len(sentence) == 0 or len(sentence) < n:
        return []
    return list(set(ngrams(sentence, n)))
|
19 |
-
|
20 |
-
|
21 |
-
def distinct_n_corpus_level(sentences, n):
    """
    Compute distinct-N over a list of sentences (the corpus).

    The value is the number of distinct n-grams found across all sentences
    divided by the total number of words in the corpus.

    :param sentences: a list of sentence.
    :param n: int, ngram.
    :return: float, the metric value; 0.0 when the corpus contains no words.
    """
    distinct = set()
    length = 0
    for sentence in sentences:
        length += len(sentence)
        # Sentences that are empty or shorter than n contribute words to the
        # denominator but no n-grams, so there is nothing to collect.
        if len(sentence) == 0 or len(sentence) < n:
            continue
        distinct.update(distinct_n_sentence_level(sentence, n))
    if length == 0:
        return 0.0  # Prevent a zero division
    return len(distinct) / length
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/distinct_n/test.py
DELETED
@@ -1,32 +0,0 @@
|
|
1 |
-
import unittest
|
2 |
-
|
3 |
-
from distinct_n import distinct_n_sentence_level
|
4 |
-
from distinct_n import distinct_n_corpus_level
|
5 |
-
|
6 |
-
|
7 |
-
class TestDistinctN(unittest.TestCase):
    """Checks distinct-N values for single sentences and for a small corpus."""

    # NOTE(review): every expectation below is a float ratio. Confirm that the
    # functions exported by the `distinct_n` package return ratios; the
    # sentence-level function in distinct_n/metrics.py returns a list.

    def test_unigram(self):
        # (sentence text, expected distinct-1 value)
        cases = (
            ("the the the the the", 0.2),
            ("the the the the cat", 0.4),
        )
        for text, expected in cases:
            self.assertAlmostEqual(
                distinct_n_sentence_level(text.split(), 1), expected
            )

    def test_bigram(self):
        tokens = "the cat sat on the".split()
        self.assertAlmostEqual(distinct_n_sentence_level(tokens, 2), 0.8)

    def test_corpus_level(self):
        lines = (
            'the cat sat on the mat',
            'mat the on sat cat the',
            'i do not know',
            'Sorry but i do not know',
        )
        corpus = [line.split() for line in lines]
        self.assertAlmostEqual(0.916666, distinct_n_corpus_level(corpus, 1), delta=1e-5)
        self.assertAlmostEqual(0.8125, distinct_n_corpus_level(corpus, 2), delta=1e-5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/distinct_n/utils.py
DELETED
@@ -1,90 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Copied from nltk.ngrams().
|
3 |
-
"""
|
4 |
-
from itertools import chain
|
5 |
-
|
6 |
-
__all__ = ["ngrams"]
|
7 |
-
|
8 |
-
|
9 |
-
def pad_sequence(sequence, n, pad_left=False, pad_right=False,
                 left_pad_symbol=None, right_pad_symbol=None):
    """
    Lazily pad a sequence of items ahead of ngram extraction.

    Each side that is enabled receives ``n - 1`` copies of its pad symbol.

        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        ['<s>', 1, 2, 3, 4, 5, '</s>']
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        ['<s>', 1, 2, 3, 4, 5]
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [1, 2, 3, 4, 5, '</s>']

    :param sequence: the source data to be padded
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether padding is added before the first item
    :type pad_left: bool
    :param pad_right: whether padding is added after the last item
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: an iterator over the (possibly padded) items
    :rtype: sequence or iter
    """
    pad_width = n - 1
    padded = iter(sequence)
    if pad_left:
        padded = chain([left_pad_symbol] * pad_width, padded)
    if pad_right:
        padded = chain(padded, [right_pad_symbol] * pad_width)
    return padded
|
41 |
-
|
42 |
-
|
43 |
-
def ngrams(sequence, n, pad_left=False, pad_right=False,
           left_pad_symbol=None, right_pad_symbol=None):
    """
    Return the ngrams generated from a sequence of items, as an iterator.
    For example:

        >>> list(ngrams([1,2,3,4,5], 3))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    Wrap with list for a list version of this function.  Set pad_left
    or pad_right to true in order to get additional ngrams:

        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]

    A sequence that is too short to fill a single ngram yields nothing:

        >>> list(ngrams([1], 3))
        []

    :param sequence: the source data to be converted into ngrams
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :rtype: sequence or iter
    """
    sequence = pad_sequence(sequence, n, pad_left, pad_right,
                            left_pad_symbol, right_pad_symbol)

    # Prime the sliding window with the first n - 1 items.
    history = []
    for _ in range(n - 1):
        try:
            history.append(next(sequence))
        except StopIteration:
            # Fewer than n - 1 items: there are no ngrams to produce.  A
            # StopIteration escaping a generator body is turned into a
            # RuntimeError (PEP 479), so finish the generator explicitly.
            return
    for item in sequence:
        history.append(item)
        yield tuple(history)
        # Slide the window forward by dropping its oldest item.
        del history[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/setup.py
DELETED
@@ -1,29 +0,0 @@
|
|
1 |
-
from setuptools import setup

__version__ = '0.4.0'

# Read the long description through a context manager so the file handle is
# closed, and with an explicit encoding so the result does not depend on the
# platform default.
with open('README.md', encoding='utf-8') as readme:
    long_description = readme.read()

setup(
    name='Distinct_N',
    version=__version__,
    description='Distinct-N metric that measures degree of diversity of generated response',
    url='https://github.com/neural-dialogue-metrics/Distinct-N.git',
    author='cgsdfc',
    author_email='[email protected]',
    keywords=[
        'NL', 'CL', 'MT',
        'natural language processing',
        'computational linguistics',
        'machine translation',
    ],
    packages=['distinct_n'],
    scripts=['bin/distinct_metric.py'],
    classifiers=[
        'Intended Audience :: Science/Research',
        # Registered trove classifier for the Apache license.
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 3',
        'Topic :: Text Processing :: Linguistic',
    ],
    # The license field carries the license name, not a file path.
    license='Apache License 2.0',
    long_description=long_description,
    # README.md is Markdown; declare it so it is rendered as such.
    long_description_content_type='text/markdown',
    install_requires=[],
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/distinct_n/testdata/bigram.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
the cat sat on the mat
|
|
|
|
src/distinct_n/testdata/unigram.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
the the the the a
|
|
|
|