andreslu committed on
Commit
cbed089
·
1 Parent(s): 2d9786d

Delete src/distinct_n

Browse files
src/distinct_n/.gitignore DELETED
@@ -1,58 +0,0 @@
1
- # Byte-compiled / optimized / DLL files
2
- __pycache__/
3
- *.py[cod]
4
- state.py
5
-
6
- # C extensions
7
- *.so
8
-
9
- # Distribution / packaging
10
- .Python
11
- env/
12
- build/
13
- develop-eggs/
14
- dist/
15
- downloads/
16
- eggs/
17
- .eggs/
18
- lib/
19
- lib64/
20
- parts/
21
- sdist/
22
- var/
23
- *.egg-info/
24
- .installed.cfg
25
- *.egg
26
-
27
- # PyInstaller
28
- # Usually these files are written by a python script from a template
29
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
30
- *.manifest
31
- *.spec
32
-
33
- # Installer logs
34
- pip-log.txt
35
- pip-delete-this-directory.txt
36
-
37
- # Unit test / coverage reports
38
- htmlcov/
39
- .tox/
40
- .coverage
41
- .coverage.*
42
- .cache
43
- nosetests.xml
44
- coverage.xml
45
- *,cover
46
-
47
- # Translations
48
- *.mo
49
- *.pot
50
-
51
- # Django stuff:
52
- *.log
53
-
54
- # Sphinx documentation
55
- docs/_build/
56
-
57
- # PyBuilder
58
- target/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/distinct_n/.idea/Distinct-N.iml DELETED
@@ -1,11 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <module type="PYTHON_MODULE" version="4">
3
- <component name="NewModuleRootManager">
4
- <content url="file://$MODULE_DIR$">
5
- <sourceFolder url="file://$MODULE_DIR$/distinct_n" isTestSource="false" />
6
- <excludeFolder url="file://$MODULE_DIR$/docs" />
7
- </content>
8
- <orderEntry type="jdk" jdkName="Python 3.6 (Metrics)" jdkType="Python SDK" />
9
- <orderEntry type="sourceFolder" forTests="false" />
10
- </component>
11
- </module>
 
 
 
 
 
 
 
 
 
 
 
 
src/distinct_n/.idea/encodings.xml DELETED
@@ -1,4 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <project version="4">
3
- <component name="Encoding" addBOMForNewFiles="with NO BOM" />
4
- </project>
 
 
 
 
 
src/distinct_n/.idea/misc.xml DELETED
@@ -1,7 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <project version="4">
3
- <component name="JavaScriptSettings">
4
- <option name="languageLevel" value="ES6" />
5
- </component>
6
- <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (tensorflow)" project-jdk-type="Python SDK" />
7
- </project>
 
 
 
 
 
 
 
 
src/distinct_n/.idea/modules.xml DELETED
@@ -1,8 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <project version="4">
3
- <component name="ProjectModuleManager">
4
- <modules>
5
- <module fileurl="file://$PROJECT_DIR$/.idea/Distinct-N.iml" filepath="$PROJECT_DIR$/.idea/Distinct-N.iml" />
6
- </modules>
7
- </component>
8
- </project>
 
 
 
 
 
 
 
 
 
src/distinct_n/.idea/other.xml DELETED
@@ -1,6 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <project version="4">
3
- <component name="PySciProjectComponent">
4
- <option name="PY_SCI_VIEW_SUGGESTED" value="true" />
5
- </component>
6
- </project>
 
 
 
 
 
 
 
src/distinct_n/.idea/vcs.xml DELETED
@@ -1,6 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <project version="4">
3
- <component name="VcsDirectoryMappings">
4
- <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
- </component>
6
- </project>
 
 
 
 
 
 
 
src/distinct_n/.idea/webResources.xml DELETED
@@ -1,14 +0,0 @@
1
- <?xml version="1.0" encoding="UTF-8"?>
2
- <project version="4">
3
- <component name="WebResourcesPaths">
4
- <contentEntries>
5
- <entry url="file://$PROJECT_DIR$">
6
- <entryData>
7
- <resourceRoots>
8
- <path value="file://$PROJECT_DIR$/testdata" />
9
- </resourceRoots>
10
- </entryData>
11
- </entry>
12
- </contentEntries>
13
- </component>
14
- </project>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/distinct_n/A Diversity-Promoting Objective Function for Neural Conversation Models.pdf DELETED
Binary file (200 kB)
 
src/distinct_n/LICENSE.txt DELETED
@@ -1,202 +0,0 @@
1
-
2
- Apache License
3
- Version 2.0, January 2004
4
- http://www.apache.org/licenses/
5
-
6
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
-
8
- 1. Definitions.
9
-
10
- "License" shall mean the terms and conditions for use, reproduction,
11
- and distribution as defined by Sections 1 through 9 of this document.
12
-
13
- "Licensor" shall mean the copyright owner or entity authorized by
14
- the copyright owner that is granting the License.
15
-
16
- "Legal Entity" shall mean the union of the acting entity and all
17
- other entities that control, are controlled by, or are under common
18
- control with that entity. For the purposes of this definition,
19
- "control" means (i) the power, direct or indirect, to cause the
20
- direction or management of such entity, whether by contract or
21
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
- outstanding shares, or (iii) beneficial ownership of such entity.
23
-
24
- "You" (or "Your") shall mean an individual or Legal Entity
25
- exercising permissions granted by this License.
26
-
27
- "Source" form shall mean the preferred form for making modifications,
28
- including but not limited to software source code, documentation
29
- source, and configuration files.
30
-
31
- "Object" form shall mean any form resulting from mechanical
32
- transformation or translation of a Source form, including but
33
- not limited to compiled object code, generated documentation,
34
- and conversions to other media types.
35
-
36
- "Work" shall mean the work of authorship, whether in Source or
37
- Object form, made available under the License, as indicated by a
38
- copyright notice that is included in or attached to the work
39
- (an example is provided in the Appendix below).
40
-
41
- "Derivative Works" shall mean any work, whether in Source or Object
42
- form, that is based on (or derived from) the Work and for which the
43
- editorial revisions, annotations, elaborations, or other modifications
44
- represent, as a whole, an original work of authorship. For the purposes
45
- of this License, Derivative Works shall not include works that remain
46
- separable from, or merely link (or bind by name) to the interfaces of,
47
- the Work and Derivative Works thereof.
48
-
49
- "Contribution" shall mean any work of authorship, including
50
- the original version of the Work and any modifications or additions
51
- to that Work or Derivative Works thereof, that is intentionally
52
- submitted to Licensor for inclusion in the Work by the copyright owner
53
- or by an individual or Legal Entity authorized to submit on behalf of
54
- the copyright owner. For the purposes of this definition, "submitted"
55
- means any form of electronic, verbal, or written communication sent
56
- to the Licensor or its representatives, including but not limited to
57
- communication on electronic mailing lists, source code control systems,
58
- and issue tracking systems that are managed by, or on behalf of, the
59
- Licensor for the purpose of discussing and improving the Work, but
60
- excluding communication that is conspicuously marked or otherwise
61
- designated in writing by the copyright owner as "Not a Contribution."
62
-
63
- "Contributor" shall mean Licensor and any individual or Legal Entity
64
- on behalf of whom a Contribution has been received by Licensor and
65
- subsequently incorporated within the Work.
66
-
67
- 2. Grant of Copyright License. Subject to the terms and conditions of
68
- this License, each Contributor hereby grants to You a perpetual,
69
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
- copyright license to reproduce, prepare Derivative Works of,
71
- publicly display, publicly perform, sublicense, and distribute the
72
- Work and such Derivative Works in Source or Object form.
73
-
74
- 3. Grant of Patent License. Subject to the terms and conditions of
75
- this License, each Contributor hereby grants to You a perpetual,
76
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
- (except as stated in this section) patent license to make, have made,
78
- use, offer to sell, sell, import, and otherwise transfer the Work,
79
- where such license applies only to those patent claims licensable
80
- by such Contributor that are necessarily infringed by their
81
- Contribution(s) alone or by combination of their Contribution(s)
82
- with the Work to which such Contribution(s) was submitted. If You
83
- institute patent litigation against any entity (including a
84
- cross-claim or counterclaim in a lawsuit) alleging that the Work
85
- or a Contribution incorporated within the Work constitutes direct
86
- or contributory patent infringement, then any patent licenses
87
- granted to You under this License for that Work shall terminate
88
- as of the date such litigation is filed.
89
-
90
- 4. Redistribution. You may reproduce and distribute copies of the
91
- Work or Derivative Works thereof in any medium, with or without
92
- modifications, and in Source or Object form, provided that You
93
- meet the following conditions:
94
-
95
- (a) You must give any other recipients of the Work or
96
- Derivative Works a copy of this License; and
97
-
98
- (b) You must cause any modified files to carry prominent notices
99
- stating that You changed the files; and
100
-
101
- (c) You must retain, in the Source form of any Derivative Works
102
- that You distribute, all copyright, patent, trademark, and
103
- attribution notices from the Source form of the Work,
104
- excluding those notices that do not pertain to any part of
105
- the Derivative Works; and
106
-
107
- (d) If the Work includes a "NOTICE" text file as part of its
108
- distribution, then any Derivative Works that You distribute must
109
- include a readable copy of the attribution notices contained
110
- within such NOTICE file, excluding those notices that do not
111
- pertain to any part of the Derivative Works, in at least one
112
- of the following places: within a NOTICE text file distributed
113
- as part of the Derivative Works; within the Source form or
114
- documentation, if provided along with the Derivative Works; or,
115
- within a display generated by the Derivative Works, if and
116
- wherever such third-party notices normally appear. The contents
117
- of the NOTICE file are for informational purposes only and
118
- do not modify the License. You may add Your own attribution
119
- notices within Derivative Works that You distribute, alongside
120
- or as an addendum to the NOTICE text from the Work, provided
121
- that such additional attribution notices cannot be construed
122
- as modifying the License.
123
-
124
- You may add Your own copyright statement to Your modifications and
125
- may provide additional or different license terms and conditions
126
- for use, reproduction, or distribution of Your modifications, or
127
- for any such Derivative Works as a whole, provided Your use,
128
- reproduction, and distribution of the Work otherwise complies with
129
- the conditions stated in this License.
130
-
131
- 5. Submission of Contributions. Unless You explicitly state otherwise,
132
- any Contribution intentionally submitted for inclusion in the Work
133
- by You to the Licensor shall be under the terms and conditions of
134
- this License, without any additional terms or conditions.
135
- Notwithstanding the above, nothing herein shall supersede or modify
136
- the terms of any separate license agreement you may have executed
137
- with Licensor regarding such Contributions.
138
-
139
- 6. Trademarks. This License does not grant permission to use the trade
140
- names, trademarks, service marks, or product names of the Licensor,
141
- except as required for reasonable and customary use in describing the
142
- origin of the Work and reproducing the content of the NOTICE file.
143
-
144
- 7. Disclaimer of Warranty. Unless required by applicable law or
145
- agreed to in writing, Licensor provides the Work (and each
146
- Contributor provides its Contributions) on an "AS IS" BASIS,
147
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
- implied, including, without limitation, any warranties or conditions
149
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
- PARTICULAR PURPOSE. You are solely responsible for determining the
151
- appropriateness of using or redistributing the Work and assume any
152
- risks associated with Your exercise of permissions under this License.
153
-
154
- 8. Limitation of Liability. In no event and under no legal theory,
155
- whether in tort (including negligence), contract, or otherwise,
156
- unless required by applicable law (such as deliberate and grossly
157
- negligent acts) or agreed to in writing, shall any Contributor be
158
- liable to You for damages, including any direct, indirect, special,
159
- incidental, or consequential damages of any character arising as a
160
- result of this License or out of the use or inability to use the
161
- Work (including but not limited to damages for loss of goodwill,
162
- work stoppage, computer failure or malfunction, or any and all
163
- other commercial damages or losses), even if such Contributor
164
- has been advised of the possibility of such damages.
165
-
166
- 9. Accepting Warranty or Additional Liability. While redistributing
167
- the Work or Derivative Works thereof, You may choose to offer,
168
- and charge a fee for, acceptance of support, warranty, indemnity,
169
- or other liability obligations and/or rights consistent with this
170
- License. However, in accepting such obligations, You may act only
171
- on Your own behalf and on Your sole responsibility, not on behalf
172
- of any other Contributor, and only if You agree to indemnify,
173
- defend, and hold each Contributor harmless for any liability
174
- incurred by, or claims asserted against, such Contributor by reason
175
- of your accepting any such warranty or additional liability.
176
-
177
- END OF TERMS AND CONDITIONS
178
-
179
- APPENDIX: How to apply the Apache License to your work.
180
-
181
- To apply the Apache License to your work, attach the following
182
- boilerplate notice, with the fields enclosed by brackets "[]"
183
- replaced with your own identifying information. (Don't include
184
- the brackets!) The text should be enclosed in the appropriate
185
- comment syntax for the file format. We also recommend that a
186
- file or class name and description of purpose be included on the
187
- same "printed page" as the copyright notice for easier
188
- identification within third-party archives.
189
-
190
- Copyright [yyyy] [name of copyright owner]
191
-
192
- Licensed under the Apache License, Version 2.0 (the "License");
193
- you may not use this file except in compliance with the License.
194
- You may obtain a copy of the License at
195
-
196
- http://www.apache.org/licenses/LICENSE-2.0
197
-
198
- Unless required by applicable law or agreed to in writing, software
199
- distributed under the License is distributed on an "AS IS" BASIS,
200
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
- See the License for the specific language governing permissions and
202
- limitations under the License.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/distinct_n/README.md DELETED
@@ -1,30 +0,0 @@
1
- # Distinct-N
2
- Distinct-N, most notably distinct-1 and distinct-2, is a metric that measures the
3
- diversity of a sentence. It focuses on the number of *distinct* n-grams in a sentence and thus
4
- penalizes sentences with lots of repeated words. The metric is free of any *reference* or *ground truth*
5
- sentence and depends entirely on the properties of the sentence itself (as generated by the system).
6
- It was proposed by Jiwei Li et al. in the paper *A Diversity-Promoting Objective Function for Neural Conversation Models*.
7
-
8
- # Definitions
9
- The original paper coined *Distinct-N* as:
10
-
11
- We report degree of diversity by calculating the number of distinct unigrams and bigrams in generated responses.
12
- The value is scaled by total number of generated tokens to avoid favoring long sentences
13
-
14
- which is exactly what we have mentioned before.
15
-
16
- # Usage
17
- ```bash
18
- $ python distinct_metric.py -n N_NGRAMS PREDICTION
19
- ```
20
-
21
-
22
- where `N_NGRAMS` is the length of the token sequence to count as unique within one sentence.
23
- `PREDICTION` is the prediction or response your model generates with one utterance (sentence) per line.
24
-
25
-
26
- # Dependencies
27
- `python>=3.6.1`
28
-
29
- # References
30
- [1] A Diversity-Promoting Objective Function for Neural Conversation Models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/distinct_n/bin/distinct_metric.py DELETED
@@ -1,29 +0,0 @@
1
- import argparse
2
- import logging
3
-
4
- from distinct_n import distinct_n_sentence_level
5
- from pathlib import Path
6
- from agenda.metric_helper import write_score
7
-
8
- NAME = 'distinct_n'
9
-
10
- if __name__ == '__main__':
11
- parser = argparse.ArgumentParser()
12
- parser.add_argument('hypothesis', help="predicted text file, one example per line")
13
- parser.add_argument('-n', dest='n_range', type=int, nargs='+', help="n to use as in distinct-N")
14
- parser.add_argument('--output_dir')
15
- args = parser.parse_args()
16
-
17
- logging.basicConfig(level=logging.INFO)
18
- logging.info('loading hypothesis file...')
19
- with open(args.hypothesis) as f:
20
- hypothesis = [sentence.split() for sentence in f.readlines()]
21
-
22
- output_dir = Path(args.output_dir)
23
- for n in args.n_range:
24
- write_score(
25
- name=NAME,
26
- output=output_dir.joinpath(f'{NAME}_{n}').with_suffix('.json'),
27
- params={'n': n},
28
- scores=[distinct_n_sentence_level(s, n) for s in hypothesis],
29
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/distinct_n/bin/score.sh DELETED
@@ -1,6 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- HYPO=/home/cgsdfc/UbuntuDialogueCorpus/ResponseContextPairs/ModelPredictions/VHRED/First_VHRED_BeamSearch_5_GeneratedTestResponses.txt_First.txt
4
- DIR=/home/cgsdfc/Result/Test
5
-
6
- python bin/distinct_metric.py --output_dir $DIR $HYPO -n 3
 
 
 
 
 
 
 
src/distinct_n/distinct_n/metrics.py DELETED
@@ -1,33 +0,0 @@
1
- from src.distinct_n.distinct_n.utils import ngrams
2
-
3
- __all__ = ["distinct_n_sentence_level", "distinct_n_corpus_level"]
4
-
5
-
6
- def distinct_n_sentence_level(sentence, n):
7
- """
8
- Compute distinct-N for a single sentence.
9
- :param sentence: a list of words.
10
- :param n: int, ngram.
11
- :return: float, the metric value.
12
- """
13
- if len(sentence) == 0:
14
- return 0.0 # Prevent a zero division
15
- # distinct_ngrams = set(ngrams(sentence, n))
16
- # print(ngrams(sentence, n))
17
- return list(set(ngrams(sentence, n)))
18
- # return len(distinct_ngrams) / len(sentence)
19
-
20
-
21
- def distinct_n_corpus_level(sentences, n):
22
- """
23
- Compute average distinct-N of a list of sentences (the corpus).
24
- :param sentences: a list of sentence.
25
- :param n: int, ngram.
26
- :return: float, the average value.
27
- """
28
- temp = []
29
- length = 0
30
- for sentence in sentences:
31
- length += len(sentence)
32
- temp.extend(distinct_n_sentence_level(sentence, n))
33
- return len(set(temp)) / length
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/distinct_n/distinct_n/test.py DELETED
@@ -1,32 +0,0 @@
1
- import unittest
2
-
3
- from distinct_n import distinct_n_sentence_level
4
- from distinct_n import distinct_n_corpus_level
5
-
6
-
7
- class TestDistinctN(unittest.TestCase):
8
- def test_unigram(self):
9
- sentence = "the the the the the".split()
10
- self.assertAlmostEqual(
11
- distinct_n_sentence_level(sentence, 1), 0.2
12
- )
13
- sentence = "the the the the cat".split()
14
- self.assertAlmostEqual(
15
- distinct_n_sentence_level(sentence, 1), 0.4
16
- )
17
-
18
- def test_bigram(self):
19
- sentence = "the cat sat on the".split()
20
- self.assertAlmostEqual(
21
- distinct_n_sentence_level(sentence, 2), 0.8
22
- )
23
-
24
- def test_corpus_level(self):
25
- sentences = [
26
- 'the cat sat on the mat'.split(),
27
- 'mat the on sat cat the'.split(),
28
- 'i do not know'.split(),
29
- 'Sorry but i do not know'.split(),
30
- ]
31
- self.assertAlmostEqual(0.916666, distinct_n_corpus_level(sentences, 1), delta=1e-5)
32
- self.assertAlmostEqual(0.8125, distinct_n_corpus_level(sentences, 2), delta=1e-5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/distinct_n/distinct_n/utils.py DELETED
@@ -1,90 +0,0 @@
1
- """
2
- Copied from nltk.ngrams().
3
- """
4
- from itertools import chain
5
-
6
- __all__ = ["ngrams"]
7
-
8
-
9
- def pad_sequence(sequence, n, pad_left=False, pad_right=False,
10
- left_pad_symbol=None, right_pad_symbol=None):
11
- """
12
- Returns a padded sequence of items before ngram extraction.
13
-
14
- >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
15
- ['<s>', 1, 2, 3, 4, 5, '</s>']
16
- >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
17
- ['<s>', 1, 2, 3, 4, 5]
18
- >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
19
- [1, 2, 3, 4, 5, '</s>']
20
-
21
- :param sequence: the source data to be padded
22
- :type sequence: sequence or iter
23
- :param n: the degree of the ngrams
24
- :type n: int
25
- :param pad_left: whether the ngrams should be left-padded
26
- :type pad_left: bool
27
- :param pad_right: whether the ngrams should be right-padded
28
- :type pad_right: bool
29
- :param left_pad_symbol: the symbol to use for left padding (default is None)
30
- :type left_pad_symbol: any
31
- :param right_pad_symbol: the symbol to use for right padding (default is None)
32
- :type right_pad_symbol: any
33
- :rtype: sequence or iter
34
- """
35
- sequence = iter(sequence)
36
- if pad_left:
37
- sequence = chain((left_pad_symbol,) * (n - 1), sequence)
38
- if pad_right:
39
- sequence = chain(sequence, (right_pad_symbol,) * (n - 1))
40
- return sequence
41
-
42
-
43
- def ngrams(sequence, n, pad_left=False, pad_right=False,
44
- left_pad_symbol=None, right_pad_symbol=None):
45
- """
46
- Return the ngrams generated from a sequence of items, as an iterator.
47
- For example:
48
-
49
- >>> from nltk.util import ngrams
50
- >>> list(ngrams([1,2,3,4,5], 3))
51
- [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
52
-
53
- Wrap with list for a list version of this function. Set pad_left
54
- or pad_right to true in order to get additional ngrams:
55
-
56
- >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
57
- [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
58
- >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
59
- [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
60
- >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
61
- [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
62
- >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
63
- [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
64
-
65
-
66
- :param sequence: the source data to be converted into ngrams
67
- :type sequence: sequence or iter
68
- :param n: the degree of the ngrams
69
- :type n: int
70
- :param pad_left: whether the ngrams should be left-padded
71
- :type pad_left: bool
72
- :param pad_right: whether the ngrams should be right-padded
73
- :type pad_right: bool
74
- :param left_pad_symbol: the symbol to use for left padding (default is None)
75
- :type left_pad_symbol: any
76
- :param right_pad_symbol: the symbol to use for right padding (default is None)
77
- :type right_pad_symbol: any
78
- :rtype: sequence or iter
79
- """
80
- sequence = pad_sequence(sequence, n, pad_left, pad_right,
81
- left_pad_symbol, right_pad_symbol)
82
-
83
- history = []
84
- while n > 1:
85
- history.append(next(sequence))
86
- n -= 1
87
- for item in sequence:
88
- history.append(item)
89
- yield tuple(history)
90
- del history[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/distinct_n/setup.py DELETED
@@ -1,29 +0,0 @@
1
- from setuptools import setup
2
-
3
- __version__ = '0.4.0'
4
-
5
- setup(
6
- name='Distinct_N',
7
- version=__version__,
8
- description='Distinct-N metric that measures degree of diversity of generated response',
9
- url='https://github.com/neural-dialogue-metrics/Distinct-N.git',
10
- author='cgsdfc',
11
- author_email='[email protected]',
12
- keywords=[
13
- 'NL', 'CL', 'MT',
14
- 'natural language processing',
15
- 'computational linguistics',
16
- 'machine translation',
17
- ],
18
- packages=['distinct_n'],
19
- scripts=['bin/distinct_metric.py'],
20
- classifiers=[
21
- 'Intended Audience :: Science/Research',
22
- 'License :: OSI Approved :: Apache-v2',
23
- 'Programming Language :: Python :: 3',
24
- 'Topic :: Text Processing :: Linguistic',
25
- ],
26
- license='LICENCE.txt',
27
- long_description=open('README.md').read(),
28
- install_requires=[],
29
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/distinct_n/testdata/bigram.txt DELETED
@@ -1 +0,0 @@
1
- the cat sat on the mat
 
 
src/distinct_n/testdata/unigram.txt DELETED
@@ -1 +0,0 @@
1
- the the the the a