giulio98 committed on
Commit
1056a0c
·
1 Parent(s): 81159f3

Update weighted_ngram_match.py

Browse files
Files changed (1) hide show
  1. weighted_ngram_match.py +100 -2
weighted_ngram_match.py CHANGED
@@ -17,10 +17,108 @@ import sys
17
  from fractions import Fraction
18
  import warnings
19
  from collections import Counter
20
-
21
- from utils import ngrams
22
  import pdb
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def sentence_bleu(
26
  references,
 
17
  from fractions import Fraction
18
  import warnings
19
  from collections import Counter
 
 
20
  import pdb
21
 
22
+ from itertools import chain
23
+
24
def pad_sequence(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Wrap a sequence with padding symbols ahead of ngram extraction.

    Each requested side receives ``n - 1`` copies of its pad symbol, so that
    every original item can appear in each position of an ngram of degree n.

        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        ['<s>', 1, 2, 3, 4, 5, '</s>']
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        ['<s>', 1, 2, 3, 4, 5]
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [1, 2, 3, 4, 5, '</s>']

    :param sequence: the source data to be padded
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether padding is added before the first item
    :type pad_left: bool
    :param pad_right: whether padding is added after the last item
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: an iterator over the (possibly padded) items
    :rtype: iter
    """
    # Always hand back an iterator, even when no padding is requested.
    items = iter(sequence)
    if not pad_left and not pad_right:
        return items

    # A side that is not padded contributes an empty tuple to the chain.
    leading = (left_pad_symbol,) * (n - 1) if pad_left else ()
    trailing = (right_pad_symbol,) * (n - 1) if pad_right else ()
    return chain(leading, items, trailing)
60
+
61
+
62
+ # add a flag to pad the sequence so we get peripheral ngrams?
63
+
64
+
65
def ngrams(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Lazily yield the ngrams generated from a sequence of items, as tuples.

    For example:

        >>> list(ngrams([1,2,3,4,5], 3))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    Wrap with list for a list version of this function. Set pad_left
    or pad_right to true in order to get additional ngrams:

        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]

    Nothing is yielded when the (padded) sequence holds fewer than n items,
    or when n is smaller than 1:

        >>> list(ngrams([1,2], 3))
        []
        >>> list(ngrams([1,2,3], 0))
        []

    :param sequence: the source data to be converted into ngrams
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: a generator of n-item tuples, in order of appearance
    :rtype: iter(tuple)
    """
    # A non-positive degree has no ngrams; without this guard the sliding
    # window below would emit 1-grams for n <= 0.
    if n < 1:
        return

    sequence = pad_sequence(
        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
    )

    # Sliding window: grow it until it holds n items, emit a snapshot, then
    # drop the oldest item so the next append completes the next ngram.
    history = []
    for item in sequence:
        history.append(item)
        if len(history) == n:
            yield tuple(history)
            del history[0]
121
+
122
 
123
  def sentence_bleu(
124
  references,