| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | from itertools import chain
|
| |
|
def pad_sequence(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Return an iterator over ``sequence`` with optional padding symbols
    added at either end, ready for ngram extraction.

    When padding is requested on a side, ``n - 1`` copies of that side's
    pad symbol are emitted there, so that every original item can appear
    in each position of an ngram of degree ``n``.

        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        ['<s>', 1, 2, 3, 4, 5, '</s>']
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        ['<s>', 1, 2, 3, 4, 5]
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [1, 2, 3, 4, 5, '</s>']

    :param sequence: the source data to be padded
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: an iterator over the (possibly padded) items
    :rtype: iter
    """
    # Always hand back an iterator, even when no padding is requested.
    padded = iter(sequence)
    if pad_left:
        leading = (left_pad_symbol,) * (n - 1)
        padded = chain(leading, padded)
    if pad_right:
        trailing = (right_pad_symbol,) * (n - 1)
        padded = chain(padded, trailing)
    return padded
|
| |
|
| |
|
| |
|
| |
|
| |
|
def ngrams(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Generate the ngrams of a sequence of items, lazily, as tuples.
    For example:

        >>> from nltk.util import ngrams
        >>> list(ngrams([1,2,3,4,5], 3))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    The result is an iterator; wrap it with list to materialize it.
    Set pad_left or pad_right to true in order to get additional ngrams:

        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]

    If the (padded) sequence holds fewer than ``n`` items, nothing is
    yielded.

    :param sequence: the source data to be converted into ngrams
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: an iterator yielding one tuple per ngram
    :rtype: iter
    """
    tokens = pad_sequence(
        sequence,
        n,
        pad_left,
        pad_right,
        left_pad_symbol,
        right_pad_symbol,
    )

    # Prime the sliding window with the first n - 1 items.
    window = []
    still_needed = n - 1
    while still_needed > 0:
        try:
            window.append(next(tokens))
        except StopIteration:
            # Input ran out before a single ngram could be formed; end the
            # generator with a plain return.
            return
        still_needed -= 1

    # Each further item completes one ngram; afterwards the oldest item is
    # dropped so the window slides forward by one position.
    for token in tokens:
        window.append(token)
        yield tuple(window)
        window.pop(0)