# Source code for nlpaug.augmenter.sentence.random

"""
    Augmenter that applies a sentence-level operation to textual input by randomly swapping (shuffling) sentences.
"""

import os


from nlpaug.augmenter.sentence import SentenceAugmenter
import nlpaug.model.word_rule as nmr
from nlpaug.util import Action, Doc


class RandomSentAug(SentenceAugmenter):

    """
    Augmenter that applies random sentence-level behavior for augmentation.

    :param str mode: Shuffle sentence to left, right, neighbor or random position. For `left`, target sentence
        will be swapped with left sentence. For `right`, target sentence will be swapped with right sentence.
        For `neighbor`, target sentence will be swapped with left or right sentence randomly. For `random`,
        target sentence will be swapped with any sentence randomly.
    :param action: Action performed by this augmenter. Defaults to `Action.SWAP` and is forwarded to the
        base class.
    :param float aug_p: Percentage of sentences that will be augmented.
    :param int aug_min: Minimum number of sentences that will be augmented.
    :param int aug_max: Maximum number of sentences that will be augmented. If None is passed, number of
        augmentation is calculated via aug_p. If calculated result from aug_p is smaller than aug_max, will use
        calculated result from aug_p. Otherwise, using aug_max.
    :param func tokenizer: Customize tokenization process. Forwarded to the underlying shuffle model.
    :param str name: Name of this augmenter
    :param int verbose: Verbosity setting forwarded to the base class.

    >>> import nlpaug.augmenter.sentence as nas
    >>> aug = nas.RandomSentAug()
    """

    def __init__(self, mode='neighbor', action=Action.SWAP, name='RandomSent_Aug', aug_min=1, aug_max=10, aug_p=0.3,
        tokenizer=None, verbose=0):
        super().__init__(
            action=action, name=name, aug_p=aug_p, aug_min=aug_min, aug_max=aug_max, verbose=verbose)

        # Sentence-level shuffle model; it performs both tokenization and the actual swap.
        self.model = nmr.Shuffle(mode=mode, model_type='sentence', tokenizer=tokenizer)

    def pre_skip_aug(self, data):
        """
        Return the candidate indexes for augmentation.

        No element is skipped: every position of `data` is returned as a candidate.

        :param data: Sized sequence (e.g. list of sentences).
        :return: List of all indexes of `data`.
        """
        return list(range(len(data)))

    # https://arxiv.org/abs/1910.13461
    def swap(self, data):
        """
        Swap sentence positions inside each input text.

        :param data: A single text (str) or a list of texts. Falsy input (None, '', []) and a
            whitespace-only string are returned unchanged.
        :return: Augmented text if a str was passed, otherwise a new list of augmented texts. The
            list passed by the caller is not modified.
        """
        if not data:
            return data

        is_batch = isinstance(data, list)
        if is_batch:
            # Work on a shallow copy so the caller's list is not overwritten in place.
            all_data = list(data)
        else:
            if data.strip() == '':
                return data
            all_data = [data]

        for i, d in enumerate(all_data):
            sentences = self.model.tokenize(d)
            aug_idxes = self._get_random_aug_idxes(sentences)
            # Swaps are applied sequentially: each one operates on the result of the previous one.
            for aug_idx in aug_idxes:
                sentences = self.model.predict(sentences, aug_idx)
            all_data[i] = ' '.join(sentences)

        # TODO: always return array
        return all_data if is_batch else all_data[0]