Source code for nlpaug.augmenter.sentence.random

"""
    Augmenter that applies a sentence-level operation to textual input by randomly shuffling sentences.
"""

import os


from nlpaug.augmenter.sentence import SentenceAugmenter
import nlpaug.model.word_rule as nmr
from nlpaug.util import Action, Doc


class RandomSentAug(SentenceAugmenter):
    """
    Augmenter that applies random behavior for augmentation.

    :param str mode: Shuffle sentence to left, right, neighbor or random position. For `left`, target sentence
        will be swapped with left sentence. For `right`, target sentence will be swapped with right sentence.
        For `neighbor`, target sentence will be swapped with left or right sentence randomly. For `random`,
        target sentence will be swapped with any sentence randomly.
    :param action: Action forwarded to the base augmenter. Defaults to `Action.SWAP`.
    :param float aug_p: Percentage of sentence will be augmented.
    :param int aug_min: Minimum number of sentence will be augmented.
    :param int aug_max: Maximum number of sentence will be augmented. If None is passed, number of augmentation is
        calculated via aug_p. If calculated result from aug_p is smaller than aug_max, will use calculated result
        from aug_p. Otherwise, using aug_max.
    :param func tokenizer: Customize tokenization process
    :param str name: Name of this augmenter
    :param int verbose: Verbosity setting forwarded to the base augmenter.

    >>> import nlpaug.augmenter.sentence as nas
    >>> aug = nas.RandomSentAug()
    """

    def __init__(self, mode='neighbor', action=Action.SWAP, name='RandomSent_Aug', aug_min=1, aug_max=10,
                 aug_p=0.3, tokenizer=None, verbose=0):
        super().__init__(
            action=action, name=name, aug_p=aug_p, aug_min=aug_min, aug_max=aug_max, verbose=verbose)

        # Sentence-level shuffle model; it provides both `tokenize` and `predict` used by `swap`.
        self.model = nmr.Shuffle(mode=mode, model_type='sentence', tokenizer=tokenizer)

    def pre_skip_aug(self, data):
        """Return the index of every item in ``data``; no position is excluded."""
        return list(range(len(data)))

    # https://arxiv.org/abs/1910.13461
    def swap(self, data):
        """
        Shuffle sentences inside the given text(s).

        :param data: A single string or a list of strings. Falsy input (``None``, ``''``, ``[]``) and
            whitespace-only strings are returned unchanged.
        :return: A new list of augmented strings when ``data`` is a list, otherwise a single augmented
            string. The list passed in by the caller is never modified.
        """
        if not data:
            return data

        is_batch = isinstance(data, list)
        if is_batch:
            # Work on a shallow copy so the caller's list is not overwritten in place.
            texts = list(data)
        else:
            if data.strip() == '':
                return data
            texts = [data]

        for i, text in enumerate(texts):
            sentences = self.model.tokenize(text)
            aug_idxes = self._get_random_aug_idxes(sentences)
            # Each prediction works on the output of the previous one, so swaps accumulate.
            for aug_idx in aug_idxes:
                sentences = self.model.predict(sentences, aug_idx)
            texts[i] = ' '.join(sentences)

        # TODO: always return array
        return texts if is_batch else texts[0]