# Source: https://arxiv.org/pdf/1812.05271v1.pdf
"""
Augmenter that applies a word splitting operation to textual input.
"""
from nlpaug.augmenter.word import WordAugmenter
from nlpaug.util import Action, Doc
class SplitAug(WordAugmenter):
"""
Augmenter that apply word splitting for augmentation.
:param float aug_p: Percentage of word will be augmented.
:param int aug_min: Minimum number of word will be augmented.
:param int aug_max: Maximum number of word will be augmented. If None is passed, number of augmentation is
calculated via aup_p. If calculated result from aug_p is smaller than aug_max, will use calculated result from
aug_p. Otherwise, using aug_max.
:param int min_char: If word less than this value, do not draw word for augmentation
:param list stopwords: List of words which will be skipped from augment operation.
:param str stopwords_regex: Regular expression for matching words which will be skipped from augment operation.
:param func tokenizer: Customize tokenization process
:param func reverse_tokenizer: Customize reverse of tokenization process
:param str name: Name of this augmenter
>>> import nlpaug.augmenter.word as naw
>>> aug = naw.SplitAug()
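
    A minimal usage sketch (the sample sentence is illustrative only):

    >>> augmented_text = aug.augment('The quick brown fox jumps over the lazy dog')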
"""
    def __init__(self, name='Split_Aug', aug_min=1, aug_max=10, aug_p=0.3, min_char=4, stopwords=None,
                 tokenizer=None, reverse_tokenizer=None, stopwords_regex=None, verbose=0):
        super().__init__(
            action=Action.SPLIT, name=name, aug_p=aug_p, aug_min=aug_min, aug_max=aug_max, stopwords=stopwords,
            tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, device='cpu', verbose=verbose,
            stopwords_regex=stopwords_regex, include_detail=False)
        self.min_char = min_char

    def skip_aug(self, token_idxes, tokens):
        # Keep only tokens that are long enough (at least min_char characters) to be split.
        results = []
        for token_idx in token_idxes:
            if len(tokens[token_idx]) >= self.min_char:
                results.append(token_idx)
        return results

    def split(self, data):
        if not data or not data.strip():
            return data

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        aug_idxes = self._get_aug_idxes(doc.get_original_tokens())
        if aug_idxes is None or len(aug_idxes) == 0:
            if self.include_detail:
                return data, []
            return data
        # Apply changes from the rightmost token first so earlier indexes stay valid
        # as new tokens are inserted into the document.
        aug_idxes.sort(reverse=True)

        for aug_idx in aug_idxes:
            target_token = doc.get_token(aug_idx).get_latest_token().token
            # Draw a random position and split the token into two parts around it.
            separate_pos = self.sample(len(target_token), 1)
            prev_token = target_token[:separate_pos]
            next_token = target_token[separate_pos:]

            change_seq += 1
            doc.add_change_log(aug_idx, new_token=next_token, action=Action.SPLIT,
                               change_seq=self.parent_change_seq + change_seq)
            doc.add_token(aug_idx, token=prev_token, action=Action.SPLIT,
                          change_seq=self.parent_change_seq + change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())
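

# A minimal usage sketch (the sample sentence and parameter values are
# illustrative, not from the source). Depending on the installed nlpaug
# version, augment() returns either a string or a list of strings.
if __name__ == '__main__':
    aug = SplitAug(aug_p=0.3, min_char=4)
    text = 'The quick brown fox jumps over the lazy dog'
    augmented = aug.augment(text)
    print(augmented)  # e.g. "The qu ick brown fox jum ps over the lazy dog"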