Source code for nlpaug.augmenter.char.random

# Source: https://arxiv.org/pdf/1711.02173.pdf

"""
    Augmenter that applies random character errors to textual input.
"""

import string

from nlpaug.augmenter.char import CharAugmenter
from nlpaug.util import Action, Method, Doc


class RandomCharAug(CharAugmenter):
    # https://arxiv.org/pdf/1711.02173.pdf
    """
    Augmenter that generates character errors using random values. For example, people may type i as o incorrectly.

    :param str action: Possible values are 'insert', 'substitute', 'swap' and 'delete'. If value is 'insert', a new
        character will be injected at a random position. If value is 'substitute', a random character will replace
        the original character. If value is 'swap', characters within the sampled word will be swapped randomly.
        If value is 'delete', a character will be removed randomly.
    :param float aug_char_p: Percentage of characters (per token) that will be augmented.
    :param int aug_char_min: Minimum number of characters that will be augmented.
    :param int aug_char_max: Maximum number of characters that will be augmented. If None is passed, the number of
        augmented characters is calculated via aug_char_p. If the result calculated from aug_char_p is smaller than
        aug_char_max, the calculated result is used; otherwise aug_char_max is used. For example, a 10-character
        token with aug_char_p=0.3 and aug_char_max=10 gets 3 characters augmented.
    :param float aug_word_p: Percentage of words that will be augmented.
    :param int aug_word_min: Minimum number of words that will be augmented.
    :param int aug_word_max: Maximum number of words that will be augmented. If None is passed, the number of
        augmented words is calculated via aug_word_p. If the result calculated from aug_word_p is smaller than
        aug_word_max, the calculated result is used; otherwise aug_word_max is used.
    :param bool include_upper_case: If True, upper-case characters may be included in the augmented data. If a
        `candidates` value is provided, this parameter is ignored.
    :param bool include_lower_case: If True, lower-case characters may be included in the augmented data. If a
        `candidates` value is provided, this parameter is ignored.
    :param bool include_numeric: If True, numeric characters may be included in the augmented data. If a
        `candidates` value is provided, this parameter is ignored.
    :param int min_char: Words shorter than this value are not drawn for augmentation.
    :param swap_mode: When action is 'swap', you may pass 'adjacent', 'middle' or 'random'. 'adjacent' swaps a
        character with one of its neighbours within the same word. 'middle' draws the swap partner from the
        interior of the word, never its first or last character. 'random' swaps characters without constraint.
    :param str spec_char: Special characters that may be included in the augmented data. If a `candidates` value is
        provided, this parameter is ignored.
    :param list stopwords: List of words which will be skipped by the augment operation.
    :param str stopwords_regex: Regular expression matching words which will be skipped by the augment operation.
    :param func tokenizer: Customized tokenization process.
    :param func reverse_tokenizer: Customized reverse of the tokenization process.
    :param List candidates: List of strings to draw augmented values from, e.g. ['AAA', '11', '===']. If a value is
        provided, `include_upper_case`, `include_lower_case`, `include_numeric` and `spec_char` are ignored.
    :param str name: Name of this augmenter.
    >>> import nlpaug.augmenter.char as nac
    >>> aug = nac.RandomCharAug()
    """

    def __init__(self, action=Action.SUBSTITUTE, name='RandomChar_Aug', aug_char_min=1, aug_char_max=10,
                 aug_char_p=0.3, aug_word_p=0.3, aug_word_min=1, aug_word_max=10, include_upper_case=True,
                 include_lower_case=True, include_numeric=True, min_char=4, swap_mode='adjacent',
                 spec_char='!@#$%^&*()_+', stopwords=None, tokenizer=None, reverse_tokenizer=None, verbose=0,
                 stopwords_regex=None, candidates=None):
        super().__init__(
            action=action, name=name, min_char=min_char, aug_char_min=aug_char_min, aug_char_max=aug_char_max,
            aug_char_p=aug_char_p, aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
            tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, stopwords=stopwords, device='cpu',
            verbose=verbose, stopwords_regex=stopwords_regex, include_special_char=True, include_detail=False)

        self.include_upper_case = include_upper_case
        self.include_lower_case = include_lower_case
        self.include_numeric = include_numeric
        self.swap_mode = swap_mode
        self.spec_char = spec_char
        self.candidates = candidates

        self.model = self.get_model()

    def insert(self, data):
        if not data or not data.strip():
            return data

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        aug_word_idxes = self._get_aug_idxes(
            doc.get_original_tokens(), self.aug_word_min, self.aug_word_max, self.aug_word_p, Method.WORD)
        if aug_word_idxes is None:
            return data

        for token_i, token in enumerate(doc.get_original_tokens()):
            if token_i not in aug_word_idxes:
                continue

            chars = self.token2char(token)
            aug_char_idxes = self._get_aug_idxes(chars, self.aug_char_min, self.aug_char_max, self.aug_char_p,
                                                 Method.CHAR)
            if aug_char_idxes is None:
                continue

            # Insert from right to left so earlier insertions do not shift pending indexes
            aug_char_idxes.sort(reverse=True)
            for char_i in aug_char_idxes:
                chars.insert(char_i, self.sample(self.model, 1)[0])

            # No capitalization alignment as this augmenter tries to simulate random errors
            new_token = ''.join(chars)
            change_seq += 1
            doc.add_change_log(token_i, new_token=new_token, action=Action.INSERT,
                               change_seq=self.parent_change_seq + change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())

    def substitute(self, data):
        if not data or not data.strip():
            return data

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        aug_word_idxes = self._get_aug_idxes(
            doc.get_original_tokens(), self.aug_word_min, self.aug_word_max, self.aug_word_p, Method.WORD)
        if aug_word_idxes is None:
            return data

        for token_i, token in enumerate(doc.get_original_tokens()):
            if token_i not in aug_word_idxes:
                continue

            substitute_token = ''
            chars = self.token2char(token)
            aug_char_idxes = self._get_aug_idxes(chars, self.aug_char_min, self.aug_char_max, self.aug_char_p,
                                                 Method.CHAR)
            if aug_char_idxes is None:
                continue

            for char_i, char in enumerate(chars):
                if char_i not in aug_char_idxes:
                    substitute_token += char
                    continue

                substitute_token += self.sample(self.model, 1)[0]

            # No capitalization alignment as this augmenter tries to simulate random errors
            change_seq += 1
            doc.add_change_log(token_i, new_token=substitute_token, action=Action.SUBSTITUTE,
                               change_seq=self.parent_change_seq + change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())

    def swap(self, data):
        if not data or not data.strip():
            return data

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        aug_word_idxes = self._get_aug_idxes(
            doc.get_original_tokens(), self.aug_word_min, self.aug_word_max, self.aug_word_p, Method.WORD)
        if aug_word_idxes is None:
            return data

        for token_i, token in enumerate(doc.get_original_tokens()):
            if token_i not in aug_word_idxes:
                continue

            chars = self.token2char(token)
            aug_char_idxes = self._get_aug_idxes(chars, self.aug_char_min, self.aug_char_max, self.aug_char_p,
                                                 Method.CHAR)
            if aug_char_idxes is None or len(aug_char_idxes) < 1:
                continue

            for char_i in aug_char_idxes:
                swap_position = self._get_swap_position(char_i, len(chars) - 1, mode=self.swap_mode)
                is_original_upper, is_swap_upper = chars[char_i].isupper(), chars[swap_position].isupper()
                original_chars = chars.copy()
                chars[char_i], chars[swap_position] = original_chars[swap_position], original_chars[char_i]

                # Swap case so capitalization stays attached to position, not character
                if is_original_upper:
                    chars[char_i] = chars[char_i].upper()
                else:
                    chars[char_i] = chars[char_i].lower()
                if is_swap_upper:
                    chars[swap_position] = chars[swap_position].upper()
                else:
                    chars[swap_position] = chars[swap_position].lower()

            swap_token = ''.join(chars)
            change_seq += 1
            doc.add_change_log(token_i, new_token=swap_token, action=Action.SWAP,
                               change_seq=self.parent_change_seq + change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())

    def delete(self, data):
        if not data or not data.strip():
            return data

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        aug_word_idxes = self._get_aug_idxes(
            doc.get_original_tokens(), self.aug_word_min, self.aug_word_max, self.aug_word_p, Method.WORD)
        if aug_word_idxes is None:
            return data

        for token_i, token in enumerate(doc.get_original_tokens()):
            if token_i not in aug_word_idxes:
                continue

            chars = self.token2char(token)
            aug_char_idxes = self._get_aug_idxes(chars, self.aug_char_min, self.aug_char_max, self.aug_char_p,
                                                 Method.CHAR)
            if aug_char_idxes is None or len(aug_char_idxes) < 1:
                continue

            # Delete from right to left so earlier deletions do not shift pending indexes
            aug_char_idxes.sort(reverse=True)
            for i in aug_char_idxes:
                del chars[i]

            # No capitalization alignment as this augmenter tries to simulate random errors
            delete_token = ''.join(chars)
            change_seq += 1
            doc.add_change_log(token_i, new_token=delete_token, action=Action.DELETE,
                               change_seq=self.parent_change_seq + change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())

    def get_model(self):
        if self.candidates:
            return self.candidates

        candidates = []
        if self.include_upper_case:
            candidates += string.ascii_uppercase
        if self.include_lower_case:
            candidates += string.ascii_lowercase
        if self.include_numeric:
            candidates += string.digits
        if self.spec_char:
            candidates += self.spec_char

        return candidates

    def _get_swap_position(self, pos, token_length, mode='adjacent'):
        # token_length is the index of the last character (len(chars) - 1), not the word length
        if mode == 'adjacent':
            if pos == 0:
                # Force swap with the next character if it is the first character
                return pos + 1
            elif pos == token_length:
                # Force swap with the previous character if it is the last character
                return pos - 1
            else:
                return pos + self.sample([-1, 1], 1)[0]
        elif mode == 'middle':
            # Middle Random: https://arxiv.org/pdf/1711.02173.pdf
            # Partner is any interior position except pos; first and last characters are excluded
            candidates = [_ for _ in range(token_length) if _ not in [0, pos, token_length]]
            if len(candidates) == 0:
                return pos
            return self.sample(candidates, 1)[0]
        elif mode == 'random':
            # Fully Random: https://arxiv.org/pdf/1711.02173.pdf
            candidates = [_ for _ in range(token_length) if _ not in [pos]]
            if len(candidates) < 1:
                return pos
            return self.sample(candidates, 1)[0]
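
A minimal usage sketch (an illustration, not part of the module source; it assumes the public
augment() entry point inherited from the nlpaug base augmenter, which in recent releases returns a
list of augmented strings, and the output varies from run to run because sampling is random):

    import nlpaug.augmenter.char as nac

    text = 'The quick brown fox jumps over the lazy dog'

    # Substitute is the default action
    aug = nac.RandomCharAug(action='substitute')
    print(aug.augment(text))  # e.g. ['The quifk brown fox jumps ovek the lazy dog']

    # Swap adjacent characters within each sampled word
    aug = nac.RandomCharAug(action='swap', swap_mode='adjacent')
    print(aug.augment(text))

    # Draw inserted values from a custom pool; include_* flags and spec_char are then ignored
    aug = nac.RandomCharAug(action='insert', candidates=['AAA', '11', '==='])
    print(aug.augment(text))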