Source code for nlpaug.augmenter.char.random

# Source: https://arxiv.org/pdf/1711.02173.pdf

"""
    Augmenter that applies random character errors to textual input.
"""

import string

from nlpaug.augmenter.char import CharAugmenter
from nlpaug.util import Action, Method, Doc


class RandomCharAug(CharAugmenter):
    # https://arxiv.org/pdf/1711.02173.pdf
    """
    Augmenter that generates character errors using random values. For example, people may type i as o incorrectly.

    :param str action: Possible values are 'insert', 'substitute', 'swap' and 'delete'. If value is 'insert', a new
        character will be injected at a random position. If value is 'substitute', a random character will replace
        the original character. If value is 'swap', characters within the sampled word will be swapped randomly.
        If value is 'delete', a character will be removed randomly.
    :param float aug_char_p: Percentage of characters (per token) that will be augmented.
    :param int aug_char_min: Minimum number of characters that will be augmented.
    :param int aug_char_max: Maximum number of characters that will be augmented. If None is passed, the number of
        augmented characters is calculated via aug_char_p. If the result calculated from aug_char_p is smaller than
        aug_char_max, the calculated result is used; otherwise aug_char_max is used. For example, a 10-character
        token with aug_char_p=0.3 and aug_char_max=10 gets 3 characters augmented.
    :param float aug_word_p: Percentage of words that will be augmented.
    :param int aug_word_min: Minimum number of words that will be augmented.
    :param int aug_word_max: Maximum number of words that will be augmented. If None is passed, the number of
        augmented words is calculated via aug_word_p. If the result calculated from aug_word_p is smaller than
        aug_word_max, the calculated result is used; otherwise aug_word_max is used.
    :param bool include_upper_case: If True, upper-case characters may be included in the augmented data. If a
        `candidates` value is provided, this parameter is ignored.
    :param bool include_lower_case: If True, lower-case characters may be included in the augmented data. If a
        `candidates` value is provided, this parameter is ignored.
    :param bool include_numeric: If True, numeric characters may be included in the augmented data. If a
        `candidates` value is provided, this parameter is ignored.
    :param int min_char: Words shorter than this value are not drawn for augmentation.
    :param swap_mode: When action is 'swap', you may pass 'adjacent', 'middle' or 'random'. 'adjacent' swaps a
        character with one of its neighbours within the same word. 'middle' draws the swap partner from the
        interior of the word, never its first or last character. 'random' swaps characters without constraint.
    :param str spec_char: Special characters that may be included in the augmented data. If a `candidates` value is
        provided, this parameter is ignored.
    :param list stopwords: List of words which will be skipped by the augment operation.
    :param str stopwords_regex: Regular expression matching words which will be skipped by the augment operation.
    :param func tokenizer: Customized tokenization process.
    :param func reverse_tokenizer: Customized reverse of the tokenization process.
    :param List candidates: List of strings to draw augmented values from, e.g. ['AAA', '11', '===']. If a value is
        provided, `include_upper_case`, `include_lower_case`, `include_numeric` and `spec_char` are ignored.
    :param str name: Name of this augmenter.
    >>> import nlpaug.augmenter.char as nac
    >>> aug = nac.RandomCharAug()
    """

    def __init__(self, action=Action.SUBSTITUTE, name='RandomChar_Aug', aug_char_min=1, aug_char_max=10,
                 aug_char_p=0.3, aug_word_p=0.3, aug_word_min=1, aug_word_max=10, include_upper_case=True,
                 include_lower_case=True, include_numeric=True, min_char=4, swap_mode='adjacent',
                 spec_char='!@#$%^&*()_+', stopwords=None, tokenizer=None, reverse_tokenizer=None, verbose=0,
                 stopwords_regex=None, candidates=None):
        super().__init__(
            action=action, name=name, min_char=min_char, aug_char_min=aug_char_min, aug_char_max=aug_char_max,
            aug_char_p=aug_char_p, aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
            tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, stopwords=stopwords, device='cpu',
            verbose=verbose, stopwords_regex=stopwords_regex, include_special_char=True, include_detail=False)

        self.include_upper_case = include_upper_case
        self.include_lower_case = include_lower_case
        self.include_numeric = include_numeric
        self.swap_mode = swap_mode
        self.spec_char = spec_char
        self.candidates = candidates

        self.model = self.get_model()

    def insert(self, data):
        if not data or not data.strip():
            return data

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        aug_word_idxes = self._get_aug_idxes(
            doc.get_original_tokens(), self.aug_word_min, self.aug_word_max, self.aug_word_p, Method.WORD)
        if aug_word_idxes is None:
            return data

        for token_i, token in enumerate(doc.get_original_tokens()):
            if token_i not in aug_word_idxes:
                continue

            chars = self.token2char(token)
            aug_char_idxes = self._get_aug_idxes(chars, self.aug_char_min, self.aug_char_max, self.aug_char_p,
                                                 Method.CHAR)
            if aug_char_idxes is None:
                continue

            # Insert from right to left so earlier insertions do not shift pending indexes
            aug_char_idxes.sort(reverse=True)
            for char_i in aug_char_idxes:
                chars.insert(char_i, self.sample(self.model, 1)[0])

            # No capitalization alignment as this augmenter tries to simulate random errors
            new_token = ''.join(chars)
            change_seq += 1
            doc.add_change_log(token_i, new_token=new_token, action=Action.INSERT,
                               change_seq=self.parent_change_seq + change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())

    def substitute(self, data):
        if not data or not data.strip():
            return data

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        aug_word_idxes = self._get_aug_idxes(
            doc.get_original_tokens(), self.aug_word_min, self.aug_word_max, self.aug_word_p, Method.WORD)
        if aug_word_idxes is None:
            return data

        for token_i, token in enumerate(doc.get_original_tokens()):
            if token_i not in aug_word_idxes:
                continue

            substitute_token = ''
            chars = self.token2char(token)
            aug_char_idxes = self._get_aug_idxes(chars, self.aug_char_min, self.aug_char_max, self.aug_char_p,
                                                 Method.CHAR)
            if aug_char_idxes is None:
                continue

            for char_i, char in enumerate(chars):
                if char_i not in aug_char_idxes:
                    substitute_token += char
                    continue

                substitute_token += self.sample(self.model, 1)[0]

            # No capitalization alignment as this augmenter tries to simulate random errors
            change_seq += 1
            doc.add_change_log(token_i, new_token=substitute_token, action=Action.SUBSTITUTE,
                               change_seq=self.parent_change_seq + change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())

    def swap(self, data):
        if not data or not data.strip():
            return data

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        aug_word_idxes = self._get_aug_idxes(
            doc.get_original_tokens(), self.aug_word_min, self.aug_word_max, self.aug_word_p, Method.WORD)
        if aug_word_idxes is None:
            return data

        for token_i, token in enumerate(doc.get_original_tokens()):
            if token_i not in aug_word_idxes:
                continue

            chars = self.token2char(token)
            aug_char_idxes = self._get_aug_idxes(chars, self.aug_char_min, self.aug_char_max, self.aug_char_p,
                                                 Method.CHAR)
            if aug_char_idxes is None or len(aug_char_idxes) < 1:
                continue

            for char_i in aug_char_idxes:
                swap_position = self._get_swap_position(char_i, len(chars) - 1, mode=self.swap_mode)
                is_original_upper, is_swap_upper = chars[char_i].isupper(), chars[swap_position].isupper()
                original_chars = chars.copy()
                chars[char_i], chars[swap_position] = original_chars[swap_position], original_chars[char_i]

                # Swap case so capitalization stays attached to position, not character
                if is_original_upper:
                    chars[char_i] = chars[char_i].upper()
                else:
                    chars[char_i] = chars[char_i].lower()
                if is_swap_upper:
                    chars[swap_position] = chars[swap_position].upper()
                else:
                    chars[swap_position] = chars[swap_position].lower()

            swap_token = ''.join(chars)
            change_seq += 1
            doc.add_change_log(token_i, new_token=swap_token, action=Action.SWAP,
                               change_seq=self.parent_change_seq + change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())

    def delete(self, data):
        if not data or not data.strip():
            return data

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        aug_word_idxes = self._get_aug_idxes(
            doc.get_original_tokens(), self.aug_word_min, self.aug_word_max, self.aug_word_p, Method.WORD)
        if aug_word_idxes is None:
            return data

        for token_i, token in enumerate(doc.get_original_tokens()):
            if token_i not in aug_word_idxes:
                continue

            chars = self.token2char(token)
            aug_char_idxes = self._get_aug_idxes(chars, self.aug_char_min, self.aug_char_max, self.aug_char_p,
                                                 Method.CHAR)
            if aug_char_idxes is None or len(aug_char_idxes) < 1:
                continue

            # Delete from right to left so earlier deletions do not shift pending indexes
            aug_char_idxes.sort(reverse=True)
            for i in aug_char_idxes:
                del chars[i]

            # No capitalization alignment as this augmenter tries to simulate random errors
            delete_token = ''.join(chars)
            change_seq += 1
            doc.add_change_log(token_i, new_token=delete_token, action=Action.DELETE,
                               change_seq=self.parent_change_seq + change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())

    def get_model(self):
        if self.candidates:
            return self.candidates

        candidates = []
        if self.include_upper_case:
            candidates += string.ascii_uppercase
        if self.include_lower_case:
            candidates += string.ascii_lowercase
        if self.include_numeric:
            candidates += string.digits
        if self.spec_char:
            candidates += self.spec_char

        return candidates

    def _get_swap_position(self, pos, token_length, mode='adjacent'):
        # token_length is the index of the last character (len(chars) - 1), not the word length
        if mode == 'adjacent':
            if pos == 0:
                # Force swap with the next character if it is the first character
                return pos + 1
            elif pos == token_length:
                # Force swap with the previous character if it is the last character
                return pos - 1
            else:
                return pos + self.sample([-1, 1], 1)[0]
        elif mode == 'middle':
            # Middle Random: https://arxiv.org/pdf/1711.02173.pdf
            # Partner is any interior position except pos; first and last characters are excluded
            candidates = [_ for _ in range(token_length) if _ not in [0, pos, token_length]]
            if len(candidates) == 0:
                return pos
            return self.sample(candidates, 1)[0]
        elif mode == 'random':
            # Fully Random: https://arxiv.org/pdf/1711.02173.pdf
            candidates = [_ for _ in range(token_length) if _ not in [pos]]
            if len(candidates) < 1:
                return pos
            return self.sample(candidates, 1)[0]
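
A minimal usage sketch (an illustration, not part of the module source; it assumes the public
augment() entry point inherited from the nlpaug base augmenter, which in recent releases returns a
list of augmented strings, and the output varies from run to run because sampling is random):

    import nlpaug.augmenter.char as nac

    text = 'The quick brown fox jumps over the lazy dog'

    # Substitute is the default action
    aug = nac.RandomCharAug(action='substitute')
    print(aug.augment(text))  # e.g. ['The quifk brown fox jumps ovek the lazy dog']

    # Swap adjacent characters within each sampled word
    aug = nac.RandomCharAug(action='swap', swap_mode='adjacent')
    print(aug.augment(text))

    # Draw inserted values from a custom pool; include_* flags and spec_char are then ignored
    aug = nac.RandomCharAug(action='insert', candidates=['AAA', '11', '==='])
    print(aug.augment(text))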