"""
Augmenter that apply typo error simulation to textual input.
"""
import os
from nlpaug.augmenter.char import CharAugmenter
from nlpaug.util import Action, Method, Doc, LibraryUtil
import nlpaug.model.char as nmc
[docs]class KeyboardAug(CharAugmenter):
# https://arxiv.org/pdf/1711.02173.pdf
"""
Augmenter that simulate typo error by random values. For example, people may type i as o incorrectly.\
One keyboard distance is leveraged to replace character by possible keyboard error.
:param float aug_char_p: Percentage of character (per token) will be augmented.
:param int aug_char_min: Minimum number of character will be augmented.
:param int aug_char_max: Maximum number of character will be augmented. If None is passed, number of augmentation is
calculated via aup_char_p. If calculated result from aug_char_p is smaller than aug_char_max, will use calculated result
from aup_char_p. Otherwise, using aug_max.
:param float aug_word_p: Percentage of word will be augmented.
:param int aug_word_min: Minimum number of word will be augmented.
:param int aug_word_max: Maximum number of word will be augmented. If None is passed, number of augmentation is
calculated via aup_word_p. If calculated result from aug_word_p is smaller than aug_word_max, will use calculated result
from aug_word_p. Otherwise, using aug_max.
:param list stopwords: List of words which will be skipped from augment operation.
:param str stopwords_regex: Regular expression for matching words which will be skipped from augment operation.
:param func tokenizer: Customize tokenization process
:param func reverse_tokenizer: Customize reverse of tokenization process
:param bool include_special_char: Include special character
:param bool include_upper_case: If True, upper case character may be included in augmented data.
:param bool include_numeric: If True, numeric character may be included in augmented data.
:param int min_char: If word less than this value, do not draw word for augmentation
:param str model_path: Loading customize model from file system
:param str lang: Indicate built-in language model. Default value is 'en'. Possible values are 'en', 'th' (Thai),
'tr'(Turkish), 'de'(German), 'es'(Spanish), 'fr'(French), 'it'(Italian), 'nl'(Dutch), 'pl'(Polish),
'uk'(Ukrainian), 'he'(Hebrew). If custom model is used (passing model_path), this value will be ignored.
:param str name: Name of this augmenter
>>> import nlpaug.augmenter.char as nac
>>> aug = nac.KeyboardAug()
"""
def __init__(self, name='Keyboard_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
aug_word_p=0.3, aug_word_min=1, aug_word_max=10, stopwords=None,
tokenizer=None, reverse_tokenizer=None, include_special_char=True, include_numeric=True,
include_upper_case=True, lang="en", verbose=0, stopwords_regex=None, model_path=None,
min_char=4):
super().__init__(
action=Action.SUBSTITUTE, name=name, min_char=min_char, aug_char_min=aug_char_min, aug_char_max=aug_char_max,
aug_char_p=aug_char_p, aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, stopwords=stopwords, device='cpu',
verbose=verbose, stopwords_regex=stopwords_regex, include_special_char=include_special_char,
include_detail=False)
# TODO: support other type of keyboard
self.keyboard_type = 'qwerty'
self.include_special_char = include_special_char
self.include_numeric = include_numeric
self.include_upper_case = include_upper_case
self.include_lower_case = True
self.lang = lang
if model_path is None:
lang_list = set(
map(
lambda file_name: file_name.replace(".json", ""),
os.listdir(
os.path.join(LibraryUtil.get_res_dir(), "char", "keyboard")
),
)
)
if lang not in lang_list:
raise ValueError(
"Only support en and th now. You may provide the keyboard mapping "
'such that we can support "{}"'.format(lang)
)
self.model_path = os.path.join(
LibraryUtil.get_res_dir(), "char", "keyboard", lang + ".json"
)
else:
self.model_path = model_path
self.model = self.get_model(include_special_char, include_numeric, include_upper_case, lang, self.model_path)
def skip_aug(self, token_idxes, tokens):
results = []
for token_idx in token_idxes:
char = tokens[token_idx]
if char in self.model.model and len(self.model.predict(char)) > 0:
results.append(token_idx)
return results
def substitute(self, data):
if not data or not data.strip():
return data
change_seq = 0
doc = Doc(data, self.tokenizer(data))
aug_word_idxes = self._get_aug_idxes(doc.get_original_tokens(), self.aug_word_min,
self.aug_word_max, self.aug_word_p, Method.WORD)
for token_i, token in enumerate(doc.get_original_tokens()):
if token_i not in aug_word_idxes:
continue
new_token = ''
chars = self.token2char(token)
aug_char_idxes = self._get_aug_idxes(chars, self.aug_char_min, self.aug_char_max,
self.aug_char_p, Method.CHAR)
if aug_char_idxes is None:
continue
for char_i, char in enumerate(chars):
if char_i not in aug_char_idxes:
new_token += char
continue
new_token += self.sample(self.model.predict(chars[char_i]), 1)[0]
# No capitalization alignment as this augmenter try to simulate typo
change_seq += 1
doc.add_change_log(token_i, new_token=new_token, action=Action.SUBSTITUTE,
change_seq=self.parent_change_seq+change_seq)
if self.include_detail:
return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
else:
return self.reverse_tokenizer(doc.get_augmented_tokens())
@classmethod
def get_model(cls, special_char=True, numeric=True, upper_case=True, lang="en", model_path=None):
return nmc.Keyboard(special_char=special_char, numeric=numeric, upper_case=upper_case, lang=lang,
model_path=model_path)