Source code for nlpaug.augmenter.word.antonym

"""
    Augmenter that applies semantic-meaning-based substitution to textual input.
"""

from nlpaug.augmenter.word import WordAugmenter
from nlpaug.util import Action, Doc, PartOfSpeech, WarningException, WarningName, WarningCode, WarningMessage
import nlpaug.model.word_dict as nmw


class AntonymAug(WordAugmenter):
    # https://arxiv.org/pdf/1809.02079.pdf
    """
    Augmenter that leverages semantic meaning to substitute words with their antonyms.

    :param str lang: Language of your text. Default value is 'eng'.
    :param float aug_p: Percentage of words that will be augmented.
    :param int aug_min: Minimum number of words that will be augmented.
    :param int aug_max: Maximum number of words that will be augmented. If None is passed, the number of
        augmented words is calculated via aug_p. If the result calculated from aug_p is smaller than
        aug_max, the aug_p result is used; otherwise aug_max is used.
    :param list stopwords: List of words that will be skipped by the augment operation.
    :param str stopwords_regex: Regular expression matching words that will be skipped by the augment operation.
    :param func tokenizer: Customized tokenization process.
    :param func reverse_tokenizer: Customized reverse of tokenization process.
    :param str name: Name of this augmenter.

    >>> import nlpaug.augmenter.word as naw
    >>> aug = naw.AntonymAug()
    """

    def __init__(self, name='Antonym_Aug', aug_min=1, aug_max=10, aug_p=0.3, lang='eng',
                 stopwords=None, tokenizer=None, reverse_tokenizer=None, stopwords_regex=None,
                 verbose=0):
        super().__init__(
            action=Action.SUBSTITUTE, name=name, aug_p=aug_p, aug_min=aug_min, aug_max=aug_max,
            stopwords=stopwords, tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer,
            device='cpu', verbose=verbose, stopwords_regex=stopwords_regex, include_detail=False)

        self.aug_src = 'wordnet'  # TODO: other sources
        self.lang = lang
        self.model = self.get_model(self.aug_src, lang)

    def skip_aug(self, token_idxes, tokens):
        results = []
        for token_idx in token_idxes:
            # Based on https://arxiv.org/pdf/1809.02079.pdf, for antonyms we keep only tokens
            # whose Penn Treebank tag marks a verb, adjective or adverb.
            if tokens[token_idx][1] not in ['VB', 'VBD', 'VBZ', 'VBG', 'VBN', 'VBP',
                                            'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:
                continue

            # Skip tokens that have no antonym candidate.
            # TODO: do it again in a later phase.
            if len(self.get_candidates(tokens, token_idx)) == 0:
                continue

            results.append(token_idx)

        return results

    def _get_aug_idxes(self, tokens):
        aug_cnt = self.generate_aug_cnt(len(tokens))
        word_idxes = self.pre_skip_aug(tokens, tuple_idx=0)
        word_idxes = self.skip_aug(word_idxes, tokens)

        if len(word_idxes) == 0:
            if self.verbose > 0:
                exception = WarningException(name=WarningName.OUT_OF_VOCABULARY,
                                             code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD)
                exception.output()
            return None

        aug_idxes = []
        for aug_idx in word_idxes:
            word_poses = PartOfSpeech.constituent2pos(tokens[aug_idx][1])
            candidates = []
            if word_poses is None or len(word_poses) == 0:
                # Query without a POS filter, as the tag-to-POS mapping is not defined for this tag.
                candidates.extend(self.model.predict(tokens[aug_idx][0]))
            else:
                for word_pos in word_poses:
                    candidates.extend(self.model.predict(tokens[aug_idx][0], pos=word_pos))

            candidates = [c for c in candidates if c.lower() != tokens[aug_idx][0].lower()]

            if len(candidates) > 0:
                candidate = self.sample(candidates, 1)[0]
                aug_idxes.append((aug_idx, candidate))

        if len(aug_idxes) < aug_cnt:
            aug_cnt = len(aug_idxes)

        aug_idxes = self.sample(aug_idxes, aug_cnt)

        return aug_idxes

    def get_candidates(self, tokens, token_idx):
        original_token = tokens[token_idx][0]
        word_poses = PartOfSpeech.constituent2pos(tokens[token_idx][1])
        candidates = []
        if word_poses is None or len(word_poses) == 0:
            # Query without a POS filter, as the tag-to-POS mapping is not defined for this tag.
            candidates.extend(self.model.predict(tokens[token_idx][0]))
        else:
            for word_pos in word_poses:
                candidates.extend(self.model.predict(tokens[token_idx][0], pos=word_pos))

        candidates = [c for c in candidates if c.lower() != original_token.lower()]

        return candidates

    def substitute(self, data):
        if not data or not data.strip():
            return data

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        pos = self.model.pos_tag(doc.get_original_tokens())

        aug_candidates = self._get_aug_idxes(pos)
        if aug_candidates is None or len(aug_candidates) == 0:
            if self.include_detail:
                return data, []
            return data

        aug_idxes, candidates = zip(*aug_candidates)
        if aug_idxes is None or len(aug_idxes) == 0:
            if self.include_detail:
                return data, []
            return data

        for aug_idx, original_token in enumerate(doc.get_original_tokens()):
            # Skip words that were not selected for augmentation.
            if aug_idx not in aug_idxes:
                continue

            candidates = self.get_candidates(pos, aug_idx)

            if len(candidates) > 0:
                candidate = self.sample(candidates, 1)[0]
                candidate = candidate.replace("_", " ").replace("-", " ").lower()
                substitute_token = self.align_capitalization(original_token, candidate)

                if aug_idx == 0:
                    substitute_token = self.align_capitalization(original_token, substitute_token)

                change_seq += 1
                doc.add_change_log(aug_idx, new_token=substitute_token, action=Action.SUBSTITUTE,
                                   change_seq=self.parent_change_seq + change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())

    @classmethod
    def get_model(cls, aug_src, lang):
        if aug_src == 'wordnet':
            return nmw.WordNet(lang=lang, is_synonym=False)
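
A minimal usage sketch follows (not part of the module source). It assumes nlpaug is installed along with the NLTK WordNet data the model depends on; the sample text and output are illustrative only, and depending on the nlpaug version, augment() may return a string or a list of strings.

import nlpaug.augmenter.word as naw

# Substitute up to ~30% of eligible words (verbs, adjectives, adverbs) with a WordNet antonym.
aug = naw.AntonymAug(aug_p=0.3)

text = 'The quick brown fox jumps over the lazy dog'
augmented = aug.augment(text)
# Sampling is random; a possible output:
# 'The quick brown fox jumps over the industrious dog'
print(augmented)

Under the hood, model.predict() resolves antonyms through WordNet. A conceptual equivalent using NLTK directly (an illustration of the idea, not the actual nlpaug.model.word_dict.WordNet implementation):

from nltk.corpus import wordnet

def antonyms(word, pos=None):
    # Collect antonym lemma names across all synsets of the word,
    # optionally restricted to a WordNet POS such as wordnet.ADJ.
    results = []
    for synset in wordnet.synsets(word, pos=pos):
        for lemma in synset.lemmas():
            results.extend(a.name() for a in lemma.antonyms())
    return results

antonyms('lazy', pos=wordnet.ADJ)  # e.g. ['industrious']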