# Source code for nlpaug.util.file.download

import gzip
import os
import shutil
import tarfile
import urllib
import zipfile

import gdown
import requests


class DownloadUtil:
    """
    Helper functions for downloading and unpacking external dependencies
    (pre-trained word embeddings and model checkpoints).

    Every method is a static method, so the class is used as a namespace.

    >>> from nlpaug.util.file.download import DownloadUtil
    """

    # Supported GloVe archives, keyed by the public ``model_name``.
    _GLOVE_URLS = {
        "glove.6B": "http://nlp.stanford.edu/data/glove.6B.zip",
        "glove.42B.300d": "http://nlp.stanford.edu/data/glove.42B.300d.zip",
        "glove.840B.300d": "http://nlp.stanford.edu/data/glove.840B.300d.zip",
        "glove.twitter.27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip",
    }

    # Supported fastText archives, keyed by the public ``model_name``.
    _FASTTEXT_URLS = {
        "wiki-news-300d-1M": "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip",
        "wiki-news-300d-1M-subword": "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.vec.zip",
        "crawl-300d-2M": "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip",
        "crawl-300d-2M-subword": "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip",
    }

    @staticmethod
    def download_word2vec(dest_dir: str = "."):
        """
        Download the GoogleNews Word2Vec archive from Google Drive and
        decompress it.

        :param str dest_dir: Directory of saving file
        :return: None. The Word2Vec C binary file named
            'GoogleNews-vectors-negative300.bin' is written into ``dest_dir``.

        >>> DownloadUtil.download_word2vec('.')

        """
        file_path = DownloadUtil.download_from_google_drive(
            url="https://drive.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM",
            dest_dir=dest_dir,
            dest_file="GoogleNews-vectors-negative300.bin.gz",
        )
        DownloadUtil.unzip(file_path, dest_dir=dest_dir)

    @staticmethod
    def download_glove(model_name, dest_dir):
        """
        Download a GloVe archive and extract it next to the downloaded file.

        :param str model_name: GloVe pre-trained model name. Possible values are 'glove.6B', 'glove.42B.300d',
            'glove.840B.300d' and 'glove.twitter.27B'
        :param str dest_dir: Directory of saving file
        :raises ValueError: If ``model_name`` is not one of the possible values.

        >>> DownloadUtil.download_glove('glove.6B', '.')

        """
        # Validate before touching the file system or the network.
        url = DownloadUtil._GLOVE_URLS.get(model_name)
        if url is None:
            possible_values = list(DownloadUtil._GLOVE_URLS)
            raise ValueError(
                "Unknown model_name. Possible values are {}".format(possible_values)
            )

        file_path = DownloadUtil.download(url, dest_dir=dest_dir)
        DownloadUtil.unzip(file_path)

    @staticmethod
    def download_fasttext(model_name, dest_dir):
        """
        Download a fastText archive and extract it next to the downloaded file.

        :param str model_name: fastText pre-trained model name. Possible values are 'wiki-news-300d-1M',
            'wiki-news-300d-1M-subword', 'crawl-300d-2M' and 'crawl-300d-2M-subword'
        :param str dest_dir: Directory of saving file
        :raises ValueError: If ``model_name`` is not one of the possible values.

        >>> DownloadUtil.download_fasttext('wiki-news-300d-1M', '.')

        """
        # Validate before touching the file system or the network.
        url = DownloadUtil._FASTTEXT_URLS.get(model_name)
        if url is None:
            possible_values = list(DownloadUtil._FASTTEXT_URLS)
            raise ValueError(
                "Unknown model_name. Possible values are {}".format(possible_values)
            )

        file_path = DownloadUtil.download(url, dest_dir=dest_dir)
        DownloadUtil.unzip(file_path)

    @staticmethod
    def download_back_translation(dest_dir):
        """
        Download the back-translation checkpoints archive and extract it next
        to the downloaded file.

        :param str dest_dir: Directory of saving file

        >>> DownloadUtil.download_back_translation('.')

        """
        url = "https://storage.googleapis.com/uda_model/text/back_trans_checkpoints.zip"
        file_path = DownloadUtil.download(url, dest_dir=dest_dir)
        DownloadUtil.unzip(file_path)

    @staticmethod
    def download(src, dest_dir, dest_file=None):
        """
        Download ``src`` into ``dest_dir`` unless the target file already exists.

        :param str src: URL of the file to download
        :param str dest_dir: Directory of saving file. Created if missing.
        :param str dest_file: File name inside ``dest_dir``. Defaults to the
            last path component of ``src``.
        :return: Path of the (downloaded or already existing) file

        >>> DownloadUtil.download('http://nlp.stanford.edu/data/glove.6B.zip', '.')

        """
        # Imported here so ``urllib.request`` is guaranteed to be loaded; a bare
        # ``import urllib`` does not import the ``request`` submodule.
        import urllib.request

        if dest_dir:
            os.makedirs(dest_dir, exist_ok=True)

        if dest_file is None:
            dest_file = os.path.basename(src)

        dest_path = os.path.join(dest_dir, dest_file)
        if os.path.exists(dest_path):
            # Already downloaded: reuse the existing file.
            return dest_path

        # Stream into a temporary sibling file and rename on success, so an
        # interrupted transfer never leaves a truncated file at ``dest_path``
        # that a later call would mistake for a complete download.
        partial_path = dest_path + ".part"
        try:
            with urllib.request.urlopen(src) as response, open(
                partial_path, "wb"
            ) as output:
                shutil.copyfileobj(response, output)
            os.replace(partial_path, dest_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        return dest_path

    @staticmethod
    def unzip(file_path, dest_dir=None):
        """
        Extract or decompress an archive, chosen by the suffix of ``file_path``.

        Handled suffixes are '.zip', 'tar.gz' / 'tgz', 'tar' and 'bin.gz'.
        Any other suffix is ignored without raising.

        :param str file_path: File path for unzip
        :param str dest_dir: Directory receiving the extracted content.
            Defaults to the directory containing ``file_path``.

        >>> DownloadUtil.unzip('zip_file.zip')

        """

        if dest_dir is None:
            dest_dir = os.path.dirname(file_path)

        # NOTE(review): ``extractall`` trusts the member paths stored in the
        # archive; only use this on archives from trusted sources.
        if file_path.endswith(".zip"):
            with zipfile.ZipFile(file_path, "r") as zip_ref:
                zip_ref.extractall(dest_dir)
        elif file_path.endswith(("tar.gz", "tgz")):
            with tarfile.open(file_path, "r:gz") as tar:
                tar.extractall(dest_dir)
        elif file_path.endswith("tar"):
            with tarfile.open(file_path, "r:") as tar:
                tar.extractall(dest_dir)
        elif file_path.endswith("bin.gz"):
            # Drop only the trailing ".gz" and honour ``dest_dir``.
            out_name = os.path.basename(file_path)[: -len(".gz")]
            if dest_dir:
                os.makedirs(dest_dir, exist_ok=True)
            out_path = os.path.join(dest_dir, out_name)
            with gzip.open(file_path, "rb") as f_in:
                with open(out_path, "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)

    @staticmethod
    def download_from_google_drive(
        url: str = "https://drive.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM",
        dest_dir: str = ".",
        dest_file: str = "/tmp/nlpaug_model.zip",
    ) -> str:
        """
        Download a file from Google Drive with ``gdown``.

        :param str url: Google Drive download URL
        :param str dest_dir: Directory of saving file
        :param str dest_file: File name inside ``dest_dir``. An absolute path
            (such as the default) is used as is and ``dest_dir`` is ignored.
        :return: Whatever ``gdown.download`` returns (annotated as the output path)

        >>> DownloadUtil.download_from_google_drive(dest_dir='.', dest_file='model.zip')

        """
        # os.path.join keeps an absolute ``dest_file`` absolute instead of
        # producing a path such as ".//tmp/nlpaug_model.zip".
        output = os.path.join(dest_dir, dest_file)
        parent = os.path.dirname(output)
        if parent:
            os.makedirs(parent, exist_ok=True)
        return gdown.download(url, output=output, quiet=False)