Source code for mzutils.os_funcs

import codecs
import errno
import os
import shutil
import tarfile
import time
import zipfile
from inspect import getfullargspec

import nltk


def parent_dir_and_name(file_path):
    """
    Split a path into its absolute parent directory and its last component.

    >>> file_path="a/b.c"
    >>> parent_dir_and_name(file_path)
    ('/root/.../a', 'b.c')
    :param file_path: relative or absolute path.
    :return: tuple ``(absolute_parent_dir, name)``.
    """
    absolute_path = os.path.abspath(file_path)
    directory, name = os.path.split(absolute_path)
    return directory, name


def basename_and_extension(file_path):
    """
    Split the last component of a path into its stem and its extension.

    >>> file_path="a/b.c"
    >>> basename_and_extension(file_path)
    ('b', '.c')
    :param file_path: path whose final component is inspected.
    :return: tuple ``(stem, extension)``; the extension keeps its leading dot
        and is an empty string when there is none.
    """
    file_name = os.path.basename(file_path)
    stem, extension = os.path.splitext(file_name)
    return stem, extension


def get_things_in_loc(in_path, just_files=True, endswith=None):
    """
    List the absolute paths of the files found at a location.

    in_path can be a file path or a dir path. If it is a dir its files are
    listed, otherwise the files of its parent dir are listed.
    just_files=False will let the function go recursively into the subdirs.

    :param in_path: file or directory path.
    :param just_files: when True only the files directly in the directory are
        returned; when False sub-directories are searched as well.
    :param endswith: None (keep every file), a single suffix string, or an
        iterable of suffixes; a file is kept when it ends with any of them.
    :return: list of absolute file paths, or None (after printing a message)
        when in_path does not exist.
    """
    # TODO: check for file
    if not os.path.exists(in_path):
        print(str(in_path) + " does not exists!")
        return
    if not os.path.isdir(in_path):
        in_path = os.path.dirname(os.path.abspath(in_path))

    # Normalise to a tuple so that a bare string such as ".txt" is treated as
    # one suffix instead of being iterated character by character.
    if endswith is None:
        suffixes = None
    elif isinstance(endswith, str):
        suffixes = (endswith,)
    else:
        suffixes = tuple(endswith)

    re_list = []
    for name in os.listdir(in_path):
        name_path = os.path.abspath(os.path.join(in_path, name))
        if os.path.isfile(name_path):
            if suffixes is None or name_path.endswith(suffixes):
                re_list.append(name_path)
        elif not just_files and os.path.isdir(name_path):
            re_list += get_things_in_loc(name_path, just_files=just_files, endswith=suffixes)
    return re_list


def get_checkpoints_in_loc(in_path, keywords=('checkpoint-',), files_or_folders='folders'):
    """
    Find the entries of a directory whose names contain all keywords.

    The function is meant to grab all checkpoint-XXXX in a folder.
    Again, in_path can be a file path or a dir path; for a file path the
    parent directory is searched. The search is not recursive.

    :param in_path: file or directory path.
    :param keywords: a keyword string or an iterable of keyword strings; an
        entry is kept only when its name (not its parent path) contains every
        keyword.
    :param files_or_folders: 'files' to return matching files, 'folders' to
        return matching directories; any other value yields an empty list.
    :return: list of absolute paths, or None (after printing a message) when
        in_path does not exist.
    """
    if not os.path.exists(in_path):
        print(str(in_path) + " does not exists!")
        return
    if not os.path.isdir(in_path):
        in_path = os.path.dirname(os.path.abspath(in_path))
    if isinstance(keywords, str):
        # A bare string would otherwise be tested one character at a time.
        keywords = (keywords,)

    re_list = []
    for name in os.listdir(in_path):
        # Match on the entry name only: matching on the absolute path would
        # let a keyword that occurs in a parent directory select everything.
        if not all(keyword in name for keyword in keywords):
            continue
        name_path = os.path.abspath(os.path.join(in_path, name))
        if files_or_folders == 'files' and os.path.isfile(name_path):
            re_list.append(name_path)
        elif files_or_folders == 'folders' and os.path.isdir(name_path):
            re_list.append(name_path)
    return re_list


def clean_dir(dir_path, just_files=True):
    """
    Empty a directory without removing the directory itself.

    A message is printed and nothing is deleted when dir_path does not exist
    or is not a directory.

    :param dir_path: directory to clean.
    :param just_files: if True only the files directly inside dir_path are
        deleted; if False the sub-directory trees are removed as well.
    :return: None
    """
    if not os.path.exists(dir_path):
        print(str(dir_path) + " does not exists!")
        return
    if not os.path.isdir(dir_path):
        print(str(dir_path) + " has to be a directory!")
        return

    for entry in os.listdir(dir_path):
        entry_path = os.path.join(dir_path, entry)
        if os.path.isfile(entry_path):
            os.remove(entry_path)
            continue
        if just_files:
            # Sub-directories are left untouched in this mode.
            continue
        if os.path.isdir(entry_path):
            shutil.rmtree(entry_path)


def mkdir_p(dir_path):
    """
    mkdir -p functionality in python: create dir_path together with any
    missing parent directories, and do nothing if the directory already
    exists.

    :param dir_path: directory to create.
    :return: None
    :raises FileExistsError: if dir_path exists but is not a directory.
    :raises OSError: for any other failure reported by os.makedirs.
    """
    # exist_ok=True already tolerates an existing directory. No EEXIST filter
    # is applied on top of it, because that would also hide the case where
    # dir_path is an existing regular file and no directory gets created.
    os.makedirs(dir_path, exist_ok=True)


def unzip_all(dir_path, target_path, endswith=".zip"):
    """
    Extract every archive of one kind found directly inside dir_path.

    :param dir_path: directory that contains the archives (not searched
        recursively).
    :param target_path: directory the archive contents are extracted into.
    :param endswith: ".zip", ".tar.gz" or ".tar"; selects both the files to
        process and the way they are opened. Any other value extracts nothing.
    :return: None
    """
    tar_modes = {".tar.gz": "r:gz", ".tar": "r:"}
    for item in os.listdir(dir_path):
        if not item.endswith(endswith):
            continue
        archive_path = os.path.join(dir_path, item)
        if endswith == ".zip":
            archive = zipfile.ZipFile(archive_path, 'r')
        elif endswith in tar_modes:
            archive = tarfile.open(archive_path, tar_modes[endswith])
        else:
            continue
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        with archive:
            archive.extractall(target_path)


def documents_segementor_on_word_length(documents_dir, store_dir, max_length, language='english',
                                        clean_store_dir=False):
    """
    Segment every document of a directory into smaller documents, based on the
    nltk tokenized word length. Sentence structure is kept.

    :param documents_dir: directory holding the documents; only the files
        directly inside it are processed.
    :param store_dir: directory receiving the segments; created if missing.
    :param max_length: document segments' max length.
    :param language: language passed on to nltk, default english.
    :param clean_store_dir: if True, store_dir is emptied (files and
        sub-directories) before anything is written.
    :return: number of documents after segmentation.
    """
    if not os.path.isdir(documents_dir):
        raise Exception("documents_dir: where all documents located.")
    if not os.path.isdir(store_dir):
        os.mkdir(store_dir)
    if clean_store_dir:
        clean_dir(store_dir, just_files=False)

    # Snapshot the file names first so files written while segmenting are not
    # picked up as additional input.
    document_names = []
    for entry in os.listdir(documents_dir):
        if os.path.isfile(os.path.join(documents_dir, entry)):
            document_names.append(entry)

    return sum(
        helper_document_segmentor(documents_dir, store_dir, entry, max_length, language)
        for entry in document_names
    )


# ------------------helper funcs-----------------------------


def helper_document_segmentor(documents_dir, store_dir, name, max_length, language):
    """
    Split one document into segments on sentence boundaries and save them.

    Sentences are accumulated into a segment while the running nltk word count
    stays below max_length. A single sentence that alone reaches max_length is
    never cut: it is appended to the segment currently being built, that
    segment is closed, and a warning is printed.

    :param documents_dir: directory containing the source document.
    :param store_dir: directory the segments are written to.
    :param name: file name of the document inside documents_dir.
    :param max_length: word-count limit that closes a segment.
    :param language: language handed to the nltk tokenizers.
    :return: number of segments written.
    """
    with codecs.open(os.path.join(documents_dir, name), "r", "utf-8") as fp:
        filecontent = fp.read()

    documents = []
    document = ""
    word_count = 0
    for sentence in nltk.sent_tokenize(filecontent, language):
        # Tokenise each sentence exactly once, including when it forces the
        # current segment to be flushed first.
        current_count = len(nltk.word_tokenize(sentence, language))
        if current_count >= max_length:
            documents.append(document + sentence + " ")
            document = ""
            word_count = 0
            print("Warning: " + "there is a sentence with word length " + str(
                current_count) + " , but the maximum document length is " + str(max_length))
            continue
        if word_count + current_count >= max_length:
            # The sentence does not fit: close the segment and start a new one.
            documents.append(document)
            document = ""
            word_count = 0
        document = document + sentence + " "
        word_count = word_count + current_count

    # Keep the trailing segment only when it has content, so an oversized
    # final sentence does not produce an extra empty file. An empty source
    # document still yields one (empty) segment so there is something to save.
    if document or not documents:
        documents.append(document)
    helper_save_documents(store_dir, name, documents)
    return len(documents)


def helper_save_documents(store_dir, name, documents):
    """
    Write document segments to store_dir as UTF-8 files.

    The first segment is written to ``store_dir/name``, replacing any existing
    file. Each later segment goes to the path returned by
    helper_check_existance_and_add_timestamp for the same name.

    :param store_dir: directory the files are written to.
    :param name: file name used for the first segment.
    :param documents: non-empty list of text segments.
    :return: None
    """
    first_path = os.path.join(store_dir, name)
    with codecs.open(first_path, "w+", "utf-8") as first_fp:
        first_fp.write(documents[0])

    remaining = documents[1:]
    for segment in remaining:
        segment_path = helper_check_existance_and_add_timestamp(store_dir, name)
        with codecs.open(segment_path, "w+", "utf-8") as segment_fp:
            segment_fp.write(segment)


def helper_check_existance_and_add_timestamp(store_dir, name):
    """
    Build a path inside store_dir that does not exist yet.

    Starting from ``store_dir/name``, the current time in milliseconds is
    appended to the part before the extension, repeatedly, until no file or
    directory exists at the resulting path.

    :param store_dir: directory the path should live in.
    :param name: desired file name.
    :return: a path that did not exist at the time of the check.
    """
    millis = int(round(time.time() * 1000))
    suffix = str(millis)
    stem, ext = os.path.splitext(os.path.join(store_dir, name))
    candidate = stem + ext
    while os.path.exists(candidate):
        stem = stem + suffix
        candidate = stem + ext
    return candidate


def loop_through_copy_files_to_one_dir(looped_dir, target_dir, include_link=False):
    """
    Loop through nested directories and gather all the files into one target
    directory.

    NOTE: despite "copy" in the name, entries are transferred with
    shutil.move, so they no longer exist under looped_dir afterwards. The
    (now empty) sub-directories are left in place.

    :param looped_dir: root directory to walk.
    :param target_dir: a directory string; files land directly inside it under
        their own name.
    :param include_link: if True, entries that are neither a directory nor a
        regular file (for instance dangling symbolic links) are moved too, at
        every depth of the tree.
    :return: None
    :raises Exception: if looped_dir or target_dir is not a directory.
    """
    if not os.path.isdir(looped_dir):
        raise Exception("looped_dir: a directory.")
    if not os.path.isdir(target_dir):
        raise Exception("target_dir: a directory.")
    for name in os.listdir(looped_dir):
        thing = os.path.join(looped_dir, name)
        if os.path.isdir(thing):
            # Forward include_link so nested directories follow the same rule
            # as the top level.
            loop_through_copy_files_to_one_dir(thing, target_dir, include_link=include_link)
        elif os.path.isfile(thing) or include_link:
            shutil.move(thing, os.path.join(target_dir, name))
    return


def loop_through_return_abs_file_path(looped_dir):
    """
    Walk nested directories and collect the absolute path of every file.

    :param looped_dir: root directory to walk.
    :return: list of absolute file paths; the files of a sub-directory appear
        at the position where that sub-directory was listed.
    :raises Exception: if looped_dir is not a directory.
    """
    if not os.path.isdir(looped_dir):
        raise Exception("looped_dir: a directory.")

    collected = []
    for entry in os.listdir(looped_dir):
        entry_path = os.path.abspath(os.path.join(looped_dir, entry))
        if os.path.isdir(entry_path):
            collected.extend(loop_through_return_abs_file_path(entry_path))
        elif os.path.isfile(entry_path):
            collected.append(entry_path)
    return collected


def loop_through_store_files_to_list(looped_dir, encoding="utf-8"):
    """
    Walk nested directories and read every file into a list, one list element
    per file.
    This function does not care about symbolic link inside the nested
    directories.

    :param looped_dir: root directory to walk.
    :param encoding: text encoding used to decode the files.
    :return: list with the full text of each file.
    :raises Exception: if looped_dir is not a directory.
    """
    if not os.path.isdir(looped_dir):
        raise Exception("looped_dir: a directory.")

    contents = []
    for entry in os.listdir(looped_dir):
        entry_path = os.path.join(looped_dir, entry)
        if os.path.isdir(entry_path):
            contents.extend(loop_through_store_files_to_list(entry_path, encoding))
        elif os.path.isfile(entry_path):
            with codecs.open(entry_path, 'r', encoding) as fp:
                contents.append(fp.read())
    return contents


def loop_through_store_lines_to_list(looped_dir, encoding="utf-8"):
    """
    Loop through nested directories and store the lines of all files into one
    flat list.
    This function does not care about symbolic link inside the nested
    directories.

    :param looped_dir: root directory to walk.
    :param encoding: text encoding used to decode the files.
    :return: list of lines (line endings kept) from every file at every depth.
    :raises Exception: if looped_dir is not a directory.
    """
    re_list = []
    if not os.path.isdir(looped_dir):
        raise Exception("looped_dir: a directory.")
    for thing in os.listdir(looped_dir):
        thing = os.path.join(looped_dir, thing)
        if os.path.isdir(thing):
            # Recurse into this same function so nested files contribute
            # lines, not whole-file strings.
            re_list += loop_through_store_lines_to_list(thing, encoding)
        elif os.path.isfile(thing):
            with codecs.open(thing, 'r', encoding) as fp:
                re_list += fp.readlines()
    return re_list


def save__init__args(values, underscore=False, overwrite=False, subclass_only=False):
    """
    Use in `__init__()` only; assign all args/kwargs to instance attributes.

    The argument names are gathered from the `__init__` of every class in the
    instance's MRO that defines one. To maintain precedence of args provided
    to subclasses, call this in the subclass before `super().__init__()` if
    `save__init__args()` also appears in base class, or use `overwrite=True`.
    With `subclass_only==True`, only args/kwargs listed in current subclass
    apply.

    :param values: the `locals()` of the calling `__init__`; must contain
        'self'.
    :param underscore: if True, attributes are stored with a leading "_".
    :param overwrite: if True, attributes that already exist are replaced.
    :param subclass_only: if True, only the class of the instance itself is
        inspected.

    usage:
    >>> class AgentModel:
    ...     def __init__(
    ...             self,
    ...             meta_info_attr_size=7,
    ...             obs_shape=(3, 64, 64),
    ...             reward_shape=(1,),
    ...             n_agents=1,
    ...             obs_last_action=False,
    ...             obs_agent_id=True,
    ...             rnn_hidden_dim=64,
    ...             based_on='observation',
    ...             n_actions=11,
    ...             use_cuda=True,):
    ...         save__init__args(locals())
    >>> a=AgentModel()
    >>> a.rnn_hidden_dim
    64
    """
    instance = values['self']
    lineage = type(instance).mro()
    if subclass_only:
        lineage = lineage[:1]

    # Every __init__ parameter (minus its first, i.e. self) of each class that
    # defines its own __init__, in MRO order.
    arg_names = [
        arg_name
        for klass in lineage
        if '__init__' in vars(klass)
        for arg_name in getfullargspec(klass.__init__).args[1:]
    ]

    attr_prefix = "_" if underscore else ""
    for arg_name in arg_names:
        if arg_name not in values:
            continue
        attr_name = attr_prefix + arg_name
        if hasattr(instance, attr_name) and not overwrite:
            # Already set (e.g. by a subclass): keep the existing value.
            continue
        setattr(instance, attr_name, values[arg_name])


def set_local_vars_from_yaml(yaml_loc, name_space_dict):
    """
    Load a yaml file and merge its top-level keys into a namespace dictionary.

    :param yaml_loc: your yaml file location.
    :param name_space_dict: a dictionary that contains the variables, e.g.
        locals() or globals(); it is updated in place.
    :return: None

    For example, if your yaml file contains a variable called num_workers and
    the value of which is an integer 4, then
    >>> set_local_vars_from_yaml('path_to_file.yaml', locals())
    >>> num_workers
    4

    NOTE: this works when locals() is the module namespace. In CPython,
    updating the dictionary returned by locals() inside a function does not
    create or rebind real local variables; pass globals() or an explicit
    dictionary there.
    """
    import yaml
    with open(yaml_loc, 'r') as fp:
        config_dict = yaml.safe_load(fp)
    if config_dict is None:
        # An empty yaml file loads as None; there is nothing to merge.
        return
    name_space_dict.update(config_dict)


class TimeRecorder:
    """
    Record the time elapsed between successive calls to record().

    Attributes set by __init__:
    init_time -- time.time() at construction.
    base_time -- time.time() at construction or at the latest record() call.
    times     -- list of (name, elapsed_seconds) tuples, in call order.
    """

    def __init__(self):
        """
        need to import time.
        """
        now = time.time()
        self.init_time = now
        self.base_time = now
        self.times = []

    def record(self, name=""):
        """Store (name, seconds since the previous record or construction)."""
        now = time.time()
        elapsed = now - self.base_time
        self.times.append((name, elapsed))
        # The next interval is measured from this call.
        self.base_time = now

    def get_times(self):
        """Return the list of recorded (name, elapsed) tuples itself, not a copy."""
        return self.times

    def get_times_str(self):
        """Return the recorded times rendered with str()."""
        recorded = self.get_times()
        return str(recorded)