#pylint:disable=logging-fstring-interpolation, too-many-arguments
""" Research class for muliple parallel experiments. """
import time
import itertools
import warnings
import psutil
import multiprocess as mp
import tqdm
from .domain import Domain
from .distributor import Distributor, DynamicQueue
from .experiment import Experiment, Executor
from ..utils import to_list
from .storage import BaseResearchStorage, LocalResearchStorage, MemoryResearchStorage
from ..utils_random import make_seed_sequence
class Research:
""" Research is an instrument to run multiple parallel experiments with different combinations of
parameters called experiment configs. Configs are produced by :class:`domain.Domain` (some kind of
parameters grid.)
Parameters
----------
name : str, optional
name (relative path) of the research and corresponding folder to store results, by default 'research'.
domain : Domain, optional
grid of parameters (see :class:`domain.Domain`) to produce experiment configs, by default None.
experiment : Experiment, optional
        description of the experiment (see :class:`experiment.Experiment`), by default None. The experiment can be
        defined explicitly as a parameter or constructed by Research methods (:meth:`.add_callable`,
        :meth:`.add_generator`, etc.).
n_configs : int, optional
the number of configs to get from domain (see `n_items` of :meth:`domain.Domain.set_iter_params`),
by default None.
n_reps : int, optional
the number of repetitions for each config (see `n_reps` of :meth:`domain.Domain.set_iter_params`), by default 1.
repeat_each : int, optional
see `repeat_each` of :meth:`domain.Domain.set_iter_params`, by default 100.
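
    Examples
    --------
    A minimal usage sketch. The parameter name and the callable `my_unit` are placeholders; the dict form of
    `Domain` and the exact `add_callable` argument layout are assumptions (see :class:`domain.Domain` and
    :class:`experiment.Experiment` for the exact API)::

        from batchflow.research import Research, Domain    # assuming both are exposed from batchflow.research

        domain = Domain({'learning_rate': [0.01, 0.001]})  # assuming Domain accepts a dict of parameter values
        research = Research(name='my_research', domain=domain, n_reps=2)
        research.add_callable(my_unit)                      # placeholder user-defined callable
        research.run(workers=2, n_iters=100)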
"""
def __init__(self, name='research', domain=None, experiment=None, n_configs=None, n_reps=1, repeat_each=None):
self.name = name
self.domain = Domain(domain)
self.experiment = experiment or Experiment()
self.n_configs = n_configs
self.n_reps = n_reps
self.repeat_each = repeat_each
self.create_id_prefix = False
self.redirect_stdout = True
self.redirect_stderr = True
self.storage = None
self._env = {} # current state of git repo and other environment information.
self.workers = 1
self.branches = 1
self.n_iters = None
self.devices = None
self.executor_class = Executor
self.dump_results = True
self.parallel = True
self.executor_target = 'threads'
self.loglevel = 'info'
self.bar = True
self.detach = False
self.tasks_queue = None
self.distributor = None
self.monitor = None
self.logger = None
self.process = None
self.debug = False
self.finalize = True
self.random_seed = None
self.profile = False
self.memory_ratio = None
self.n_gpu_checks = 3
self.gpu_check_delay = 5
self.dump_monitor = True
self.is_loaded = False
self._is_executed = False
self._env_meta_to_collect = []
def __getattr__(self, key):
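        # Shared monitor counters are read from the monitor; any other unknown attribute is treated as an
        # experiment-construction method and proxied to `self.experiment`, returning `self` for chaining.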
if self.monitor is not None and key in self.monitor.SHARED_VARIABLES:
return getattr(self.monitor, key)
def _method(*args, **kwargs):
getattr(self.experiment, key)(*args, **kwargs)
return self
_method.__doc__ = getattr(self.experiment, key).__doc__
return _method
def __getstate__(self):
return self.__dict__
def __setstate__(self, d):
self.__dict__.update(d)
    def update_domain(self, function, when, **kwargs):
""" Add domain update functions or update parameters.
Parameters
----------
function : callable or None
            function that updates the domain; it returns a new domain or None (meaning no update).
when : int, str or list, optional
            iterations to update (see `when` of :class:`ExecutableUnit`), by default 1.
kwargs : dict
update function parameters.
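
        Examples
        --------
        A sketch, assuming `research` is a Research instance; the update function body is a placeholder,
        and `when='last'` is used because this module treats `'last'` as a recognized update moment::

            def update(**kwargs):
                # return a new Domain to continue with, or None to keep the current one
                return None

            research.update_domain(update, when='last')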
"""
self.domain.set_update(function, when, **kwargs)
return self
    def get_devices(self, devices):
""" Return list if lists. Each sublist consists of devices for each branch.
Parameters
----------
devices : int, str, None or list of them
devices to split between workers and branches. (see Example below)
Returns
-------
list of lists of lists
The first nesting level corresponds to workers.
The second to branches.
            The third is a list of devices for the current branch.
            For example, the worker with index 2 and its branch with index 3 will get the list of devices `devices[2][3]`.
Examples
--------
For 3 workers and 2 branches::
None -> [[[None], [None]], [[None], [None]], [[None], [None]]]
1 -> [[['1'], ['1']], [['1'], ['1']], [['1'], ['1']]]
[1, 2] -> [[['1'], ['1']], [['1'], ['2']], [['2'], ['2']]]
[1, 2, 3, 4, 5] -> [[['1'], ['2']], [['3'], ['4']], [['5'], ['1']]]
            [0, 1, ..., 11] -> [[['0', '1'], ['2', '3']],
[['4', '5'], ['6', '7']],
[['8', '9'], ['10', '11']]]
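
        The method can also be called directly; a sketch for 3 workers and 2 branches (assigning `workers`
        and `branches` by hand is only for illustration, normally they are set by :meth:`.run`)::

            research = Research()
            research.workers, research.branches = 3, 2
            research.get_devices([1, 2])
            # [[['1'], ['1']], [['1'], ['2']], [['2'], ['2']]]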
"""
n_branches = self.branches if isinstance(self.branches, int) else len(self.branches)
n_workers = self.workers if isinstance(self.workers, int) else len(self.workers)
total_n_branches = n_workers * n_branches
if devices is None:
devices = [[[None]] * n_branches] * n_workers
if isinstance(devices, (int, str)):
devices = [devices]
if isinstance(devices[0], (int, str)):
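            # a flat list of device ids: repeat or pad it so that devices and branches divide evenly,
            # then regroup it into the nested structure below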
if total_n_branches > len(devices):
_devices = list(itertools.chain.from_iterable(
zip(*itertools.repeat(devices, total_n_branches // len(devices)))
))
devices = _devices + devices[:total_n_branches % len(devices)]
else:
devices = devices + devices[:-len(devices) % (total_n_branches)]
if total_n_branches % len(devices) == 0:
branches_per_device = total_n_branches // len(devices)
devices = list(itertools.chain.from_iterable(itertools.repeat(x, branches_per_device) for x in devices))
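        # regroup the flat device list into nested [worker][branch][device] lists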
if len(devices) % total_n_branches == 0:
devices_per_branch = len(devices) // total_n_branches
devices = [
[
[
devices[n_branches * devices_per_branch * i + devices_per_branch * j + k]
for k in range(devices_per_branch)
] for j in range(n_branches)
] for i in range(n_workers)
]
if isinstance(devices[0], list):
            def _transform_item(x):
                if x is None:
                    return []
                return [str(item) if isinstance(item, int) else item for item in to_list(x)]
devices = [[_transform_item(branch_config) for branch_config in worker_config] for worker_config in devices]
return devices
    def run(self, name=None, workers=1, branches=1, n_iters=None, devices=None, executor_class=Executor,
dump_results=True, parallel=True, executor_target='threads', loglevel=None, bar=True, detach=False,
debug=False, finalize=True, git_meta=False, env_meta=False, seed=None, profile=False,
memory_ratio=None, n_gpu_checks=3, gpu_check_delay=5, create_id_prefix=False,
redirect_stdout=True, redirect_stderr=True):
""" Run research.
Parameters
----------
name : str, optional
redefine name of the research (if needed), by default None.
workers : int or list of Config instances, optional
            number of parallel workers, by default 1. If int, the number of parallel workers executing experiments.
            If a list of Configs, one config per worker; each worker's config is appended to the configs produced
            by the domain.
        branches : int or list of Config instances, optional
            number of branches with different configs executed with the same root, by default 1.
            If a list of Configs, one config per branch; each branch's config is appended to the configs produced
            by the domain.
        n_iters : int, optional
            number of experiment iterations, by default None. None means that the experiment is executed until
            a StopIteration exception is raised.
devices : str or list, optional
devices to split between workers and branches, by default None.
executor_class : Executor-inherited class, optional
            executor class for experiments, by default `Executor`.
dump_results : bool, optional
dump results or not, by default True.
parallel : bool, optional
execute experiments in parallel in separate processes or not, by default True.
executor_target : 'for' or 'threads', optional
how to execute branches, by default 'threads'.
loglevel : str, optional
            logging level, by default None.
bar : bool or class
            whether to use a progress bar, by default True.
detach : bool, optional
run research in separate process or not, by default False.
debug : bool, optional
If False, continue research after exceptions. If True, raise Exception. Can be used only with
`parallel=False` and `executor_target='for'`, by default False.
finalize : bool, optional
            whether to continue the experiment iteration after an exception in some unit, by default True.
git_meta : bool, optional
            attach git repo state or not (see :meth:`.Research.attach_git_meta`).
env_meta : bool, optional
attach env meta or not (see :meth:`.Research.attach_env_meta`).
seed : bool or int or object with a seed sequence attribute
see :meth:`~batchflow.utils_random.make_seed_sequence`.
profile : bool, optional
            perform Research profiling or not, by default False.
memory_ratio : float or None, optional
            the ratio of free memory required on the worker's devices to start an experiment, by default None.
            If None, the check is skipped.
        n_gpu_checks : int, optional
            the number of such memory checks, by default 3.
        gpu_check_delay : float, optional
            time in seconds between checks, by default 5.
        create_id_prefix : bool or int, optional
            add a prefix to experiment ids so that they can be sorted by the order of parameters in the domain,
            by default False. If int, the number of digits used to format the parameter code.
redirect_stdout, redirect_stderr : int or bool, optional
            how to redirect stdout/stderr to files:
            0 or False - no redirection,
            True - redirect to the common research file "stdout.txt"/"stderr.txt" when `dump_results=True`,
            or to separate items in `research.storage.experiments_stdout` when `dump_results=False`,
            1 - redirect to the common research file "stdout.txt"/"stderr.txt" (only when `dump_results=True`),
            2 - redirect output streams of experiments into separate files in their experiment folders,
            3 - redirect to the common file and to separate experiment files (only when `dump_results=True`).
Returns
-------
Research instance
**How does it work**
        At each iteration, all units of the experiment are executed in the order in which they were added.
        If an `update_domain` callable is defined, the domain is updated with the corresponding function
        according to the `when` parameter of :meth:`~.Research.update_domain`.
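
        Examples
        --------
        An illustrative call, assuming `research` is a Research instance; worker counts, device ids and
        iteration numbers are arbitrary::

            research.run(workers=2, branches=2, n_iters=1000, devices=[0, 1, 2, 3],
                         dump_results=True, redirect_stdout=3, bar=True)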
"""
self.name = name or self.name
self.workers = workers
self.branches = branches
self.devices = self.get_devices(devices)
self.executor_class = executor_class
self.dump_results = dump_results
self.parallel = parallel
self.executor_target = executor_target
self.loglevel = loglevel
self.bar = bar
self.detach = detach
self.profile = profile
self.memory_ratio = memory_ratio
self.n_gpu_checks = n_gpu_checks
self.gpu_check_delay = gpu_check_delay
self.create_id_prefix = create_id_prefix
if redirect_stdout is True and not dump_results:
redirect_stdout = 2
if redirect_stderr is True and not dump_results:
redirect_stderr = 2
self.redirect_stdout = redirect_stdout
self.redirect_stderr = redirect_stderr
if debug and (parallel or executor_target not in ['f', 'for']):
raise ValueError("`debug` can be True only with `parallel=False` and `executor_target='for'`")
if not dump_results and redirect_stdout in (1, 3):
raise ValueError("`redirect_stdout` can be 0 or 2 only when `dump_results` is False")
if not dump_results and redirect_stderr in (1, 3):
raise ValueError("`redirect_stderr` can be 0 or 2 only when `dump_results` is False")
self.debug = debug
self.finalize = finalize
self.random_seed = make_seed_sequence(seed)
if n_iters is None and self.experiment.only_callables:
self.n_iters = 1
else:
self.n_iters = n_iters
self.domain.set_iter_params(n_items=self.n_configs, n_reps=self.n_reps, repeat_each=self.repeat_each,
create_id_prefix=self.create_id_prefix, seed=self.random_seed)
if self.domain.size is None and (self.domain.update_func is None or self.domain.update_each == 'last'):
warnings.warn("Research will be infinite because has infinite domain and hasn't domain updating",
stacklevel=2)
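        # `dump_results` doubles as a storage specification: bool maps to 'local'/'memory', str selects a backend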
self.storage = self.dump_results
if isinstance(self.storage, bool):
self.storage = 'local' if self.storage else 'memory'
if isinstance(self.storage, str):
self.storage = BaseResearchStorage(self, self.loglevel, storage=self.storage)
if not isinstance(self.storage, MemoryResearchStorage):
self.experiment = self.experiment.dump() # add final dump of experiment results
self.logger = self.storage.logger
if git_meta:
self.attach_git_meta()
if env_meta:
self.attach_env_meta()
self.storage.collect_env_state(self._env_meta_to_collect)
n_branches = self.branches if isinstance(self.branches, int) else len(self.branches)
self.tasks_queue = DynamicQueue(self.domain, self, n_branches)
self.distributor = Distributor(self.tasks_queue, self)
self.monitor = ResearchMonitor(self, bar=self.bar) # process execution signals
def _start_distributor():
self.distributor.run()
self.monitor.stop()
self.logger.info("Research is starting")
self.monitor.start()
if self.parallel:
try:
self.process = mp.Process(target=_start_distributor)
self.process.start()
self.logger.info(f"Create separate research process [pid:{self.process.pid}]")
self.monitor.detach(self.process)
if not detach:
self.process.join()
self.terminate()
except KeyboardInterrupt as e:
self.logger.info("Research has been stopped by KeyboardInterrupt.")
self.terminate(force=True, wait=False)
raise e
else:
if detach:
warnings.warn("detach can't be enabled when parallel=False")
_start_distributor()
self.terminate()
return self
    @property
    def results(self):
        """ Research results stored in `self.storage`. """
        return self.storage.results
    @property
    def profiler(self):
        """ Research profiler stored in `self.storage`. """
        return self.storage.profiler
    def terminate(self, kill_processes=False, force=False, wait=True):
""" Kill all research processes. """
if not self.is_loaded:
if not force and self.monitor and self.monitor.in_progress:
answer = input(f'{self.name} is in progress. Are you sure? [y/n]').lower()
answer = len(answer) > 0 and 'yes'.startswith(answer)
else:
answer = True
if force or answer:
if self.logger:
self.logger.info("Stop research.")
if self.monitor:
self.monitor.stop(wait=wait)
self.monitor.close()
if self.storage:
self.storage.close()
if self.detach:
kill_processes = True
if kill_processes and self.monitor:
if self.logger:
self.logger.info("Terminate research processes")
order = {'EXECUTOR': 1, 'WORKER': 2, 'DETACHED_PROCESS': 3, 'MONITOR': 4}
processes_to_kill = sorted(self.monitor.processes.items(), key=lambda x: order[x[1]])
for pid, process_type in processes_to_kill:
if pid is not None and psutil.pid_exists(pid):
process = psutil.Process(pid)
process.terminate()
if self.logger:
self.logger.info(f"Terminate {process_type} [pid:{pid}]")
@property
def is_finished(self):
""" Whether all tasks are completed or not. """
return self.monitor.in_queue + self.monitor.remained_experiments == 0
def __str__(self):
spacing = ' ' * 4
repr = ''
params = ['name', 'workers', 'branches', 'n_iters', 'devices', 'dump_results',
'parallel', 'loglevel', 'executor_target', 'executor_class']
params_repr = []
for param in params:
params_repr += [f"{param}: {getattr(self, param, None)}"]
params_repr = '\n'.join(params_repr)
items = {'params': params_repr, 'experiment': str(self.experiment), 'domain': str(self.domain)}
for name, items_ in items.items():
repr += f"{name}:\n"
repr += '\n'.join([spacing + item for item in str(items_).split('\n')])
repr += 2 * '\n'
return repr
    @classmethod
def load(cls, name):
""" Load research. """
storage = LocalResearchStorage(name, loglevel='info', mode='r')
return storage.research
    @classmethod
    def remove(cls, name, ask=True, force=False):
        """ Remove the research folder with all stored results. """
        LocalResearchStorage.remove(name, ask, force)
def __del__(self):
self.terminate(force=True)
class ResearchMonitor:
#pylint:disable=attribute-defined-outside-init
""" Class to get signals from experiment and other objects and store all states.
Parameters
----------
research : Research
Research object
path : str, optional
path to save signals, by default None
bar : bool or class
use progress bar or not.
"""
COLUMNS = ['time', 'task_idx', 'id', 'it', 'name', 'status', 'exception', 'worker', 'pid', 'worker_pid',
'process_pid', 'finished', 'withdrawn', 'remains']
SHARED_VARIABLES = ['finished_experiments', 'finished_iterations', 'remained_experiments',
'generated_experiments', 'stopped']
def __init__(self, research, bar=True):
self.queue = mp.JoinableQueue()
self.stop_signal = mp.JoinableQueue()
self._manager = mp.Manager()
self.exceptions = self._manager.list()
self.shared_values = self._manager.dict()
self.current_iterations = self._manager.dict()
self.processes = self._manager.dict({self._manager._process.pid: "MANAGER"})
self.research = research
self.bar = tqdm.tqdm(disable=(not bar), position=0, leave=True) if isinstance(bar, bool) else bar
for key in self.SHARED_VARIABLES:
self.shared_values[key] = 0
self.n_iters = self.research.n_iters
self.dump = False
self.process = None
self.stopped = True
def __getattr__(self, key):
if key in self.SHARED_VARIABLES:
return self.shared_values[key]
raise AttributeError(f'Unknown attribute: {key}')
def __setattr__(self, key, value):
if key in self.SHARED_VARIABLES:
self.shared_values[key] = value
else:
super().__setattr__(key, value)
@property
def total(self):
""" Total number of iterations or experiments in the current moment. It changes after domain updates. """
if self.n_iters:
return self.finished_iterations + self.n_iters * (self.in_queue + self.remained_experiments)
return self.finished_experiments + self.in_queue + self.remained_experiments
@property
def n(self):
""" Current iteration. """
if self.n_iters:
return self.finished_iterations + sum(self.current_iterations.values())
return self.finished_experiments + len(self.current_iterations)
@property
def in_progress(self):
""" The number of experiments in progress. """
return len(self.current_iterations)
@property
def in_queue(self):
""" The number of experiments in queue of tasks. """
return self.generated_experiments - self.finished_experiments
def detach(self, process):
self.processes[process.pid] = 'DETACHED_PROCESS'
def start_worker(self, worker):
self.processes[worker.pid] = 'WORKER'
def start_experiment(self, experiment):
"""" Signal when experiment starts. """
self.processes[experiment.executor.pid] = 'EXECUTOR'
self.current_iterations[experiment.id] = 0
def tasks_info(self, generated, remains):
self.generated_experiments = generated
self.remained_experiments = remains
def stop_experiment(self, experiment):
"""" Signal when experiment stops. """
self.current_iterations.pop(experiment.id)
self.finished_iterations += experiment.iteration + 1
self.finished_experiments += 1
def execute_iteration(self, experiment):
"""" Signal for iteration execution. """
self.current_iterations[experiment.id] = experiment.iteration
def fail_item_execution(self, name, experiment, msg):
"""" Signal for iteration execution fail. """
self.exceptions.append({
'id': experiment.id,
'pid': experiment.executor.pid,
'name': name,
'it': experiment.iteration,
'exception': msg
})
def fail_worker_execution(self, worker, msg):
self.exceptions.append({
'index': worker.index,
'pid': worker.pid,
'exception': msg
})
def handler(self):
""" Signals handler. """
with self.bar as progress:
last_update = False
exceptions = 0
while True:
if (progress.n != self.n) or (progress.total != self.total) or (len(self.exceptions) != exceptions):
if len(self.exceptions) != exceptions:
exceptions = len(self.exceptions)
progress.set_description_str(f"Exceptions: {exceptions}")
progress.n = self.n
progress.total = self.total
progress.refresh()
if last_update:
break
time.sleep(0.01)
if not self.queue.empty():
last_update = True
self.stop_signal.put(None)
def start(self):
""" Start handler. """
if self.stopped:
self.process = mp.Process(target=self.handler)
self.process.start()
self.processes[self.process.pid] = 'MONITOR'
self.stopped = False
def stop(self, wait=True):
""" Stop handler. """
if not self.stopped:
self.queue.put(None)
if wait:
self.stop_signal.get()
self.stopped = True
tqdm.tqdm._instances.clear() #pylint:disable=protected-access
def close(self):
""" Close manager. """
self.exceptions = list(self.exceptions)
self.shared_values = dict(self.shared_values)
self.current_iterations = dict(self.current_iterations)
self.processes = dict(self.processes)
self._manager.shutdown()