# formatting.py

from datetime import timedelta, datetime
from typing import *

import numpy as np
import pprint
from terminaltables import AsciiTable

from .config import Config, config_to_dict
from .metrics import MetricStats
from .utils import NOT_SET
from .typing_ import *


# Public names of this module, i.e. what ``from <module> import *`` exports:
# the three stand-alone formatting functions and the metrics formatter class.
__all__ = [
    'format_key_values', 'format_duration', 'format_as_asctime',
    'MetricsFormatter',
]


def format_key_values(key_values: Union[Dict,
                                        Config,
                                        Iterable[Tuple[str, Any]]],
                      title: Optional[str] = None,
                      formatter: Callable[[Any], str] = pprint.pformat,
                      delimiter_char: str = '=',
                      sort_keys: bool = False
                      ) -> str:
    """
    Render key/value pairs as a borderless two-column text table.

    A :class:`Config`, a dict and a list of tuples are all accepted:

    >>> print(format_key_values(Config(a=123, b=Config(value=456))))
    a         123
    b.value   456
    >>> print(format_key_values({'a': 123, 'b': {'value': 456}}))
    a   123
    b   {'value': 456}
    >>> print(format_key_values([('a', 123), ('b', {'value': 456})]))
    a   123
    b   {'value': 456}

    With a title, a delimiter line is inserted below it.  The delimiter is
    as wide as the longest line (title included):

    >>> print(format_key_values(Config(a=123, b=Config(value=456)),
    ...                         title='short title'))
    short title
    =============
    a         123
    b.value   456
    >>> print(format_key_values({'a': 123, 'b': {'value': 456}},
    ...                         title='long long long title'))
    long long long title
    ====================
    a   123
    b   {'value': 456}

    Args:
        key_values: The key/value pairs: a :class:`Config`, any object with
            an ``items()`` method (e.g., a dict), or an iterable of
            (key, value) pairs.  A :class:`Config` is first converted by
            ``config_to_dict(key_values, flatten=True)``.
        title: Optional title.  When given, the title and a delimiter line
            are prepended to the table.
        formatter: Function that turns each value into its str form.
        delimiter_char: The single character repeated to build the
            delimiter line under the title.
        sort_keys: Whether to order the rows by key.

    Returns:
        The formatted str.

    Raises:
        ValueError: If `delimiter_char` is not exactly one character.
    """
    if len(delimiter_char) != 1:
        raise ValueError(f'`delimiter_char` must be one character: '
                         f'got {delimiter_char!r}')

    # a Config is turned into a flattened dict before anything else
    if isinstance(key_values, Config):
        key_values = config_to_dict(key_values, flatten=True)

    # mapping-like objects provide the pairs via `items()`; anything else is
    # assumed to already be an iterable of (key, value) pairs
    pairs = key_values.items() if hasattr(key_values, 'items') else key_values
    rows = [(name, formatter(item)) for name, item in pairs]
    if sort_keys:
        rows = sorted(rows, key=lambda row: row[0])

    # let terminaltables align the columns, with every border switched off
    # and three spaces of padding to the right of each cell
    table = AsciiTable(rows)
    table.padding_left = 0
    table.padding_right = 3
    for border_flag in ('inner_column_border',
                        'inner_footing_row_border',
                        'inner_heading_row_border',
                        'inner_row_border',
                        'outer_border'):
        setattr(table, border_flag, False)

    # the right padding leaves trailing blanks on each line; drop them
    body = [text.rstrip() for text in table.table.split('\n')]
    if title is None:
        return '\n'.join(body)

    # title + delimiter spanning the widest line + the table itself
    width = max(max(len(text) for text in body), len(title))
    return '\n'.join([title, delimiter_char * width] + body)


def format_duration(duration: Union[float, int, timedelta],
                    precision: int = 0,
                    count_down: bool = False) -> str:
    """
    Turn a time duration into a short human readable str.

    >>> format_duration(0)
    '0s'
    >>> format_duration(0, count_down=True)
    '0s'
    >>> format_duration(-1)
    '1s ago'
    >>> format_duration(-1, count_down=True)
    '1s ago'
    >>> format_duration(0.01, precision=2)
    '0.01s'
    >>> format_duration(0.01, precision=2, count_down=True)
    '0.01s'
    >>> format_duration(1.00, precision=2)
    '1s'
    >>> format_duration(1.00, precision=2, count_down=True)
    '1s'
    >>> format_duration(1.125)
    '1s'
    >>> format_duration(1.125, count_down=True)
    '1s'
    >>> format_duration(1.1251, precision=2)
    '1.13s'
    >>> format_duration(1.1251, precision=2, count_down=True)
    '1.13s'
    >>> format_duration(1.51)
    '2s'
    >>> format_duration(1.51, count_down=True)
    '2s'
    >>> format_duration(10)
    '10s'
    >>> format_duration(59.99, precision=2)
    '59.99s'
    >>> format_duration(59.99, precision=2, count_down=True)
    '59.99s'
    >>> format_duration(59.99)
    '1m'
    >>> format_duration(59.99, count_down=True)
    '1:00'
    >>> format_duration(60)
    '1m'
    >>> format_duration(60, count_down=True)
    '1:00'
    >>> format_duration(61)
    '1m 1s'
    >>> format_duration(61, count_down=True)
    '1:01'
    >>> format_duration(3600)
    '1h'
    >>> format_duration(3600, count_down=True)
    '1:00:00'
    >>> format_duration(86400)
    '1d'
    >>> format_duration(86400, count_down=True)
    '1d 00:00:00'
    >>> format_duration(86400 + 7200 + 180 + 4)
    '1d 2h 3m 4s'
    >>> format_duration(86400 + 7200 + 180 + 4, count_down=True)
    '1d 02:03:04'
    >>> format_duration(timedelta(days=1, hours=2, minutes=3, seconds=4))
    '1d 2h 3m 4s'
    >>> format_duration(timedelta(days=1, hours=2, minutes=3, seconds=4),
    ...                 count_down=True)
    '1d 02:03:04'

    Args:
        duration: A number of seconds, or a :class:`timedelta` (converted
            with ``total_seconds()``).  A negative duration is formatted by
            its absolute value, followed by `" ago"`.
        precision: Number of fractional digits the duration is rounded to
            before formatting; trailing zeros are not printed.
        count_down: If True, use the "count-down" layout "__:__:__" for the
            hours, minutes and seconds instead of "__h __m __s".

    Returns:
        The formatted text.
    """
    if isinstance(duration, timedelta):
        duration = duration.total_seconds()
    negative = duration < 0
    remaining = round(abs(duration), precision)

    def render_count_down(secs, with_days):
        # "__:__:__.__" layout: peel off the hours, then the minutes
        hours = int(secs // 3600)
        secs = secs - hours * 3600
        minutes = int(secs // 60)
        secs = secs - minutes * 60

        # Leading zero-valued fields are omitted unless a days part precedes
        # them; the first printed field is unpadded, later ones use 2 digits.
        fields = []
        for amount in (hours, minutes):
            if fields or with_days:
                fields.append(f'{amount:02d}')
            elif amount != 0:
                fields.append(str(amount))

        # split the seconds into the integer and the fractional part
        whole = int(secs)
        fraction = secs - whole
        whole_text = f'{whole:02d}' if fields else str(whole)
        # "0.25" -> ".25"; "0.00" -> "." -> nothing at all
        fraction_text = f'{fraction:.{precision}f}'.strip('0')
        if fraction_text == '.':
            fraction_text = ''
        # only a bare seconds value carries the "s" unit
        unit = '' if fields else 's'
        fields.append(f'{whole_text}{fraction_text}{unit}')
        return ':'.join(fields)

    def render_units(secs, with_days):
        # "__h __m __s" layout: zero-valued units are left out
        pieces = []
        for size, label in ((3600, 'h'), (60, 'm')):
            if secs >= size:
                count = int(secs // size)
                secs -= count * size
                pieces.append(f'{count}{label}')
        if secs > 1e-8:
            text = f'{secs:.{precision}f}'
            if '.' in text:
                text = text.rstrip('0').rstrip('.')
            pieces.append(f'{text}s')

        # without a days part an empty result would print nothing
        if not pieces and not with_days:
            pieces.append('0s')
        return ' '.join(pieces)

    render = render_count_down if count_down else render_units

    if remaining < 86400:
        # shorter than one day: only the time part
        text = render(remaining, False)
    else:
        # one day or more: the days, then the (possibly empty) time part
        days = int(remaining // 86400)
        time_text = render(remaining - days * 86400, True)
        text = f'{days}d {time_text}' if time_text else f'{days}d'

    return f'{text} ago' if negative else text


def format_as_asctime(dt: datetime,
                      datetime_format: str = '%Y-%m-%d %H:%M:%S',
                      msec_format: str = '%03d',
                      datetime_msec_sep: str = ',') -> str:
    """
    Format datetime `dt` using the `asctime` format of the logging module.

    >>> format_as_asctime(datetime.utcfromtimestamp(1576755571.662434))
    '2019-12-19 11:39:31,662'
    >>> format_as_asctime(datetime.utcfromtimestamp(1576755571.662434),
    ...                   datetime_format='%Y-%m-%d_%H-%M-%S',
    ...                   datetime_msec_sep='_')
    '2019-12-19_11-39-31_662'

    The milliseconds part never exceeds 999, even if the microseconds
    would round up to a full second:

    >>> format_as_asctime(datetime(2019, 12, 19, 11, 39, 31, 999600))
    '2019-12-19 11:39:31,999'

    Args:
        dt: The datetime object.
        datetime_format: The format str for the datetime part (i.e.,
            year, month, day, hour, minute, second), passed to
            ``dt.strftime``.
        msec_format: The ``%``-style format str for the milliseconds part;
            it receives the milliseconds as a single int.
        datetime_msec_sep: The separator between the datetime str and the
            milliseconds str.

    Returns:
        The formatted datetime and milliseconds.
    """
    # Round the microseconds to the nearest millisecond, but clamp at 999:
    # for `microsecond >= 999500` plain rounding gives 1000, which would be
    # printed as a bogus four-digit field (e.g. "11:39:31,1000") because the
    # seconds part of the output is not carried over.
    msec = min(int(round(dt.microsecond / 1000)), 999)
    datetime_str = dt.strftime(datetime_format)
    msec_str = msec_format % msec
    return f'{datetime_str}{datetime_msec_sep}{msec_str}'


class MetricsFormatter(object):
    """
    Class to sort and format metric statistics into string.

    >>> fmt = MetricsFormatter()
    >>> fmt.format({
    ...     'val_loss': 1.25,
    ...     'train_loss': {'mean': 1.333333333333, 'std': 0.6666666666667},
    ...     'train_time': MetricStats(mean=2, var=2, std=1.4142135623730951),
    ... })
    'train_loss: 1.33333 (±0.666667); val_loss: 1.25; train_time: 2s (±1.414s)'
    """

    SEPARATORS: Tuple[str, str] = (': ', '; ')
    """
    Default separators, where the first is the separator between the
    name of a metric and its value, and the second is the separator between
    different metrics.
    """

    # Sort ranks used by `_metric_sort_key`.  They are kept at class level
    # so that the tables are built once rather than on every key lookup.
    # Prefixes / suffixes that are not listed get the rank 0.
    _PREFIX_ORDER = {'train': 0, 'val': 1, 'valid': 2, 'test': 3,
                     'pred': 4, 'predict': 5, 'epoch': 6, 'batch': 7}
    _SUFFIX_ORDER = {'time': 9998, 'timer': 9999}

    def _metric_sort_key(self, name):
        """
        Get the sort key of a metric name.

        The key orders the metrics first by the rank of the last
        "_"-separated part of the name (thus "..._time" and "..._timer"
        are placed after all other metrics), then by the rank of the first
        part (e.g., "train" before "val" before "test"), and finally by
        the name itself.
        """
        parts = name.split('_')
        return (self._SUFFIX_ORDER.get(parts[-1], 0),
                self._PREFIX_ORDER.get(parts[0], 0),
                name)

    def _format_value(self, name: str, val: Any) -> str:
        """
        Format a single value of the metric `name`.

        A scalar (i.e., ``np.shape(val) == ()``) is formatted as a duration
        with 3 digits of precision if the lower-cased name ends with
        "_time" or "_timer" (or is just "time" / "timer"), and as a float
        with 6 significant digits otherwise.  Any non-scalar value is
        formatted by :func:`str`.
        """
        if np.shape(val) == ():
            name_suffix = name.lower().rsplit('_', 1)[-1]
            if name_suffix in ('time', 'timer'):
                return format_duration(val, precision=3)
            else:
                return f'{float(val):.6g}'
        else:
            return str(val)

    def sorted_names(self, names: Iterable[str]) -> List[str]:
        """
        Sort the metric names.

        Args:
            names: The metric names.

        Returns:
            The sorted metric names.
        """
        return sorted(names, key=self._metric_sort_key)

    def format_metric(self, name: str, val: Any, sep: str = NOT_SET) -> str:
        """
        Format a named metric.

        >>> fmt = MetricsFormatter()
        >>> fmt.format_metric('loss', 1.25, ': ')
        'loss: 1.25'
        >>> fmt.format_metric('acc', {'mean': 0.875, 'std': 0.125}, ' = ')
        'acc = 0.875 (±0.125)'
        >>> fmt.format_metric('epoch_time', MetricStats(mean=2.5, std=1, var=1))
        'epoch_time: 2.5s (±1s)'
        >>> fmt.format_metric('value', {'mean': np.array([1, 2]), 'std': None})
        'value: [1 2]'

        Args:
            name: Name of the metric.
            val: Value of the metric, may be a number, a mapping of
                ``{'mean': ..., 'std': ...}`` (where "std" is optional),
                or an instance of :class:`MetricStats`.
            sep: The separator between the name and the value.
                If not specified, use ``self.SEPARATORS[0]``.

        Returns:
            The formatted metric.
        """
        if sep is NOT_SET:
            sep = self.SEPARATORS[0]

        # if `val` is a mapping with "mean" and (optionally) "std" only;
        # any `Mapping` is accepted here rather than just `dict`, in order
        # to agree with the value types declared by `format()`
        if isinstance(val, Mapping) and 'mean' in val and \
                (len(val) == 1 or (len(val) == 2 and 'std' in val)):
            mean, std = val['mean'], val.get('std')
        # elif `val` is a MetricStats object
        elif isinstance(val, MetricStats):
            mean, std = val.mean, val.std
        # else we treat `val` as a simple value
        else:
            mean, std = val, None

        # format the value part; the "(±std)" part is omitted without a std
        val_str = self._format_value(name, mean)
        if std is not None:
            val_str = f'{val_str} (±{self._format_value(name, std)})'

        # now construct the final str
        return f'{name}{sep}{val_str}'

    def format(self,
               metrics: Mapping[str, Union[MetricValue,
                                           Mapping[str, MetricValue],
                                           MetricStats]],
               known_names: Optional[Sequence[str]] = None,
               sep: Tuple[str, str] = NOT_SET) -> str:
        """
        Format the given metrics.

        >>> fmt = MetricsFormatter()
        >>> fmt.format({
        ...     'acc': 0.75, 'loss': {'mean': 0.875, 'std': 0.125},
        ...     'train_time': 1.5
        ... }, known_names=['loss'], sep=(' = ', ' | '))
        'loss = 0.875 (±0.125) | acc = 0.75 | train_time = 1.5s'

        Args:
            metrics: A dict of metric values or statistics.
            known_names: Known metric names.  These metrics will be placed
                in front of other unknown metrics, in the given order.
                Known names that are absent from `metrics` are skipped.
            sep: The first str is the separator between a name of metric
                and its value, while the second str is the separator
                between different metrics.  Defaults to ``self.SEPARATORS``.

        Returns:
            The formatted metrics.
        """
        if sep is NOT_SET:
            sep = self.SEPARATORS
        name_val_sep, metrics_sep = sep

        # `known_names` is needed twice (for the order and for the lookup),
        # so materialize it once; this also keeps one-shot iterables working
        known = tuple(known_names or ())

        # the metrics with known names come first (preserving their order)
        ordered = [name for name in known if name in metrics]

        # then the metrics with unknown names (sorted by `sorted_names`)
        known_set = set(known)
        ordered.extend(name for name in self.sorted_names(metrics)
                       if name not in known_set)

        return metrics_sep.join(
            self.format_metric(name, metrics[name], name_val_sep)
            for name in ordered
        )