# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Watchpoints."""
from abc import ABC
from enum import Enum

from mindinsight.debugger.api.debugger_tensor import DebuggerTensor
from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError, \
    DebuggerParamTypeError
from mindinsight.debugger.common.utils import validate_type
from mindinsight.debugger.conditionmgr.condition import ParamNameEnum


class ConditionBase(ABC):
    """
    Base class for conditions.

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Note:
        - If multiple checking parameters are specified for one condition instance,
          the WatchpointHit contains only the parameters that the tensor actually
          triggered for the watchpoint.

    Examples:
        >>> from mindinsight.debugger import DumpAnalyzer
        >>> from mindinsight.debugger import (TensorTooLargeCondition,
        ...                                   Watchpoint)
        >>>
        >>> def test_condition_base():
        ...     my_run = DumpAnalyzer(dump_dir="/path/to/your/dump_dir_with_dump_data")
        ...     tensors = my_run.select_tensors(query_string="Conv2D-op13")
        ...     watchpoint = Watchpoint(tensors=tensors,
        ...                             condition=TensorTooLargeCondition(abs_mean_gt=0.0,
        ...                                                               max_gt=0.0))
        ...     hit = list(my_run.check_watchpoints(watchpoints=[watchpoint]))[0]
        ...     # print(hit.get_hit_detail())
        ...     # the print result is as follows
        ...     # The setting for watchpoint is abs_mean_gt = 0.0, max_gt = 0.0.
        ...     # The actual value of the tensor is abs_mean_gt = 0.06592023578438996, max_gt = 0.449951171875.
        ...     watchpoint = Watchpoint(tensors=tensors,
        ...                             condition=TensorTooLargeCondition(abs_mean_gt=0.0,
        ...                                                               max_gt=1.0))
        ...     # check_watchpoints starts a new process, so it needs to be called
        ...     # through the main entry
        ...     hit = list(my_run.check_watchpoints(watchpoints=[watchpoint]))[0]
        ...     # print(hit.get_hit_detail())
        ...     # the print result is as follows
        ...     # The setting for watchpoint is abs_mean_gt = 0.0.
        ...     # The actual value of the tensor is abs_mean_gt = 0.06592023578438996.
        ...
        >>> if __name__ == "__main__":
        ...     test_condition_base()
        ...
    """

    @property
    def name(self):
        """
        Get the name of the condition.

        Returns:
            str, the name of the condition.
        """
        raise NotImplementedError

    @property
    def condition_id(self):
        """
        Get the condition ID.

        Returns:
            int, the ID of the condition.
        """
        raise NotImplementedError

    @property
    def param_dict(self):
        """
        Get the parameter dict.

        Returns:
            dict, the parameter dict of the condition.
        """
        return {}

    def __str__(self):
        return str(self.param_dict)

class WatchpointHit(ABC):
    """
    Watchpoint hit.

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Note:
        - This class is not meant to be instantiated by the user.
        - The instances of this class are immutable.

    Examples:
        >>> from mindinsight.debugger import DumpAnalyzer
        >>> from mindinsight.debugger import TensorTooLargeCondition, Watchpoint
        >>>
        >>> def test_watch_point_hit():
        ...     my_run = DumpAnalyzer(dump_dir="/path/to/your/dump_dir_with_dump_data")
        ...     tensor_list = my_run.select_tensors(
        ...         query_string="Conv",
        ...         use_regex=True,
        ...         iterations=[0],
        ...         ranks=[0],
        ...         slots=[0]
        ...     )
        ...     watchpoint = Watchpoint(tensors=tensor_list,
        ...                             condition=TensorTooLargeCondition(abs_mean_gt=0.0))
        ...     # check_watchpoints starts a new process, so it needs to be called
        ...     # through the main entry
        ...     hits = my_run.check_watchpoints(watchpoints=[watchpoint])
        ...     hit = list(hits)[0]
        ...     # print(str(hit))
        ...     # the print result is as follows
        ...     # Watchpoint TensorTooLarge triggered on tensor:
        ...     # rank: 0
        ...     # graph_name: kernel_graph_0
        ...     # node_name: Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Cast-op7
        ...     # slot: 0
        ...     # iteration: 0
        ...     # Threshold: {'abs_mean_gt': 0.0}
        ...     # Hit detail: The setting for watchpoint is abs_mean_gt = 0.0.
        ...     # The actual value of the tensor is abs_mean_gt = 0.007956420533235841.
        ...     # print(hit.error_code)
        ...     # the print result is as follows
        ...     # 0
        ...     # print(hit.tensor)
        ...     # the print result is as follows
        ...     # rank: 0
        ...     # graph_name: kernel_graph_0
        ...     # node_name: Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Cast-op7
        ...     # slot: 0
        ...     # iteration: 0
        ...     # print(hit.get_hit_detail())
        ...     # the print result is as follows
        ...     # The setting for watchpoint is abs_mean_gt = 0.0.
        ...     # The actual value of the tensor is abs_mean_gt = 0.007956420533235841.
        ...
        >>> if __name__ == "__main__":
        ...     test_watch_point_hit()
        ...
    """

    @property
    def error_code(self):
        """
        Get the error code for this watchpoint hit, if an error occurred during the check.

        Returns:
            int, the error number.
        """
        raise NotImplementedError

    @property
    def error_msg(self):
        """
        Get the error messages for this watchpoint hit, if an error occurred during the check.

        Returns:
            list[str], the error message list.
        """
        raise NotImplementedError

    @property
    def tensor(self) -> DebuggerTensor:
        """
        Get the tensor for this watchpoint hit.

        Returns:
            DebuggerTensor, the triggered tensor.
        """
        raise NotImplementedError

    def get_threshold(self):
        """
        Get the condition set by the user.

        Returns:
            ConditionBase, the condition with the user-set threshold; see details
            with str(ConditionBase).
        """
        raise NotImplementedError

    def get_hit_detail(self):
        """
        Get the actual values of the checked parameters for this watchpoint hit.

        Returns:
            Union[ConditionBase, None], the condition with the hit detail; see details
            with str(ConditionBase). If error_code is not zero, None is returned.
        """
        raise NotImplementedError


class WatchpointHitImpl(WatchpointHit):
    """
    Watchpoint hit.

    Args:
        tensor (DebuggerTensor): The tensor which hits the watchpoint.
        condition (ConditionBase): The ConditionBase object initialized with
            the user-set values.
        hit_detail (ConditionBase): The ConditionBase object initialized with
            the actual values of the tensor.
        error_code (int): The code describing the error.
    """

    def __init__(self,
                 tensor: DebuggerTensor,
                 condition: ConditionBase,
                 hit_detail: ConditionBase,
                 error_code):
        self._tensor = tensor
        self._condition = condition
        self._error_code = error_code
        self._hit_detail = hit_detail

    @property
    def error_code(self):
        """
        Get the error code for this watchpoint hit, if an error occurred during the check.

        Returns:
            int, the error number.
        """
        return self._error_code

    @property
    def error_msg(self):
        """
        Get the error messages for this watchpoint hit, if an error occurred during the check.

        Returns:
            list[str], the error message list.
        """
        error_code = self._error_code
        # Bit i of error_code selects all_error_list[i], so one hit can carry
        # several error messages.
        all_error_list = [
            "Tensor contains NaN.",
            "Tensor contains +/-INF.",
            "The previous step value cannot be found.",
            "The tensor size exceeds the memory limit.",
            "Graph history file is not available.",
            "Tensor has no value."
        ]
        error_list = []
        for i, error_str in enumerate(all_error_list):
            error = (error_code >> i) & 1
            if error == 1:
                error_list.append(error_str)
        return error_list

    @property
    def tensor(self) -> DebuggerTensor:
        """Get the tensor for this watchpoint hit."""
        return self._tensor

    def get_threshold(self):
        """Get the threshold set by the user."""
        return self._condition

    def get_hit_detail(self):
        """
        Get the actual values of the checked parameters for this watchpoint hit.

        If error_code is not zero or None, None is returned.
        """
        if self._error_code:
            return None
        return self._hit_detail

    def __str__(self):
        if self._error_code:
            msg = f"Watchpoint {self._condition.name} check failed on tensor:\n" \
                  f"{str(self.tensor)}" \
                  f"Threshold: {self.get_threshold()}\n" \
                  f"Error detail: {self.error_msg}"
            return msg
        msg = f"Watchpoint {self._condition.name} triggered on tensor:\n" \
              f"{str(self.tensor)}" \
              f"Threshold: {self.get_threshold()}\n" \
              f"Hit detail: {str(self._hit_detail)}"
        return msg


class HitDetail(ConditionBase):
    """Hit detail."""

    def __init__(self, param_list, condition):
        self._param_list = param_list
        self._condition = condition

    @property
    def name(self):
        """Get the name of the condition."""
        return self._condition.name

    @property
    def condition_id(self):
        """Get the condition ID."""
        return self._condition.condition_id

    @property
    def param_dict(self):
        """Get the parameter list."""
        return self._param_list

    def __str__(self):
        show_actual_value = bool(self._condition.param_dict)
        if self._condition.condition_id == WatchpointConditionId.UNCHANGED_TENSOR.value:
            show_actual_value = False
        # list of the parameters with disabled = False and hit = 1
        hit_param_list = []
        for param in self._param_list:
            if not param.disabled and param.hit:
                hit_param_list.append(param)

        result = ""
        param_size = len(hit_param_list)
        if show_actual_value and hit_param_list:
            setting_detail = "The setting for watchpoint is "
            value_detail = " The actual value of the tensor is "
            for idx, param in enumerate(hit_param_list):
                setting_detail += f"{param.name} = {param.value}"
                value_detail += f"{param.name} = {param.actual_value}"
                if idx == param_size - 1:
                    setting_detail += "."
                    value_detail += "."
                else:
                    setting_detail += ", "
                    value_detail += ", "
            result = setting_detail + value_detail

        if not result:
            result = "None."
        return result

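# Illustrative note on error_code decoding (a sketch, not part of the API): the
# error_code is a bitmask in which bit i selects all_error_list[i] above, so one
# hit can report several errors at once. For example, a hypothetical error_code
# of 33 (0b100001) sets bits 0 and 5:
#
#     >>> error_code = 0b100001
#     >>> [i for i in range(6) if (error_code >> i) & 1]
#     [0, 5]
#
# which error_msg would render as "Tensor contains NaN." and "Tensor has no value."
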
class TensorTooLargeCondition(ConditionBase):
    """
    Tensor too large watchpoint.

    At least one parameter should be specified. When all specified checking
    conditions are satisfied, this watchpoint is hit after a check.

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Args:
        abs_mean_gt (float, optional): The threshold for the mean of the absolute
            value of the tensor. The checking condition is satisfied when the
            actual value is greater than this threshold.
        max_gt (float, optional): The threshold for the maximum of the tensor.
            The checking condition is satisfied when the actual value is greater
            than this threshold.
        min_gt (float, optional): The threshold for the minimum of the tensor.
            The checking condition is satisfied when the actual value is greater
            than this threshold.
        mean_gt (float, optional): The threshold for the mean of the tensor.
            The checking condition is satisfied when the actual value is greater
            than this threshold.

    Examples:
        >>> from mindinsight.debugger import TensorTooLargeCondition
        >>> my_condition = TensorTooLargeCondition(abs_mean_gt=0.0)
        >>> print(my_condition.name)
        TensorTooLarge
    """

    def __init__(self, abs_mean_gt=None, max_gt=None, min_gt=None, mean_gt=None):
        self._abs_mean_gt = abs_mean_gt
        self._max_gt = max_gt
        self._min_gt = min_gt
        self._mean_gt = mean_gt
        self._param_dict = self._get_param_dict()

    @property
    def name(self):
        return "TensorTooLarge"

    @property
    def condition_id(self):
        return WatchpointConditionId.TENSOR_TOO_LARGE.value

    @property
    def param_dict(self):
        return self._param_dict

    def _get_param_dict(self):
        """Get the normalized param dict."""
        param_dict = {}
        if self._abs_mean_gt is not None:
            validate_type(self._abs_mean_gt, 'abs_mean_gt', [int, float], 'float')
            param_dict[ParamNameEnum.ABS_MEAN_GT.value] = float(self._abs_mean_gt)
        if self._max_gt is not None:
            validate_type(self._max_gt, 'max_gt', [int, float], 'float')
            param_dict[ParamNameEnum.MAX_GT.value] = float(self._max_gt)
        if self._min_gt is not None:
            validate_type(self._min_gt, 'min_gt', [int, float], 'float')
            param_dict[ParamNameEnum.MIN_GT.value] = float(self._min_gt)
        if self._mean_gt is not None:
            validate_type(self._mean_gt, 'mean_gt', [int, float], 'float')
            param_dict[ParamNameEnum.MEAN_GT.value] = float(self._mean_gt)
        if not param_dict:
            msg = "Please specify at least one of the parameters for TensorTooLargeCondition."
            raise DebuggerParamValueError(msg)
        return param_dict

    @property
    def param_names(self):
        """
        Return the list of parameter names.

        Returns:
            list[str], the parameter names.
        """
        names = [
            ParamNameEnum.ABS_MEAN_GT.value,
            ParamNameEnum.MAX_GT.value,
            ParamNameEnum.MIN_GT.value,
            ParamNameEnum.MEAN_GT.value
        ]
        return names

class TensorTooSmallCondition(ConditionBase):
    """
    Tensor too small watchpoint.

    At least one parameter should be specified. When all specified checking
    conditions are satisfied, this watchpoint is hit after a check.

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Args:
        abs_mean_lt (float, optional): The threshold for the mean of the absolute
            value of the tensor. The checking condition is satisfied when the
            actual value is less than this threshold.
        max_lt (float, optional): The threshold for the maximum of the tensor.
            The checking condition is satisfied when the actual value is less
            than this threshold.
        min_lt (float, optional): The threshold for the minimum of the tensor.
            The checking condition is satisfied when the actual value is less
            than this threshold.
        mean_lt (float, optional): The threshold for the mean of the tensor.
            The checking condition is satisfied when the actual value is less
            than this threshold.

    Examples:
        >>> from mindinsight.debugger import TensorTooSmallCondition
        >>> my_condition = TensorTooSmallCondition(abs_mean_lt=0.2)
        >>> print(my_condition.name)
        TensorTooSmall
    """

    def __init__(self, abs_mean_lt=None, max_lt=None, min_lt=None, mean_lt=None):
        self._abs_mean_lt = abs_mean_lt
        self._max_lt = max_lt
        self._min_lt = min_lt
        self._mean_lt = mean_lt
        self._param_dict = self._get_param_dict()

    @property
    def name(self):
        return "TensorTooSmall"

    @property
    def condition_id(self):
        return WatchpointConditionId.TENSOR_TOO_SMALL.value

    @property
    def param_dict(self):
        return self._param_dict

    def _get_param_dict(self):
        """Get the normalized param dict."""
        param_dict = {}
        if self._abs_mean_lt is not None:
            validate_type(self._abs_mean_lt, 'abs_mean_lt', [int, float], 'float')
            param_dict[ParamNameEnum.ABS_MEAN_LT.value] = float(self._abs_mean_lt)
        if self._max_lt is not None:
            validate_type(self._max_lt, 'max_lt', [int, float], 'float')
            param_dict[ParamNameEnum.MAX_LT.value] = float(self._max_lt)
        if self._min_lt is not None:
            validate_type(self._min_lt, 'min_lt', [int, float], 'float')
            param_dict[ParamNameEnum.MIN_LT.value] = float(self._min_lt)
        if self._mean_lt is not None:
            validate_type(self._mean_lt, 'mean_lt', [int, float], 'float')
            param_dict[ParamNameEnum.MEAN_LT.value] = float(self._mean_lt)
        if not param_dict:
            msg = "Please specify at least one of the parameters for TensorTooSmallCondition."
            raise DebuggerParamValueError(msg)
        return param_dict

    @property
    def param_names(self):
        """
        Return the list of parameter names.

        Returns:
            list[str], the parameter names.
        """
        names = [
            ParamNameEnum.ABS_MEAN_LT.value,
            ParamNameEnum.MAX_LT.value,
            ParamNameEnum.MIN_LT.value,
            ParamNameEnum.MEAN_LT.value
        ]
        return names

class TensorRangeCondition(ConditionBase):
    """
    Tensor range watchpoint.

    Set a threshold to check the tensor value range. There are four options:
    range_percentage_lt, range_percentage_gt, max_min_lt and max_min_gt. At least
    one of the four options should be specified. If the threshold is set to one of
    the first two options, both range_start_inclusive and range_end_inclusive must
    be set. When all specified checking conditions are satisfied, this watchpoint
    is hit after a check.

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Args:
        range_percentage_lt (float, optional): The threshold for the percentage of
            the tensor in the range. The checking condition is satisfied when the
            percentage of the tensor in the specified range is less than this value.
        range_percentage_gt (float, optional): The threshold for the percentage of
            the tensor in the range. The checking condition is satisfied when the
            percentage of the tensor in the specified range is greater than this value.
        max_min_lt (float, optional): The threshold for the difference between the
            maximum and the minimum of the tensor. The checking condition is
            satisfied when the difference is less than this value.
        max_min_gt (float, optional): The threshold for the difference between the
            maximum and the minimum of the tensor. The checking condition is
            satisfied when the difference is greater than this value.
        range_start_inclusive (float, optional): The start of the range.
        range_end_inclusive (float, optional): The end of the range.

    Examples:
        >>> from mindinsight.debugger import TensorRangeCondition
        >>> my_condition = TensorRangeCondition(max_min_gt=0.05)
        >>> print(my_condition.name)
        TensorRange
    """

    def __init__(self,
                 range_start_inclusive=None,
                 range_end_inclusive=None,
                 range_percentage_lt=None,
                 range_percentage_gt=None,
                 max_min_lt=None,
                 max_min_gt=None):
        self._range_start_inclusive = range_start_inclusive
        self._range_end_inclusive = range_end_inclusive
        self._range_percentage_lt = range_percentage_lt
        self._range_percentage_gt = range_percentage_gt
        self._max_min_lt = max_min_lt
        self._max_min_gt = max_min_gt
        self._param_dict = self._get_param_dict()

    @property
    def name(self):
        return "TensorRange"

    @property
    def condition_id(self):
        return WatchpointConditionId.TENSOR_RANGE.value

    @property
    def param_dict(self):
        return self._param_dict

    def _get_param_dict(self):
        """Get the normalized param dict."""
        param_dict = {}
        if self._range_start_inclusive is not None:
            validate_type(self._range_start_inclusive, 'range_start_inclusive', [int, float], 'float')
            param_dict[ParamNameEnum.RANGE_START_INCLUSIVE.value] = float(self._range_start_inclusive)
        if self._range_end_inclusive is not None:
            validate_type(self._range_end_inclusive, 'range_end_inclusive', [int, float], 'float')
            param_dict[ParamNameEnum.RANGE_END_INCLUSIVE.value] = float(self._range_end_inclusive)
        if self._range_percentage_lt is not None:
            validate_type(self._range_percentage_lt, 'range_percentage_lt', [int, float], 'float')
            param_dict[ParamNameEnum.RANGE_PERCENTAGE_LT.value] = float(self._range_percentage_lt)
        if self._range_percentage_gt is not None:
            validate_type(self._range_percentage_gt, 'range_percentage_gt', [int, float], 'float')
            param_dict[ParamNameEnum.RANGE_PERCENTAGE_GT.value] = float(self._range_percentage_gt)
        if self._max_min_lt is not None:
            validate_type(self._max_min_lt, 'max_min_lt', [int, float], 'float')
            param_dict[ParamNameEnum.MAX_MIN_LT.value] = float(self._max_min_lt)
        if self._max_min_gt is not None:
            validate_type(self._max_min_gt, 'max_min_gt', [int, float], 'float')
            param_dict[ParamNameEnum.MAX_MIN_GT.value] = float(self._max_min_gt)
        if not self._has_threshold_param(param_dict):
            msg = "Please specify at least one of the parameters " \
                  "[range_percentage_lt, range_percentage_gt, max_min_lt, max_min_gt] " \
                  "for TensorRangeCondition."
            raise DebuggerParamValueError(msg)
        # a percentage threshold requires both range bounds
        if (ParamNameEnum.RANGE_PERCENTAGE_LT.value in param_dict.keys()
                or ParamNameEnum.RANGE_PERCENTAGE_GT.value in param_dict.keys()):
            if (ParamNameEnum.RANGE_START_INCLUSIVE.value not in param_dict.keys()
                    or ParamNameEnum.RANGE_END_INCLUSIVE.value not in param_dict.keys()):
                msg = ("Please specify both range_start_inclusive and "
                       "range_end_inclusive parameters for TensorRangeCondition.")
                raise DebuggerParamValueError(msg)
        return param_dict

    @staticmethod
    def _has_threshold_param(param_dict):
        """Check if a threshold parameter is set."""
        threshold_param_names = [
            ParamNameEnum.RANGE_PERCENTAGE_LT.value,
            ParamNameEnum.RANGE_PERCENTAGE_GT.value,
            ParamNameEnum.MAX_MIN_LT.value,
            ParamNameEnum.MAX_MIN_GT.value
        ]
        for param_name in threshold_param_names:
            if param_name in param_dict:
                return True
        return False

    @property
    def param_names(self):
        """
        Return the list of parameter names.

        Returns:
            list[str], the parameter names.
        """
        names = [
            ParamNameEnum.RANGE_START_INCLUSIVE.value,
            ParamNameEnum.RANGE_END_INCLUSIVE.value,
            ParamNameEnum.RANGE_PERCENTAGE_LT.value,
            ParamNameEnum.RANGE_PERCENTAGE_GT.value,
            ParamNameEnum.MAX_MIN_LT.value,
            ParamNameEnum.MAX_MIN_GT.value
        ]
        return names

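# Usage sketch (hypothetical threshold values): when a percentage threshold is
# used, both range bounds are mandatory, as enforced in _get_param_dict above.
# The condition below hits when the percentage of the tensor's values inside
# [0.0, 1.0] is less than the given threshold:
#
#     >>> from mindinsight.debugger import TensorRangeCondition
#     >>> my_condition = TensorRangeCondition(range_start_inclusive=0.0,
#     ...                                     range_end_inclusive=1.0,
#     ...                                     range_percentage_lt=50.0)
#     >>> print(my_condition.name)
#     TensorRange
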
class TensorOverflowCondition(ConditionBase):
    """
    Tensor overflow watchpoint.

    The tensor overflow watchpoint checks for INF and NaN values in the tensor.

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Examples:
        >>> from mindinsight.debugger import TensorOverflowCondition
        >>> my_condition = TensorOverflowCondition()
        >>> print(my_condition.name)
        TensorOverflow
    """

    def __init__(self):
        pass

    @property
    def name(self):
        return "TensorOverflow"

    @property
    def condition_id(self):
        return WatchpointConditionId.TENSOR_OVERFLOW.value

    @property
    def param_names(self):
        """
        Return the list of parameter names.

        Returns:
            list[str], the parameter names.
        """
        return []

class OperatorOverflowCondition(ConditionBase):
    """
    Operator overflow watchpoint.

    The operator overflow watchpoint checks whether overflow occurs during operator
    computation. Only the Ascend AI processor is supported.

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Examples:
        >>> from mindinsight.debugger import OperatorOverflowCondition
        >>> my_condition = OperatorOverflowCondition()
        >>> print(my_condition.name)
        OperatorOverflow
    """

    def __init__(self):
        pass

    @property
    def name(self):
        return "OperatorOverflow"

    @property
    def condition_id(self):
        return WatchpointConditionId.OPERATOR_OVERFLOW.value

    @property
    def param_names(self):
        """
        Return the list of parameter names.

        Returns:
            list[str], the parameter names.
        """
        return []

class TensorAllZeroCondition(ConditionBase):
    """
    Tensor all zero watchpoint.

    When all specified checking conditions are satisfied, this watchpoint is hit
    after a check.

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Args:
        zero_percentage_ge (float): The threshold for the percentage of zero values
            in the tensor. The checking condition is satisfied when the percentage
            of zero values is greater than or equal to this threshold.

    Examples:
        >>> from mindinsight.debugger import TensorAllZeroCondition
        >>> my_condition = TensorAllZeroCondition(zero_percentage_ge=0.0)
        >>> print(my_condition.name)
        TensorAllZero
    """

    def __init__(self, zero_percentage_ge):
        validate_type(zero_percentage_ge, 'zero_percentage_ge', [int, float], 'float')
        self._zero_percentage_ge = float(zero_percentage_ge)

    @property
    def name(self):
        return "TensorAllZero"

    @property
    def condition_id(self):
        return WatchpointConditionId.TENSOR_ALL_ZERO.value

    @property
    def param_dict(self):
        param_dict = {ParamNameEnum.ZERO_PERCENTAGE_GE.value: self._zero_percentage_ge}
        return param_dict

    @property
    def param_names(self):
        """
        Return the list of parameter names.

        Returns:
            list[str], the parameter names.
        """
        return [ParamNameEnum.ZERO_PERCENTAGE_GE.value]

class TensorUnchangedCondition(ConditionBase):
    """
    Tensor unchanged watchpoint.

    When all specified checking conditions are satisfied, this watchpoint is hit
    after a check. Checks the previous and current tensors with an allclose-style
    comparison:

    abs_mean(current_tensor - previous_tensor)
    <= (atol + rtol * abs_mean(previous_tensor))

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Args:
        rtol (float, optional): The relative tolerance parameter. Default: 1e-5.
        atol (float, optional): The absolute tolerance parameter. Default: 1e-8.

    Examples:
        >>> from mindinsight.debugger import TensorUnchangedCondition
        >>> my_condition = TensorUnchangedCondition(rtol=1000.0)
        >>> print(my_condition.name)
        TensorUnchanged
    """

    def __init__(self, rtol=1e-5, atol=1e-8):
        validate_type(rtol, 'rtol', [float, int], 'float or int')
        validate_type(atol, 'atol', [float, int], 'float or int')
        self._rtol = float(rtol)
        self._atol = float(atol)

    @property
    def name(self):
        return "TensorUnchanged"

    @property
    def condition_id(self):
        return WatchpointConditionId.UNCHANGED_TENSOR.value

    @property
    def param_dict(self):
        param_dict = {
            ParamNameEnum.RTOL.value: self._rtol,
            ParamNameEnum.ATOL.value: self._atol}
        return param_dict

    @property
    def param_names(self):
        """
        Return the list of parameter names.

        Returns:
            list[str], the parameter names.
        """
        names = [
            ParamNameEnum.RTOL.value,
            ParamNameEnum.ATOL.value,
            ParamNameEnum.EQUAL_NAN.value
        ]
        return names

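# Worked example with illustrative numbers: with the defaults rtol=1e-5 and
# atol=1e-8, the check above compares
#     abs_mean(current - previous) <= atol + rtol * abs_mean(previous).
# For abs_mean(previous) = 0.01 the right-hand side is 1e-8 + 1e-5 * 0.01 = 1.1e-7,
# so an update with abs_mean(current - previous) = 1e-7 counts as unchanged and
# the watchpoint hits.
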
class TensorChangeBelowThresholdCondition(ConditionBase):
    """
    Tensor change below threshold watchpoint.

    When all specified checking conditions are satisfied, this watchpoint is hit
    after a check:

    abs_mean(current_tensor - previous_tensor)
    < epsilon + abs_mean_update_ratio_lt * abs_mean(previous_tensor)

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Args:
        abs_mean_update_ratio_lt (float): The threshold for the mean update ratio.
            If the mean update ratio is less than this value, the watchpoint will
            be triggered.
        epsilon (float, optional): Epsilon value. Default: 1e-9.

    Examples:
        >>> from mindinsight.debugger import TensorChangeBelowThresholdCondition
        >>> my_condition = TensorChangeBelowThresholdCondition(abs_mean_update_ratio_lt=2.0)
        >>> print(my_condition.name)
        TensorChangeBelowThreshold
    """

    def __init__(self, abs_mean_update_ratio_lt, epsilon=1e-9):
        validate_type(abs_mean_update_ratio_lt, 'abs_mean_update_ratio_lt', [float, int], 'float')
        validate_type(epsilon, 'epsilon', [float, int], 'float')
        self._abs_mean_update_ratio_lt = float(abs_mean_update_ratio_lt)
        self._epsilon = float(epsilon)

    @property
    def name(self):
        return "TensorChangeBelowThreshold"

    @property
    def condition_id(self):
        return WatchpointConditionId.TENSOR_CHANGE_TOO_SMALL.value

    @property
    def param_dict(self):
        param_dict = {
            ParamNameEnum.ABS_MEAN_UPDATE_RATIO_LT.value: self._abs_mean_update_ratio_lt,
            ParamNameEnum.EPSILON.value: self._epsilon
        }
        return param_dict

    @property
    def param_names(self):
        """
        Return the list of parameter names.

        Returns:
            list[str], the parameter names.
        """
        names = [
            ParamNameEnum.ABS_MEAN_UPDATE_RATIO_LT.value,
            ParamNameEnum.EPSILON.value
        ]
        return names

class TensorChangeAboveThresholdCondition(ConditionBase):
    """
    Tensor change above threshold watchpoint.

    When all specified checking conditions are satisfied, this watchpoint is hit
    after a check:

    abs_mean(current_tensor - previous_tensor)
    > epsilon + abs_mean_update_ratio_gt * abs_mean(previous_tensor)

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Args:
        abs_mean_update_ratio_gt (float): The threshold for the mean update ratio.
            If the mean update ratio is greater than this value, the watchpoint
            will be triggered.
        epsilon (float, optional): Epsilon value. Default: 1e-9.

    Examples:
        >>> from mindinsight.debugger import TensorChangeAboveThresholdCondition
        >>> my_condition = TensorChangeAboveThresholdCondition(abs_mean_update_ratio_gt=0.0)
        >>> print(my_condition.name)
        TensorChangeAboveThreshold
    """

    def __init__(self, abs_mean_update_ratio_gt, epsilon=1e-9):
        validate_type(abs_mean_update_ratio_gt, 'abs_mean_update_ratio_gt', [float, int], 'float')
        validate_type(epsilon, 'epsilon', [float, int], 'float')
        self._abs_mean_update_ratio_gt = float(abs_mean_update_ratio_gt)
        self._epsilon = float(epsilon)

    @property
    def name(self):
        return "TensorChangeAboveThreshold"

    @property
    def condition_id(self):
        return WatchpointConditionId.TENSOR_CHANGE_TOO_LARGE.value

    @property
    def param_dict(self):
        param_dict = {
            ParamNameEnum.ABS_MEAN_UPDATE_RATIO_GT.value: self._abs_mean_update_ratio_gt,
            ParamNameEnum.EPSILON.value: self._epsilon
        }
        return param_dict

    @property
    def param_names(self):
        """
        Return the list of parameter names.

        Returns:
            list[str], the parameter names.
        """
        names = [
            ParamNameEnum.ABS_MEAN_UPDATE_RATIO_GT.value,
            ParamNameEnum.EPSILON.value
        ]
        return names

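# Worked example with illustrative numbers: with abs_mean_update_ratio_gt=0.1 and
# the default epsilon=1e-9, the check above,
#     abs_mean(current - previous) > epsilon + 0.1 * abs_mean(previous),
# hits for abs_mean(previous) = 0.5 and abs_mean(current - previous) = 0.06,
# since 0.06 > 1e-9 + 0.05. The mirror-image reasoning applies to
# TensorChangeBelowThresholdCondition with the '<' comparison.
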
class Watchpoint:
    """
    Watchpoint applies a condition to specified tensors.

    .. warning::
        All APIs in this class are experimental prototypes that are subject to change
        or deletion.

    Args:
        tensors (Iterable[DebuggerTensor]): The tensors to check.
        condition (ConditionBase): The condition to apply to the tensors.

    Examples:
        >>> from mindinsight.debugger import DumpAnalyzer
        >>> from mindinsight.debugger import TensorTooLargeCondition, Watchpoint
        >>> my_run = DumpAnalyzer(dump_dir="/path/to/your/dump_dir_with_dump_data")
        >>> tensor_list = my_run.select_tensors(
        ...     query_string="Conv",
        ...     use_regex=True,
        ...     iterations=[0],
        ...     ranks=[0],
        ...     slots=[0]
        ... )
        >>> watchpoint = Watchpoint(tensors=tensor_list,
        ...                         condition=TensorTooLargeCondition(abs_mean_gt=0.0))
        >>> tensor = list(watchpoint.tensors)[0]
        >>> print(tensor.node.name)
        Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Cast-op7
        >>> print(watchpoint.condition.name)
        TensorTooLarge
    """

    def __init__(self, tensors, condition):
        validate_tensor_list(tensors, 'tensors')
        validate_type(condition, 'condition', ConditionBase, 'ConditionBase')
        self._tensors = tensors
        self._condition = condition

    @property
    def tensors(self):
        """
        Get the tensors to check.

        Returns:
            Iterable[DebuggerTensor], the tensors to check.
        """
        return self._tensors

    @property
    def condition(self):
        """
        Get the condition to apply to the tensors.

        Returns:
            ConditionBase, the condition to apply to the tensors.
        """
        return self._condition

class WatchpointHandle:
    """Watchpoint handle."""

    def __init__(self, watchpoint_id, watchpoint):
        validate_type(watchpoint, 'watchpoint', Watchpoint, 'Watchpoint')
        self.watchpoint_id = watchpoint_id
        self.condition = watchpoint.condition
        self.sorted_tensors = self._organize_tensor(watchpoint.tensors)
        self.tensors = watchpoint.tensors

    @staticmethod
    def _get_value(default_map, key, default_value):
        """Get the value for the key in default_map, setting the default if missing."""
        value = default_map.get(key)
        if value is None:
            value = default_value
            default_map[key] = value
        return value

    def _organize_tensor(self, tensors):
        """Sort the tensors and remove duplicates."""
        sorted_tensor = {}
        for tensor in tensors:
            validate_type(tensor, 'tensors', DebuggerTensor, 'List[DebuggerTensor]')
            node_map = self._get_value(sorted_tensor, tensor.iteration, {})
            slot_map = self._get_value(node_map, tensor.node.unique_id, {
                'node': tensor.node,
                'slot_map': {}
            }).get('slot_map')
            slot_map[tensor.slot] = tensor
        return sorted_tensor

    def get_iterations(self):
        """Get the iterations to be checked in this watchpoint."""
        return list(self.sorted_tensors.keys())

    def need_check(self, tensor):
        """Check whether the tensor needs to be checked."""
        slot_map = self.sorted_tensors.get(
            tensor.iteration, {}).get(tensor.node.unique_id, {}).get('slot_map', {})
        if slot_map.get(tensor.slot) is not None:
            return True
        return False

    def get_check_nodes(self, iteration):
        """Get the nodes to check."""
        if iteration is None:
            return {}
        check_nodes = {}
        for node_info in self.sorted_tensors.get(iteration, {}).values():
            node = node_info.get('node')
            node_name = node.full_name_with_graph
            check_node = self._get_value(check_nodes, node_name, {
                "rank_id": [node.rank],
                "is_output": True,
                "root_graph_id": [node.root_graph_id]
            })
            if node.rank not in check_node.get('rank_id'):
                check_node["rank_id"].append(node.rank)
        return check_nodes

    def add_watchpoint(self, iteration, debugger_engine):
        """Add a watchpoint for the selected iteration."""
        check_nodes = self.get_check_nodes(iteration)
        # only add the watchpoint if there is something to check in this iteration
        if check_nodes:
            params = self._get_param_list(debugger_engine.dbg_services_module.Parameter)
            debugger_engine.dbg_service.add_watchpoint(
                watchpoint_id=self.watchpoint_id,
                watch_condition=self.condition.condition_id,
                check_node_list=check_nodes,
                parameter_list=params
            )

    def _get_param_list(self, parameter_class):
        """Get the param list."""
        params = []
        set_params = self.condition.param_dict
        for param_name in self.condition.param_names:
            set_value = set_params.get(param_name)
            if set_value is not None:
                param = parameter_class(name=param_name, disabled=False, value=set_value)
            else:
                param = parameter_class(name=param_name, disabled=True, value=0.0)
            params.append(param)
        return params

    def watchpoint_hit_on_no_value(self, iteration):
        """Return a list of WatchpointHit for tensors whose npy files are missing, when error_on_no_value is True."""
        no_value_hit_list = []
        node_map = self.sorted_tensors.get(iteration)
        if not node_map:
            return no_value_hit_list
        for node_info in node_map.values():
            for tensor in node_info.get('slot_map', {}).values():
                if tensor.has_value() is False:
                    hit_params = []
                    hit_detail = HitDetail(hit_params, self.condition)
                    # error code 32 sets bit 5, which means "Tensor has no value."
                    error_no_value_code = 32
                    no_value_hit = WatchpointHitImpl(tensor=tensor,
                                                     condition=self.condition,
                                                     hit_detail=hit_detail,
                                                     error_code=error_no_value_code)
                    no_value_hit_list.append(no_value_hit)
        return no_value_hit_list


class WatchpointConditionId(Enum):
    """Watchpoint condition ID."""
    OPERATOR_OVERFLOW = 2
    TENSOR_OVERFLOW = 13
    INITIAL_WEIGHT = 14
    TENSOR_TOO_LARGE = 15
    TENSOR_TOO_SMALL = 16
    TENSOR_ALL_ZERO = 17
    TENSOR_CHANGE_TOO_LARGE = 18
    TENSOR_CHANGE_TOO_SMALL = 19
    UNCHANGED_TENSOR = 20
    TENSOR_RANGE = 21


def validate_tensor_list(param, param_name):
    """Validate that the param is a list of DebuggerTensor."""
    if not isinstance(param, list):
        raise DebuggerParamTypeError(f"The type of {param_name} should be list of DebuggerTensor. "
                                     f"But the actual type is {type(param)}.")
    for i, value in enumerate(param):
        if not isinstance(value, DebuggerTensor):
            raise DebuggerParamTypeError(f"The type of {param_name} should be list of DebuggerTensor. "
                                         f"But the value at index {i} is of type {type(value)}.")