# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file refers to the following projects:
# https://github.com/huawei-noah/Pretrained-Language-Model/blob/master/PanGu-%CE%B1/utils.py
# https://github.com/huggingface/transformers/blob/main/src/transformers/optimization.py
# ============================================================================
"""Self-Define LR Schedule."""
import math
from mindspore._checkparam import args_type_check
from mindspore.ops import operations as P
import mindspore.common.dtype as mstype
from mindspore.nn.learning_rate_schedule import LearningRateSchedule
from mindspore.common.tensor import Tensor
from mindformers.tools.register import MindFormerRegister, MindFormerModuleType
from mindformers.tools.logger import logger
# Names of the learning-rate schedule classes exported as this module's public API.
__all__ = [
    'LearningRateWiseLayer', 'ConstantWarmUpLR', 'ConstantWithCoolDownLR', 'LinearWithWarmUpLR', 'CosineWithWarmUpLR',
    'CosineWithRestartsAndWarmUpLR', 'PolynomialWithWarmUpLR', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts',
    'WarmUpStableDecayLR']
def _get_lr_steps(steps: int, ratio: float, total_steps: int, phase_tag: str):
"""check args and get specified steps."""
if ratio is None:
if not isinstance(steps, int):
raise TypeError(f"The type of {phase_tag}_step must be int, but got {type(steps)}")
if steps < 0:
raise ValueError(f"The {phase_tag}_step must be >= 0, but got {steps}")
return steps
if not isinstance(ratio, (float, int)):
raise TypeError(f"The type of {phase_tag}_ratio must be float or int, but got {type(ratio)}")
if ratio > 1.0 or ratio < 0.0:
raise ValueError(f"The {phase_tag}_ratio's value range must be in [0,1], but got {ratio}")
_validate_total_steps(total_steps)
steps = int(total_steps * ratio)
logger.info(f"Current {phase_tag}_ratio is %s, total_steps is %s, {phase_tag}_steps will be set to %s",
ratio, total_steps, steps)
return steps
def _validate_total_steps(total_steps):
"""Validate total step"""
if total_steps is None:
raise ValueError(f"When warmup_ratio/decay_ratio takes effect, total_steps must be set, but got {total_steps}")
if not isinstance(total_steps, int):
raise TypeError(f"The type of total_steps must be int, but got {type(total_steps)}")
def _check_decay_method(decay_steps: int, total_steps: int):
"""check decay method."""
if decay_steps is not None:
return
if decay_steps is None and total_steps is None:
raise ValueError(f"When decay_steps is None, total_steps must be set, but got {total_steps} ")
@MindFormerRegister.register(MindFormerModuleType.LR)
class ConstantWithCoolDownLR(LearningRateSchedule):
    r"""
    Constant Learning Rate with Cooldown.

    Implement as described in the paper `DeepSeek-V3 Technical Report <https://arxiv.org/pdf/2412.19437>`_, page 23.

    The ConstantWithCoolDownLR uses a linear warm-up strategy to gradually increase the learning rate
    for each parameter group, and keep stable for some steps, after which it follows a cosine function to decay.
    Finally, it will switch to a new constant learning rate after another steps.

    The schedule therefore consists of five consecutive phases:

    1. warm-up (`warmup_steps` steps): linear increase from `warmup_lr_init` to `learning_rate`;
    2. keep (`keep_steps` steps): constant at `learning_rate`;
    3. decay (`decay_steps` steps): cosine decay from `learning_rate` towards `lr_end1`;
    4. final (`final_steps` steps): constant at `lr_end1`;
    5. afterwards: constant at `lr_end2`.

    During the warm-up phase, the learning rate increases linearly from a smaller initial value to the base
    learning rate, as described by the following formula:

    .. math::
        \eta_t = \eta_{\text{warmup}} + t \times \frac{\eta_{\text{base}} - \eta_{\text{warmup}}}{\text{warmup_steps}}

    where :math:`\eta_{\text{warmup}}` is the initial learning rate during the warm-up phase,
    and :math:`\eta_{\text{base}}` is the base learning rate after the warm-up phase.

    During the decay phase, the learning rate follows a cosine decay schedule
    (shown for the default ``num_cycles=0.5``):

    .. math::
        \eta_t = \eta_{\text{end}} + \frac{1}{2}(\eta_{\text{base}} - \eta_{\text{end}})\left(1
        + \cos\left(\frac{t_{cur}}{t_{max}}\pi\right)\right)

    where :math:`t_{cur}` is the number of steps since the beginning of the decay phase, and :math:`t_{max}` is
    the total number of decay steps.

    Args:
        learning_rate (float): Learning rate after the warm-up phase.
            This learning rate will be unchanged in keep phase.
        warmup_steps (int, optional): The number of warm up steps. Default: ``None``.
        warmup_lr_init (float, optional): Initial learning rate in warm up steps. Default: ``0.``.
        warmup_ratio (float, optional): Ratio of total training steps used for warmup. Default: ``None``.
        keep_steps (int, optional): The number of steps keeping at the max learning rate after warmup. Default: ``0``.
        decay_steps (int, optional): The number of decay steps. When ``None``, `total_steps` is used.
            Default: ``None``.
        decay_ratio (float, optional): Ratio of total training steps used for decay. Default: ``None``.
        total_steps (int, optional): The number of total steps. Default: ``None``.
        num_cycles (float, optional): The number of waves in the cosine schedule (the defaults is to just
            decrease from the max value to 0 following a half-cosine). Default: ``0.5``.
        lr_end1 (float, optional): The value of learning rate after decay. Default: ``0.``.
        final_steps (int, optional): The number of steps keeping at `lr_end1`. Default: ``0``.
        lr_end2 (float, optional): Final value of learning rate.
            The same as `lr_end1` if set ``None``. Default: ``None``.

    Inputs:
        - **global_step** (Tensor) - The global step.

    Outputs:
        Learning rate.

    Raises:
        ValueError: If both `decay_steps` and `total_steps` are ``None``.

    Examples:
        >>> import mindspore as ms
        >>> from mindformers.core.lr import ConstantWithCoolDownLR
        >>>
        >>> ms.set_context(mode=ms.GRAPH_MODE)
        >>> warmup_steps = 10
        >>> keep_steps = 10
        >>> decay_steps = 10
        >>> final_steps = 10
        >>> learning_rate = 0.005
        >>>
        >>> linear_warmup = ConstantWithCoolDownLR(learning_rate=learning_rate,
        ...                                        warmup_steps=warmup_steps,
        ...                                        keep_steps=keep_steps,
        ...                                        decay_steps=decay_steps,
        ...                                        final_steps=final_steps,
        ...                                        lr_end1=0.002, lr_end2=0.001)
        >>> print(linear_warmup(ms.Tensor(1)))
        0.0005
        >>> print(linear_warmup(ms.Tensor(15)))
        0.005
        >>> print(linear_warmup(ms.Tensor(25)))
        0.0035
        >>> print(linear_warmup(ms.Tensor(35)))
        0.002
        >>> print(linear_warmup(ms.Tensor(45)))
        0.001
    """

    @args_type_check(
        learning_rate=(int, float), warmup_steps=int, warmup_lr_init=(int, float), warmup_ratio=(int, float),
        keep_steps=int, decay_steps=int, decay_ratio=(int, float), total_steps=int,
        num_cycles=(int, float), lr_end1=(int, float), final_steps=int, lr_end2=(int, float)
    )
    def __init__(
            self,
            learning_rate: float,
            warmup_steps: int = None,
            warmup_lr_init: float = 0.,
            warmup_ratio: float = None,
            keep_steps: int = 0,
            decay_steps: int = None,
            decay_ratio: float = None,
            total_steps: int = None,
            num_cycles: float = 0.5,
            lr_end1: float = 0,
            final_steps: int = 0,
            lr_end2: float = None,
            **kwargs
    ):
        super(ConstantWithCoolDownLR, self).__init__()
        # Fail with a clear message instead of a TypeError from max(1, None) below.
        _check_decay_method(decay_steps, total_steps)
        warmup_steps_ = _get_lr_steps(warmup_steps, warmup_ratio, total_steps, "warmup")
        decay_steps = max(1, decay_steps) if decay_steps is not None else max(1, total_steps)
        decay_steps_ = _get_lr_steps(decay_steps, decay_ratio, total_steps, "decay")
        keep_steps_ = max(0, keep_steps) if keep_steps is not None else 0
        final_steps_ = max(0, final_steps) if final_steps is not None else 0

        self.kwargs = kwargs
        self.warmup_steps = Tensor(warmup_steps_, mstype.float32)
        self.decay_steps = Tensor(decay_steps_, mstype.float32)
        self.keep_steps = Tensor(keep_steps_, mstype.float32)
        self.final_steps = Tensor(final_steps_, mstype.float32)
        self.learning_rate = learning_rate
        self.warmup_lr_init = warmup_lr_init
        self.lr_end1 = lr_end1
        # Only None falls back to lr_end1; an explicit 0.0 is a legitimate final learning rate.
        self.lr_end2 = lr_end1 if lr_end2 is None else lr_end2
        self.num_cycles = num_cycles
        self.greater = P.Greater()
        self.greater_equal = P.GreaterEqual()
        self.max = P.Maximum()
        self.math_pi = math.pi
        self.cos = P.Cos()
        self.zero_constant = Tensor(0.0, mstype.float32)
        self.cast = P.Cast()

    def construct(self, global_step):
        """compute current step lr."""
        global_step = self.cast(global_step, mstype.float32)
        if self.warmup_steps != 0 and self.greater(self.warmup_steps, global_step):
            # Warm-up phase: linear ramp from warmup_lr_init to learning_rate.
            percent = global_step / self.warmup_steps
            learning_rate = self.warmup_lr_init + (self.learning_rate - self.warmup_lr_init) * percent
        elif self.greater(self.warmup_steps + self.keep_steps, global_step):
            # Keep phase. `global_step - global_step` keeps the result dependent on the input tensor,
            # the same trick the sibling schedules use to circumvent mindspore control flow issues.
            learning_rate = global_step - global_step + self.learning_rate
        elif self.greater(self.warmup_steps + self.keep_steps + self.decay_steps, global_step):
            # Decay phase: cosine interpolation between learning_rate and lr_end1.
            progress = (global_step - self.keep_steps - self.warmup_steps) / self.decay_steps
            percent = self.max(
                self.zero_constant, 0.5 * (1.0 + self.cos(self.math_pi * self.num_cycles * 2.0 * progress)))
            learning_rate = self.lr_end1 + (self.learning_rate - self.lr_end1) * percent
        elif self.greater(self.warmup_steps + self.keep_steps + self.decay_steps + self.final_steps, global_step):
            # Final phase: hold lr_end1.
            learning_rate = global_step - global_step + self.lr_end1
        else:
            # After all phases: hold lr_end2.
            learning_rate = global_step - global_step + self.lr_end2
        return learning_rate
@MindFormerRegister.register(MindFormerModuleType.LR)
class ConstantWarmUpLR(LearningRateSchedule):
    r"""
    Constant Warm Up Learning Rate.

    This learning rate strategy increases the learning rate linearly during the warm-up phase and then keeps
    it constant. It is particularly suitable for scenarios where a stable, lower learning rate is needed at the
    beginning of training to avoid issues such as gradient explosion, before transitioning to
    the main learning rate.

    During the warm-up phase, the learning rate increases linearly from :math:`\eta_{\text{warmup}}` to the
    main learning rate :math:`\eta_{\text{main}}` :

    .. math::
        \eta_t = \eta_{\text{warmup}} + t \times \frac{\eta_{\text{main}} - \eta_{\text{warmup}}}{\text{warmup_steps}}

    Here, :math:`\eta_{\text{warmup}}` is the initial learning rate of the warm-up phase (`warmup_lr_init`),
    and :math:`t` represents the current step.

    After the warm-up phase concludes, the learning rate stays at the main learning rate,
    denoted as :math:`\eta_{\text{main}}` . The formula for the learning rate after the transition is:

    .. math::
        \eta_t = \eta_{\text{main}}

    Args:
        learning_rate (float): Learning rate after the warm-up phase.
        warmup_steps (int, optional): The number of warm up steps. Default: ``None``.
        warmup_lr_init (float, optional): Initial learning rate in warm up steps. Default: ``0.``.
        warmup_ratio (float, optional): Ratio of total training steps used for warmup. Default: ``None``.
        total_steps (int, optional): The number of total steps. Only used to convert `warmup_ratio`
            into a number of warm up steps. Default: ``None``.

    Inputs:
        - **global_step** (Tensor) - The global step.

    Outputs:
        Learning rate.

    Examples:
        >>> import mindspore as ms
        >>> from mindformers.core import ConstantWarmUpLR
        >>>
        >>> ms.set_context(mode=ms.GRAPH_MODE)
        >>> total_steps = 20
        >>> warmup_steps = 10
        >>> learning_rate = 0.005
        >>>
        >>> constant_warmup = ConstantWarmUpLR(learning_rate=learning_rate,
        ...                                    warmup_steps=warmup_steps,
        ...                                    total_steps=total_steps)
        >>> print(constant_warmup(ms.Tensor(1)))
        0.0005
        >>> print(constant_warmup(ms.Tensor(15)))
        0.005
    """

    @args_type_check(
        learning_rate=(int, float), warmup_steps=int, warmup_lr_init=(int, float), warmup_ratio=(int, float),
        total_steps=int
    )
    def __init__(self, learning_rate: float, warmup_steps: int = None, warmup_lr_init: float = 0.,
                 warmup_ratio: float = None, total_steps: int = None, **kwargs):
        super(ConstantWarmUpLR, self).__init__()
        warmup_steps = _get_lr_steps(warmup_steps, warmup_ratio, total_steps, "warmup")
        self.learning_rate = learning_rate
        self.warmup_lr_init = warmup_lr_init
        self.warmup_steps = Tensor(warmup_steps, mstype.float32)
        self.one_constant = Tensor(1.0, mstype.float32)
        self.greater = P.Greater()
        self.cast = P.Cast()
        self.kwargs = kwargs

    def construct(self, global_step):
        """compute current step lr."""
        # Cast like the sibling schedules so that division and comparison run in float32.
        global_step = self.cast(global_step, mstype.float32)
        if self.warmup_steps != 0 and self.greater(self.warmup_steps, global_step):
            # Warm-up phase: linear ramp from warmup_lr_init to learning_rate.
            percent = global_step / self.warmup_steps
            learning_rate = self.warmup_lr_init + (self.learning_rate - self.warmup_lr_init) * percent
        else:
            # Constant phase: multiplying by a Tensor keeps the output a Tensor.
            percent = self.one_constant
            learning_rate = self.learning_rate * percent
        return learning_rate
@MindFormerRegister.register(MindFormerModuleType.LR)
class LinearWithWarmUpLR(LearningRateSchedule):
    r"""
    Linear with Warm Up Learning Rate.

    The LinearWithWarmUpLR scheduler raises the learning rate linearly during a warm-up phase and, once the
    warm-up is over, lowers it linearly so that it reaches zero at `total_steps`.

    Warm-up phase (``global_step < warmup_steps``):

    .. math::
        \eta_t = \eta_{\text{warmup}} + t \times \frac{\eta_{\text{base}} - \eta_{\text{warmup}}}{\text{warmup_steps}}

    where :math:`\eta_{\text{warmup}}` is the initial learning rate during the warm-up phase (`warmup_lr_init`),
    and :math:`\eta_{\text{base}}` is the base learning rate after the warm-up phase (`learning_rate`).

    Linear phase (all remaining steps):

    .. math::
        \eta_t = \eta_{\text{base}} \times \max\left(0, \frac{\text{total_steps} - t}
        {\max(1, \text{total_steps} - \text{warmup_steps})}\right)

    where :math:`\text{total_steps}` is the total number of training steps, and :math:`\text{warmup_steps}`
    is the number of steps in the warm-up phase. The learning rate never drops below zero.

    Args:
        learning_rate (float): Learning rate after the warm-up phase.
        total_steps (int): The number of total steps.
        warmup_steps (int, optional): The number of warm up steps. Default: ``None``.
        warmup_lr_init (float, optional): Initial learning rate in warm up steps. Default: ``0.``.
        warmup_ratio (float, optional): Ratio of total training steps used for warmup. Default: ``None``.

    Inputs:
        - **global_step** (Tensor) - The global step.

    Outputs:
        Learning rate.

    Examples:
        >>> import mindspore as ms
        >>> from mindformers.core import LinearWithWarmUpLR
        >>>
        >>> ms.set_context(mode=ms.GRAPH_MODE)
        >>> total_steps = 20
        >>> warmup_steps = 10
        >>> learning_rate = 0.005
        >>>
        >>> linear_warmup = LinearWithWarmUpLR(learning_rate=learning_rate,
        ...                                    warmup_steps=warmup_steps,
        ...                                    total_steps=total_steps)
        >>> print(linear_warmup(ms.Tensor(1)))
        0.0005
        >>> print(linear_warmup(ms.Tensor(15)))
        0.0025
    """

    @args_type_check(
        learning_rate=(int, float), warmup_steps=int, warmup_lr_init=(int, float), warmup_ratio=(int, float),
        total_steps=int
    )
    def __init__(self, learning_rate: float, total_steps: int, warmup_steps: int = None,
                 warmup_lr_init: float = 0., warmup_ratio: float = None,
                 **kwargs):
        super().__init__()
        n_warmup = _get_lr_steps(warmup_steps, warmup_ratio, total_steps, "warmup")
        # Guard against a zero-length linear phase to avoid dividing by zero later on.
        n_linear = max(1, total_steps - n_warmup)

        # Plain hyper-parameters.
        self.kwargs = kwargs
        self.learning_rate = learning_rate
        self.warmup_lr_init = warmup_lr_init

        # Step counts are stored as float32 tensors so they can be mixed with the cast global step.
        self.total_steps = Tensor(total_steps, mstype.float32)
        self.warmup_steps = Tensor(n_warmup, mstype.float32)
        self.linear_steps = Tensor(n_linear, mstype.float32)
        self.zero_constant = Tensor(0.0, mstype.float32)

        # Operators used in construct.
        self.greater = P.Greater()
        self.max = P.Maximum()
        self.cast = P.Cast()

    def construct(self, global_step):
        """Return the learning rate for the given global step."""
        step = self.cast(global_step, mstype.float32)
        if self.warmup_steps != 0 and self.greater(self.warmup_steps, step):
            # Still warming up: interpolate between warmup_lr_init and learning_rate.
            warm_fraction = step / self.warmup_steps
            return self.warmup_lr_init + (self.learning_rate - self.warmup_lr_init) * warm_fraction
        # Linear phase: fraction of steps left, clamped at zero once total_steps is passed.
        remaining_fraction = (self.total_steps - step) / self.linear_steps
        return self.learning_rate * self.max(self.zero_constant, remaining_fraction)
@MindFormerRegister.register(MindFormerModuleType.LR)
class CosineWithWarmUpLR(LearningRateSchedule):
    r"""
    Cosine with Warm Up Learning Rate.

    The CosineWithWarmUpLR learning rate scheduler applies a cosine annealing schedule with warm-up steps to
    set the learning rate for each parameter group. Initially, the learning rate increases linearly during
    the warm-up phase, after which it follows a cosine function to decay. From step `decay_steps` onwards the
    learning rate is fixed at `lr_end`.

    During the warm-up phase, the learning rate increases from a small initial value to the base
    learning rate as follows:

    .. math::
        \eta_t = \eta_{\text{warmup}} + t \times \frac{\eta_{\text{base}} - \eta_{\text{warmup}}}{\text{warmup_steps}}

    where :math:`\eta_{\text{warmup}}` is the initial learning rate, and :math:`\eta_{\text{base}}` is the
    learning rate after the warm-up phase.

    Once the warm-up phase is completed, the learning rate follows a cosine decay schedule
    (shown for the default ``num_cycles=0.5``):

    .. math::
        \eta_t = \eta_{\text{end}} + \frac{1}{2}(\eta_{\text{base}} - \eta_{\text{end}})\left(1
        + \cos\left(\frac{t_{cur}}{t_{max}}\pi\right)\right)

    where :math:`t_{cur}` is the number of steps since the end of the warm-up phase, and :math:`t_{max}` is
    the length of the cosine phase, ``max(1, total_steps - warmup_steps)``. When `total_steps` is not given,
    `decay_steps` is used in its place.

    Args:
        learning_rate (float): Learning rate after the warm-up phase.
        warmup_steps (int, optional): The number of warm up steps. Default: ``0``.
        total_steps (int, optional): The number of total steps. Default: ``None``.
        num_cycles (float, optional): The number of waves in the cosine schedule (the defaults is to just
            decrease from the max value to 0 following a half-cosine). Default: ``0.5``.
        lr_end (float, optional): Final value of learning rate. Default: ``0.``.
        warmup_lr_init (float, optional): Initial learning rate in warm up steps. Default: ``0.``.
        warmup_ratio (float, optional): Ratio of total training steps used for warmup. Default: ``None``.
        decay_steps (int, optional): The global step from which the learning rate is fixed at `lr_end`.
            When ``None``, `total_steps` is used. Default: ``None``.
        decay_ratio (float, optional): Ratio of total training steps used for decay. Default: ``None``.

    Inputs:
        - **global_step** (Tensor) - The global step.

    Outputs:
        Learning rate.

    Raises:
        ValueError: If both `decay_steps` and `total_steps` are ``None``.

    Examples:
        >>> import mindspore as ms
        >>> from mindformers.core import CosineWithWarmUpLR
        >>>
        >>> ms.set_context(mode=ms.GRAPH_MODE)
        >>> total_steps = 20
        >>> warmup_steps = 10
        >>> learning_rate = 0.005
        >>>
        >>> cosine_warmup = CosineWithWarmUpLR(learning_rate=learning_rate,
        ...                                    warmup_steps=warmup_steps,
        ...                                    total_steps=total_steps)
        >>> print(cosine_warmup(ms.Tensor(1)))
        0.0005
        >>> print(cosine_warmup(ms.Tensor(15)))
        0.0024999997
    """

    @args_type_check(
        learning_rate=(int, float), warmup_steps=int, warmup_lr_init=(int, float), warmup_ratio=(int, float),
        total_steps=int, num_cycles=(int, float), lr_end=(int, float)
    )
    def __init__(self, learning_rate: float, warmup_steps: int = 0, total_steps: int = None,
                 num_cycles: float = 0.5, lr_end: float = 0., warmup_lr_init: float = 0.,
                 warmup_ratio: float = None, decay_steps: int = None, decay_ratio: float = None, **kwargs):
        super(CosineWithWarmUpLR, self).__init__()
        _check_decay_method(decay_steps, total_steps)
        warmup_steps = _get_lr_steps(warmup_steps, warmup_ratio, total_steps, "warmup")
        decay_steps = max(1, decay_steps) if decay_steps is not None else max(1, total_steps)
        decay_steps_ = _get_lr_steps(decay_steps, decay_ratio, total_steps, "decay")
        # _check_decay_method accepts decay_steps without total_steps; in that case the cosine curve
        # spans up to decay_steps instead of failing on `None - warmup_steps`.
        cosine_horizon = total_steps if total_steps is not None else decay_steps_
        cosine_steps = max(1, cosine_horizon - warmup_steps)

        self.kwargs = kwargs
        self.learning_rate = learning_rate
        self.lr_end = Tensor(lr_end, mstype.float32)
        self.warmup_lr_init = warmup_lr_init
        self.warmup_steps = Tensor(warmup_steps, mstype.float32)
        self.cosine_steps = Tensor(cosine_steps, mstype.float32)
        self.decay_steps = Tensor(decay_steps_, mstype.float32)
        self.num_cycles = num_cycles
        self.greater = P.Greater()
        self.greater_equal = P.GreaterEqual()
        self.max = P.Maximum()
        self.math_pi = math.pi
        self.cos = P.Cos()
        self.zero_constant = Tensor(0.0, mstype.float32)
        self.cast = P.Cast()

    def construct(self, global_step):
        """compute current step lr."""
        global_step = self.cast(global_step, mstype.float32)
        if self.greater_equal(global_step, self.decay_steps):
            # Include global_step in computation to circumvent mindspore control flow issues
            return global_step - global_step + self.lr_end
        if self.warmup_steps != 0 and self.greater(self.warmup_steps, global_step):
            # Warm-up phase: linear ramp from warmup_lr_init to learning_rate.
            percent = global_step / self.warmup_steps
            learning_rate = self.warmup_lr_init + (self.learning_rate - self.warmup_lr_init) * percent
        else:
            # Cosine phase: interpolate between learning_rate and lr_end, never below lr_end.
            progress = (global_step - self.warmup_steps) / self.cosine_steps
            percent = self.max(
                self.zero_constant, 0.5 * (1.0 + self.cos(self.math_pi * self.num_cycles * 2.0 * progress)))
            learning_rate = self.lr_end + (self.learning_rate - self.lr_end) * percent
        return learning_rate
@MindFormerRegister.register(MindFormerModuleType.LR)
class CosineWithRestartsAndWarmUpLR(LearningRateSchedule):
    r"""
    Cosine with Restarts and Warm Up Learning Rate.

    The CosineWithRestartsAndWarmUpLR schedule sets the learning rate for each parameter group using a cosine
    annealing with restarts and warm-up, where :math:`\eta_{max}` is set to the initial learning rate,
    and :math:`T_{cur}` represents the number of steps since the last restart:

    .. math::
        \begin{aligned}
            \eta_t & = \eta_{\text{min}} + \frac{1}{2}(\eta_{\text{max}} - \eta_{\text{min}})\left(1
            + \cos\left(\frac{T_{cur}}{T_{i}}\pi\right)\right), & T_{cur} \neq (2k+1)T_{i}; \\
            \eta_{t+1} & = \eta_{\text{max}}, & T_{cur} = (2k+1)T_{i}.
        \end{aligned}

    Before the cosine phase the learning rate increases linearly from `warmup_lr_init` to `learning_rate`
    over `warmup_steps` steps. In the cosine phase, which lasts ``max(1, total_steps - warmup_steps)`` steps
    (`decay_steps` is used in place of `total_steps` when the latter is not given), the ratio
    :math:`T_{cur} / T_{i}` is the fractional part of ``num_cycles * progress``, where ``progress`` is the
    fraction of the cosine phase already completed. At every restart the learning rate begins
    anew from the maximum value and gradually decreases to the set minimum value. This strategy helps avoid getting
    trapped in local minima and accelerates convergence during training. After the cosine phase, and from step
    `decay_steps` onwards, the learning rate is fixed at `lr_end`.

    This method was proposed in SGDR: Stochastic Gradient Descent with Warm Restarts, extending the concept of
    cosine annealing to allow for multiple restarts.

    Args:
        learning_rate (float): Learning rate after the warm-up phase.
        warmup_steps (int, optional): The number of warm up steps. Default: ``None``.
        total_steps (int, optional): The number of total steps. Default: ``None``.
        num_cycles (float, optional): The number of cosine cycles (each starting from the max value)
            in the cosine phase. Default: ``1.0``.
        lr_end (float, optional): Final value of learning rate. Default: ``0.``.
        warmup_lr_init (float, optional): Initial learning rate in warm up steps. Default: ``0.``.
        warmup_ratio (float, optional): Ratio of total training steps used for warmup. Default: ``None``.
        decay_steps (int, optional): The global step from which the learning rate is fixed at `lr_end`.
            When ``None``, `total_steps` is used. Default: ``None``.

    Inputs:
        - **global_step** (Tensor) - The global step.

    Outputs:
        Learning rate.

    Raises:
        ValueError: If both `decay_steps` and `total_steps` are ``None``.

    Examples:
        >>> import mindspore as ms
        >>> from mindformers.core import CosineWithRestartsAndWarmUpLR
        >>>
        >>> ms.set_context(mode=ms.GRAPH_MODE)
        >>> total_steps = 20
        >>> warmup_steps = 10
        >>> learning_rate = 0.005
        >>>
        >>> cosine_warmup_restart = CosineWithRestartsAndWarmUpLR(learning_rate=learning_rate,
        ...                                                       warmup_steps=warmup_steps,
        ...                                                       total_steps=total_steps)
        >>> print(cosine_warmup_restart(ms.Tensor(1)))
        0.0005
        >>> print(cosine_warmup_restart(ms.Tensor(15)))
        0.0024999997
    """

    @args_type_check(
        learning_rate=(int, float), warmup_steps=int, warmup_lr_init=(int, float), warmup_ratio=(int, float),
        total_steps=int, num_cycles=(int, float), lr_end=(int, float)
    )
    def __init__(self, learning_rate: float, warmup_steps: int = None, total_steps: int = None,
                 num_cycles: float = 1., lr_end: float = 0., warmup_lr_init: float = 0.,
                 warmup_ratio: float = None, decay_steps: int = None, **kwargs):
        super(CosineWithRestartsAndWarmUpLR, self).__init__()
        _check_decay_method(decay_steps, total_steps)
        warmup_steps = _get_lr_steps(warmup_steps, warmup_ratio, total_steps, "warmup")
        decay_steps = max(1, decay_steps) \
            if decay_steps is not None else max(1, total_steps)
        # _check_decay_method accepts decay_steps without total_steps; in that case the cosine phase
        # spans up to decay_steps instead of failing on `None - warmup_steps`.
        cosine_horizon = total_steps if total_steps is not None else decay_steps
        cosine_steps = max(1, cosine_horizon - warmup_steps)

        self.kwargs = kwargs
        self.learning_rate = learning_rate
        self.lr_end = Tensor(lr_end, mstype.float32)
        self.warmup_lr_init = warmup_lr_init
        self.warmup_steps = Tensor(warmup_steps, mstype.float32)
        self.cosine_steps = Tensor(cosine_steps, mstype.float32)
        self.decay_steps = Tensor(decay_steps, mstype.float32)
        self.num_cycles = num_cycles
        self.greater = P.Greater()
        self.greater_equal = P.GreaterEqual()
        self.max = P.Maximum()
        self.math_pi = math.pi
        self.cos = P.Cos()
        self.zero_constant = Tensor(0.0, mstype.float32)
        self.one_constant = Tensor(1.0, mstype.float32)
        self.cast = P.Cast()

    def construct(self, global_step):
        """compute current step lr."""
        global_step = self.cast(global_step, mstype.float32)
        if self.greater_equal(global_step, self.decay_steps):
            # Include global_step in computation to circumvent mindspore control flow issues
            return global_step - global_step + self.lr_end
        if self.warmup_steps != 0 and self.greater(self.warmup_steps, global_step):
            # Warm-up phase: linear ramp from warmup_lr_init to learning_rate.
            percent = global_step / self.warmup_steps
            learning_rate = self.warmup_lr_init + (self.learning_rate - self.warmup_lr_init) * percent
        else:
            progress = (global_step - self.warmup_steps) / self.cosine_steps
            if self.greater(self.one_constant, progress):
                # The modulo restarts the cosine at the beginning of every cycle.
                percent = self.max(
                    self.zero_constant,
                    0.5 * (1.0 + self.cos(self.math_pi * ((self.num_cycles * progress) % self.one_constant))))
                learning_rate = self.lr_end + (self.learning_rate - self.lr_end) * percent
            else:
                # Cosine phase is over but decay_steps is not reached yet: stay at lr_end rather than
                # dropping to zero and jumping back to lr_end later.
                return global_step - global_step + self.lr_end
        return learning_rate
@MindFormerRegister.register(MindFormerModuleType.LR)
class PolynomialWithWarmUpLR(LearningRateSchedule):
    r"""
    Polynomial with Warm Up Learning Rate.

    At the beginning of training, the learning rate gradually increases from a lower initial
    value, :math:`\eta_{\text{warmup}}` , to the starting learning rate, :math:`\eta_{\text{start}}` .
    The change in learning rate during the warm-up phase, depending on the step :math:`t` ,
    is described by the following formula:

    .. math::
        \eta_t = \eta_{\text{warmup}} + t \times \frac{\eta_{\text{start}} - \eta_{\text{warmup}}}{\text{warmup_steps}}

    where :math:`\text{warmup_steps}` represents the total number of steps in the warm-up phase.

    After the warm-up phase concludes, the learning rate gradually decays according to a polynomial function,
    reaching the final learning rate, :math:`\eta_{\text{end}}` . The change in learning rate over the total
    number of steps :math:`\text{total_steps}` is given by the formula:

    .. math::
        \eta_t = \eta_{\text{end}} + (\eta_{\text{start}} - \eta_{\text{end}}) \times \left(1
        - \frac{t - \text{warmup_steps}}{\text{decay_steps}}\right)^{\text{power}}

    where :math:`\text{power}` is the exponent of the polynomial, controlling the decay rate. The term in
    parentheses is clamped at zero, so the learning rate does not fall below :math:`\eta_{\text{end}}` once the
    decay steps are used up. After :math:`\text{total_steps}` the learning rate is fixed at
    :math:`\eta_{\text{end}}` .

    This learning rate strategy is well-suited for scenarios where a stable learning rate is needed during
    the early stages of training, with a gradual decrease in the later stages. By preventing gradient explosion
    initially and reducing the learning rate during the latter part of training, it helps the model achieve better
    generalization as it converges.

    Args:
        learning_rate (float): Learning rate after the warm-up phase.
        total_steps (int): The number of total steps.
        warmup_steps (int, optional): The number of warm up steps. Default: ``None``.
        lr_end (float, optional): Final value of learning rate. Default: ``1e-7``.
        power (float, optional): The power of the polynomial. Default: ``1.0``.
        warmup_lr_init (float, optional): Initial learning rate in warm up steps. Default: ``0.``.
        warmup_ratio (float, optional): Ratio of total training steps used for warmup. Default: ``None``.
        decay_steps (int, optional): The number of decay steps, which must be smaller than total_steps - warmup_steps.
            If the value is None, decay steps will be total_steps - warmup_steps. Default: ``None``.

    Inputs:
        - **global_step** (Tensor) - The global step.

    Outputs:
        Learning rate.

    Raises:
        ValueError: If `learning_rate` is not greater than `lr_end`.

    Examples:
        >>> import mindspore as ms
        >>> from mindformers.core import PolynomialWithWarmUpLR
        >>>
        >>> ms.set_context(mode=ms.GRAPH_MODE)
        >>> total_steps = 20
        >>> warmup_steps = 10
        >>> learning_rate = 0.005
        >>> lr_end = 0.0000001
        >>>
        >>> polynomial_warmup = PolynomialWithWarmUpLR(learning_rate=learning_rate,
        ...                                            warmup_steps=warmup_steps,
        ...                                            total_steps=total_steps,
        ...                                            lr_end=lr_end)
        >>> print(polynomial_warmup(ms.Tensor(1)))
        0.0005
        >>> print(polynomial_warmup(ms.Tensor(15)))
        0.0025000498
    """

    @args_type_check(
        learning_rate=(int, float), warmup_steps=int, warmup_lr_init=(int, float), warmup_ratio=(int, float),
        total_steps=int, lr_end=(int, float), power=(int, float)
    )
    def __init__(self, learning_rate: float, total_steps: int, warmup_steps: int = None,
                 lr_end: float = 1e-7, power: float = 1.0, warmup_lr_init: float = 0.,
                 warmup_ratio: float = None, decay_steps: int = None, **kwargs):
        super(PolynomialWithWarmUpLR, self).__init__()
        _check_decay_method(decay_steps, total_steps)
        warmup_steps = _get_lr_steps(warmup_steps, warmup_ratio, total_steps, "warmup")
        decay_steps = max(1, decay_steps) \
            if decay_steps is not None else max(1, total_steps - warmup_steps)
        if not learning_rate > lr_end:
            raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({learning_rate})")

        self.kwargs = kwargs
        self.learning_rate = learning_rate
        self.warmup_lr_init = warmup_lr_init
        self.lr_end = Tensor(lr_end, mstype.float32)
        self.power = power
        self.warmup_steps = Tensor(warmup_steps, mstype.float32)
        self.decay_steps = Tensor(decay_steps, mstype.float32)
        self.total_steps = Tensor(total_steps, mstype.float32)
        self.greater = P.Greater()
        self.max = P.Maximum()
        self.zero_constant = Tensor(0.0, mstype.float32)
        self.cast = P.Cast()

    def construct(self, global_step):
        """compute current step lr."""
        global_step = self.cast(global_step, mstype.float32)
        if self.warmup_steps != 0 and self.greater(self.warmup_steps, global_step):
            # Warm-up phase: linear ramp from warmup_lr_init to learning_rate.
            percent = global_step / self.warmup_steps
            learning_rate = self.warmup_lr_init + (self.learning_rate - self.warmup_lr_init) * percent
        elif self.greater(global_step, self.total_steps):
            # Include global_step in computation to circumvent mindspore control flow issues
            return global_step - global_step + self.lr_end
        else:
            lr_range = self.learning_rate - self.lr_end
            # Clamp at zero: with a decay_steps shorter than total_steps - warmup_steps the remaining
            # fraction would otherwise turn negative and push the rate below lr_end (or to NaN for a
            # fractional power) before total_steps is reached.
            pct_remaining = self.max(
                self.zero_constant, 1 - (global_step - self.warmup_steps) / self.decay_steps)
            decay = lr_range * pct_remaining ** self.power + self.lr_end
            # Kept as factor-times-base to reproduce the original arithmetic exactly.
            percent = decay / self.learning_rate
            learning_rate = self.learning_rate * percent
        return learning_rate
[docs]@MindFormerRegister.register(MindFormerModuleType.LR)
class LearningRateWiseLayer(LearningRateSchedule):
r"""
Learning Rate Wise Layer.
This approach allows each layer to adapt its learning rate according to its specific needs, leading
to more efficient and effective training. The learning rate for each layer is determined by a base
learning rate modulated by a scaling factor specific to that layer.
Initially, the learning rate for each layer is set based on a linear scaling strategy:
.. math::
\eta_{t,l} = \eta_{\text{base}} \times \alpha_l
where :math:`\eta_{t,l}` is the learning rate for layer :math:`l` at time :math:`t` , :math:`\eta_{\text{base}}`
is the base learning rate, and :math:`\alpha_l` is the scaling factor for layer :math:`l` .
As training progresses, the learning rate for each layer is adjusted according to the
following cosine annealing schedule:
.. math::
\eta_{t,l} = \eta_{\text{end}} + \frac{1}{2}(\eta_{\text{base}} \times \alpha_l - \eta_{\text{end}})\left(1
+ \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right)
where :math:`T_{cur}` is the number of epochs completed since the learning rate was last reset,
and :math:`T_{max}` is the total number of epochs before the next reset. :math:`\eta_{\text{end}}` represents
the minimum learning rate at the end of the training.
Args:
base_lr (mindspore.nn.LearningRateSchedule): The base learning rate schedule.
lr_scale (float): The value for learning rate scaling.
Inputs:
- **global_step** (Tensor) - The global step.
Outputs:
Learning rate.
Examples:
>>> import mindspore as ms
>>> from mindformers.core import LinearWithWarmUpLR
>>> from mindformers.core import LearningRateWiseLayer
>>>
>>> ms.set_context(mode=ms.GRAPH_MODE)
>>> total_steps = 20
>>> warmup_steps = 10
>>> learning_rate = 0.005
>>>
>>> linear_warmup = LinearWithWarmUpLR(learning_rate=learning_rate,
... warmup_steps=warmup_steps,
... total_steps=total_steps)
>>> learning_rate_wise_layer = LearningRateWiseLayer(linear_warmup, 0.5)
>>> print(learning_rate_wise_layer(ms.Tensor(1)))
0.00025
>>> print(learning_rate_wise_layer(ms.Tensor(15)))
0.00125
"""
def __init__(self, base_lr, lr_scale):
    """Store the wrapped schedule and the per-layer scaling factor.

    Args:
        base_lr: Callable learning rate schedule that maps a global step to a learning rate.
        lr_scale: Multiplier applied to the value returned by ``base_lr``.
    """
    super(LearningRateWiseLayer, self).__init__()
    self.base_lr, self.lr_scale = base_lr, lr_scale
def construct(self, global_step):
    """Return the base schedule's learning rate at ``global_step`` multiplied by ``lr_scale``."""
    base_value = self.base_lr(global_step)
    scaled_value = self.lr_scale * base_value
    return scaled_value
@MindFormerRegister.register(MindFormerModuleType.LR)
class WarmUpStableDecayLR(LearningRateSchedule):
    r"""
    Warm Up Stable Decay Learning Rate.

    This learning rate scheduler consists of three phases:

    1. **Warm-up Phase**: The learning rate increases linearly from the initial value `warmup_lr_init` to the base
       learning rate `learning_rate`.
    2. **Steady Phase**: The learning rate remains constant at the base value.
    3. **Decay Phase**: The learning rate decreases linearly from `learning_rate` to the final value `lr_end`
       and stays at `lr_end` afterwards.

    **Warm-up Phase Formula**:

    .. math::
        \eta_t = \eta_{\text{warmup}} + t \times \frac{\eta_{\text{base}} -
        \eta_{\text{warmup}}}{\text{warmup_steps}}

    Where:

    * :math:`\eta_{\text{warmup}}` is the initial warm-up learning rate (`warmup_lr_init`)
    * :math:`\eta_{\text{base}}` is the base learning rate (`learning_rate`)
    * :math:`t` is the current step (not exceeding `warmup_steps`)

    **Decay Phase Formula**:

    .. math::
        \eta_t = \eta_{\text{base}} - (\eta_{\text{base}} - \eta_{\text{end}}) \times
        \min\left(1, \frac{t - T_{\text{decay_start}}}{T_{\text{decay_steps}}}\right)

    Where:

    * :math:`\eta_{\text{end}}` is the final learning rate (`lr_end`)
    * :math:`T_{\text{decay_start}}` is the step at which decay begins (`decay_start_steps`)
    * :math:`T_{\text{decay_steps}}` is the total number of decay steps (`total_steps - decay_start_steps`,
      but never less than 1)

    Args:
        learning_rate (float): Learning rate after the warm-up phase.
        lr_end (float, optional): Final value of learning rate. Default: ``1e-7``.
        warmup_steps (int, optional): The number of warm up steps. Default: ``None``.
        warmup_lr_init (float, optional): Initial learning rate in warm up steps. Default: ``0.``.
        warmup_ratio (float, optional): Ratio of total training steps used for warmup. Default: ``None``.
        total_steps (int): The number of total steps. Although the signature defaults to ``None``,
            a value is required because the length of the decay phase is derived from it.
        decay_start_steps (int, optional): The start step of decay. Default: ``None``.
        decay_start_ratio (float, optional): Ratio of total training steps from which
            `decay_start_steps` is derived. Default: ``None``.

    Inputs:
        - **global_step** (Tensor) - The global step.

    Outputs:
        Learning rate.

    Raises:
        TypeError: If `total_steps` is ``None``.
        ValueError: If `lr_end` is greater than or equal to initial `learning_rate`.
    """

    @args_type_check(
        learning_rate=(int, float), warmup_steps=int, warmup_lr_init=(int, float), warmup_ratio=(int, float),
        total_steps=int
    )
    def __init__(self, learning_rate: float, lr_end: float = 1e-7, warmup_steps: int = None, warmup_lr_init: float = 0.,
                 warmup_ratio: float = None, total_steps: int = None, decay_start_steps: int = None,
                 decay_start_ratio: float = None, **kwargs):
        super().__init__()
        warmup_steps = _get_lr_steps(warmup_steps, warmup_ratio, total_steps, "warmup")
        decay_start_steps = _get_lr_steps(decay_start_steps, decay_start_ratio, total_steps, "decay_start")
        if total_steps is None:
            # The decay window is `total_steps - decay_start_steps`; fail with a clear message instead of
            # the opaque "unsupported operand" TypeError that `None - Tensor` would raise.
            raise TypeError("The total_steps must be an int for WarmUpStableDecayLR, but got None")
        if not learning_rate > lr_end:
            raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({learning_rate})")

        # A zero or negative decay window (decay_start_steps >= total_steps) would divide by zero or make
        # the learning rate grow, so the window is floored at a single step.
        decay_steps = max(total_steps - decay_start_steps, 1)

        self.learning_rate = learning_rate
        self.warmup_lr_init = warmup_lr_init
        self.decay_start_steps = Tensor(decay_start_steps, mstype.float32)
        self.warmup_steps = Tensor(warmup_steps, mstype.float32)
        self.decay_steps = Tensor(decay_steps, mstype.float32)
        self.lr_end = Tensor(lr_end, mstype.float32)
        self.one_constant = Tensor(1.0, mstype.float32)
        self.greater = P.Greater()
        self.min = P.Minimum()
        self.kwargs = kwargs

    def construct(self, global_step):
        """compute current step lr."""
        if self.warmup_steps != 0 and self.greater(self.warmup_steps, global_step):
            # Warm-up: linear ramp from warmup_lr_init to learning_rate.
            percent = global_step / self.warmup_steps
            learning_rate = self.warmup_lr_init + (self.learning_rate - self.warmup_lr_init) * percent
        elif self.greater(self.decay_start_steps, global_step):
            # Steady: multiply by a constant one so this branch yields a Tensor like the other branches.
            # (Using `global_step / global_step` for that purpose produces NaN at step 0.)
            learning_rate = self.learning_rate * self.one_constant
        else:
            # Decay: linear descent to lr_end; progress is capped at 1 so the rate never undershoots lr_end.
            percent = self.min(self.one_constant, (global_step - self.decay_start_steps) / self.decay_steps)
            learning_rate = self.learning_rate - (self.learning_rate - self.lr_end) * percent
        return learning_rate
@MindFormerRegister.register(MindFormerModuleType.LR)
class CosineAnnealingLR(LearningRateSchedule):
    r"""
    Cosine Annealing Learning Rate.

    It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_ .
    Note that this only implements the cosine annealing part of SGDR, and not the restarts.

    Set the learning rate of each parameter group using a cosine annealing
    schedule, where :math:`\eta_{max}` is set to the initial lr and
    :math:`T_{cur}` is the number of epochs since the last restart in SGDR:

    .. math::
        \begin{aligned}
            \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
            + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
            & T_{cur} \neq (2k+1)T_{max}; \\
            \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
            \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
            & T_{cur} = (2k+1)T_{max}.
        \end{aligned}

    When last_epoch=-1, sets initial lr as lr. Notice that because the schedule
    is defined recursively, the learning rate can be simultaneously modified
    outside this scheduler by other operators. If the learning rate is set
    solely by this scheduler, the learning rate at each step becomes:

    .. math::
        \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 +
        \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right)

    The closed form above is what `construct` evaluates, with :math:`T_{cur}` equal to `global_step`.
    The step is not clamped at `t_max`, so for larger steps the cosine keeps oscillating.

    Args:
        base_lr (float): Maximum learning rate.
        t_max (int): Maximum number of iterations.
        eta_min (float, optional): Minimum learning rate. Default: ``0.``.

    Inputs:
        - **global_step** (Tensor) - The global step.

    Outputs:
        Learning rate.

    Raises:
        ValueError: If `t_max` is not a positive integer.

    Examples:
        >>> import mindspore as ms
        >>> from mindformers.core import CosineAnnealingLR
        >>>
        >>> ms.set_context(mode=ms.GRAPH_MODE)
        >>> base_lr = 0.005
        >>> t_max = 10
        >>> eta_min = 0.0000001
        >>>
        >>> cosine_annealing = CosineAnnealingLR(base_lr=base_lr, t_max=t_max, eta_min=eta_min)
        >>> print(cosine_annealing(ms.Tensor(1)))
        0.0048776437
        >>> print(cosine_annealing(ms.Tensor(15)))
        0.0025000498
    """

    @args_type_check(base_lr=(int, float), t_max=int, eta_min=(int, float))
    def __init__(self, base_lr: float, t_max: int, eta_min: float = 0., **kwargs):
        super(CosineAnnealingLR, self).__init__()
        # Check the type first so a non-int value reaches the intended ValueError instead of
        # failing inside the `<` comparison.
        if not isinstance(t_max, int) or t_max < 1:
            raise ValueError(f"Expected positive integer T_max, but got {t_max}")
        self.kwargs = kwargs
        self.base_lr = base_lr
        self.t_max = t_max
        self.eta_min = eta_min
        self.math_pi = math.pi
        self.cos = P.Cos()
        self.max = P.Maximum()
        self.zero_constant = Tensor(0.0, mstype.float32)
        self.cast = P.Cast()

    def construct(self, global_step):
        """compute current step lr."""
        global_step = self.cast(global_step, mstype.float32)
        # Fraction of (base_lr - eta_min) still applied; floored at zero.
        percent = self.max(
            self.zero_constant, 0.5 * (1.0 + self.cos(self.math_pi * global_step / self.t_max)))
        learning_rate = self.eta_min + (self.base_lr - self.eta_min) * percent
        return learning_rate
@MindFormerRegister.register(MindFormerModuleType.LR)
class CosineAnnealingWarmRestarts(LearningRateSchedule):
    r"""
    Cosine Annealing Learning Rate with Warm Restarts.

    Set the learning rate of each parameter group using a cosine annealing
    schedule, where :math:`\eta_{max}` is set to the initial lr, :math:`T_{cur}`
    is the number of epochs since the last restart and :math:`T_{i}` is the number
    of epochs between two warm restarts in SGDR:

    .. math::
        \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 +
        \cos\left(\frac{T_{cur}}{T_{i}}\pi\right)\right)

    When :math:`T_{cur}=T_{i}`, set :math:`\eta_t = \eta_{min}`.
    When :math:`T_{cur}=0` after restart, set :math:`\eta_t=\eta_{max}`.

    It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_ .

    Args:
        base_lr (float): Maximum learning rate.
        t_0 (int): Number of iterations for the first restart.
        t_mult (int, optional): A factor increases :math:`T_{i}` after a restart. Default: ``1``.
        eta_min (float, optional): Minimum learning rate. Default: ``0.``.

    Inputs:
        - **global_step** (Tensor) - The global step.

    Outputs:
        Learning rate.

    Raises:
        ValueError: If `t_0` or `t_mult` is not a positive integer.

    Examples:
        >>> import mindspore as ms
        >>> from mindformers.core import CosineAnnealingWarmRestarts
        >>>
        >>> ms.set_context(mode=ms.GRAPH_MODE)
        >>> base_lr = 0.005
        >>> t_0 = 10
        >>> t_mult = 2
        >>> eta_min = 0.0000001
        >>>
        >>> cosine_annealing_restart = CosineAnnealingWarmRestarts(base_lr=base_lr,
        ...                                                        t_0=t_0,
        ...                                                        t_mult=t_mult,
        ...                                                        eta_min=eta_min)
        >>> print(cosine_annealing_restart(ms.Tensor(1)))
        0.0048776437
        >>> print(cosine_annealing_restart(ms.Tensor(15)))
        0.0042677815
    """

    @args_type_check(base_lr=(int, float), t_0=int, t_mult=int, eta_min=(int, float))
    def __init__(self, base_lr: float, t_0: int, t_mult: int = 1, eta_min: float = 0., **kwargs):
        super(CosineAnnealingWarmRestarts, self).__init__()
        # Check the type before comparing so a non-int value reaches the intended ValueError
        # instead of failing inside the `<` comparison.
        if not isinstance(t_0, int) or t_0 < 1:
            raise ValueError(f"Expected positive integer t_0, but got {t_0}")
        if not isinstance(t_mult, int) or t_mult < 1:
            raise ValueError(f"Expected positive integer t_mult, but got {t_mult}")
        self.kwargs = kwargs
        self.base_lr = base_lr
        self.t_0 = t_0
        self.t_mult = t_mult
        self.eta_min = eta_min
        self.math_pi = math.pi
        self.cos = P.Cos()
        self.max = P.Maximum()
        self.zero_constant = Tensor(0.0, mstype.float32)
        self.cast = P.Cast()
        self.floor = P.Floor()
        self.log = P.Log()
        # Evaluates to 0.0 when t_mult == 1; it is only used as a divisor in the t_mult > 1 branch.
        self.log_t_mult = math.log(t_mult)

    def construct(self, global_step):
        """compute current step lr."""
        global_step = self.cast(global_step, mstype.float32)
        if global_step < self.t_0:
            # First cycle: the position inside the cycle is the step itself.
            t_cur = global_step
            percent = self.max(
                self.zero_constant, 0.5 * (1.0 + self.cos(self.math_pi * t_cur / self.t_0)))
        elif self.t_mult == 1:
            # Every cycle has length t_0, so the position is the step modulo t_0.
            t_index = global_step // self.t_0
            t_cur = global_step - t_index * self.t_0
            percent = self.max(
                self.zero_constant, 0.5 * (1.0 + self.cos(self.math_pi * t_cur / self.t_0)))
        else:
            # Cycle n has length t_0 * t_mult**n and starts at t_0 * (t_mult**n - 1) / (t_mult - 1)
            # (geometric series). Solving "start <= global_step" for n gives the cycle index.
            t_index = self.floor(self.log(global_step / self.t_0 * (self.t_mult - 1.0) + 1.0) / self.log_t_mult)
            q_n = self.t_mult ** t_index
            t_start = self.t_0 * (1.0 - q_n) / (1.0 - self.t_mult)
            t_i = self.t_0 * q_n
            t_cur = global_step - t_start
            percent = self.max(
                self.zero_constant, 0.5 * (1.0 + self.cos(self.math_pi * t_cur / t_i)))
        learning_rate = self.eta_min + (self.base_lr - self.eta_min) * percent
        return learning_rate