Source code for mindformers.models.glm2.glm4_tokenizer

# Copyright 2024 Huawei Technologies Co., Ltd
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""ChatGLM4 Tokenizer."""
import base64
import os
from typing import List, Optional, Union, Dict
import tiktoken
import regex as re
from mindformers.mindformer_book import MindFormerBook
from mindformers.models.tokenization_utils import PreTrainedTokenizer, PaddingStrategy, EncodedInput, BatchEncoding
from mindformers.tools.register import MindFormerModuleType, MindFormerRegister
from mindformers.tools.utils import FILE_PERMISSION

__all__ = ['ChatGLM4Tokenizer']


[docs]@MindFormerRegister.register(MindFormerModuleType.TOKENIZER)
class ChatGLM4Tokenizer(PreTrainedTokenizer):
    """
    Construct a ChatGLM4 tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (str): The vocabulary file path.
        clean_up_tokenization_spaces (bool, optional): Whether to delete redundant spaces. Default: ``False``.
        encode_special_tokens (bool, optional): Whether to encode the special tokens. Default: ``False``.
        eos_token (Union[str, tokenizers.AddedToken], optional): The end of sequence token. Default: `"<|endoftext|>"` .
        pad_token (Union[str, tokenizers.AddedToken], optional): A special token used to make arrays of tokens the same
            size for batching purpose. Will then be ignored by attention mechanisms or loss computation.
            Default: `"<|endoftext|>"` .
        **kwargs: Other kwargs that will be passed into the base class of the `Tokenizer`.

    Returns:
        A ChatGLM4Tokenizer instance.

    Examples:
        >>> from mindformers import ChatGLM4Tokenizer
        >>> tokenizer = ChatGLM4Tokenizer('tokenizer.model')
        >>> prompts = ["晚上睡不着应该怎么办"]
        >>> token_id = tokenizer(prompts)
        >>> input_ids = token_id['input_ids']
        >>> print(input_ids)
        [[151331, 151333, 101160, 120410, 99379, 103298]]
        >>> response = tokenizer.decode(input_ids)
        >>> print(response)
        ['晚上睡不着应该怎么办']
    """
    vocab_files_names = {"vocab_file": "tokenizer.model"}
    model_input_names = ["input_ids", "attention_mask", "position_ids"]
    _support_list = MindFormerBook.get_tokenizer_support_list()['glm4']

    def __init__(
            self,
            vocab_file,
            clean_up_tokenization_spaces=False,
            encode_special_tokens=False,
            eos_token='<|endoftext|>',
            pad_token='<|endoftext|>',
            **kwargs
    ):
        self.name = "GLM4Tokenizer"
        self.vocab_file = vocab_file
        pat_str = ("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+"
                   "[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+")
        self.pat_str = re.compile(pat_str)
        self.encode_special_tokens = encode_special_tokens

        mergeable_ranks = {}
        self.special_tokens = {"<|endoftext|>": 151329, "[MASK]": 151330, "[gMASK]": 151331, "[sMASK]": 151332,
                               "<sop>": 151333, "<eop>": 151334, "<|system|>": 151335, "<|user|>": 151336,
                               "<|assistant|>": 151337, "<|observation|>": 151338, "<|begin_of_image|>": 151339,
                               "<|end_of_image|>": 151340, "<|begin_of_video|>": 151341, "<|end_of_video|>": 151342}

        self._eos_token = eos_token
        self._pad_token = pad_token

        with open(vocab_file) as f:
            for line in f:
                token, rank = line.strip().split()
                rank = int(rank)
                token = base64.b64decode(token)
                mergeable_ranks[token] = rank

        self.mergeable_ranks = mergeable_ranks

        self.tokenizer = tiktoken.Encoding(
            name="my_tokenizer",
            pat_str=pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens={}
        )

        self.decoder = {rank: token for token, rank in mergeable_ranks.items()}
        self.n_words = len(self.decoder)

        super().__init__(
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs
        )
        for token in self.special_tokens:
            self.add_tokens(token, special_tokens=True)

    @property
    def vocab_size(self):
        return self.n_words

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str, int]]) -> str:
        """
        Converts a sequence of tokens in a single string.
        """
        text = ""
        temp = b""
        for t in tokens:
            if isinstance(t, int):
                t = chr(t)
            if isinstance(t, str):
                if temp:
                    text += temp.decode("utf-8", errors="replace")
            elif isinstance(t, bytes):
                temp += t
            else:
                raise TypeError("token should only be of type int, bytes or str")
        if temp != "":
            text += temp.decode("utf-8", errors="replace")
        return text

    def _tokenize(self, text, **kwargs):
        tokens = []
        ids = self.tokenizer.encode(text)
        for t in ids:
            tokens.append(self.decoder[t])
        return tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.mergeable_ranks.get(token)

    def convert_special_tokens_to_ids(self, token):
        """ Converts special tokens to ids using the vocab. """
        try:
            return self.special_tokens.get(token)
        except ValueError as e:
            raise ValueError(f"{token} is not a special token for {self.name}") from e

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, "")

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.
        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.
        Returns:
            `Tuple(str)`, Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(save_directory, self.vocab_files_names.get("vocab_file"))
        else:
            vocab_file = save_directory

        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        flags_ = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
        with os.fdopen(os.open(vocab_file, flags_, FILE_PERMISSION), 'wb') as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def get_prefix_tokens(self):
        prefix_tokens = [self.convert_tokens_to_ids("[gMASK]"), self.convert_tokens_to_ids("<sop>")]
        return prefix_tokens

    def build_single_message(self, role, metadata, message, tokenize=True):
        """build single message with role."""
        if role not in ["system", "user", "assistant", "observation"]:
            raise ValueError(f'{role} not in ["system", "user", "assistant", "observation"]')
        if tokenize:
            role_tokens = [self.convert_tokens_to_ids(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n",
                                                                                              disallowed_special=())
            message_tokens = self.tokenizer.encode(message, disallowed_special=())
            tokens = role_tokens + message_tokens
            return tokens
        return str(f"<|{role}|>{metadata}\n{message}")

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:
        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`
        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`, list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [self.convert_tokens_to_ids("<eos>")]
        return token_ids_0

    def _pad(
            self,
            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
            max_length: Optional[int] = None,
            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
            pad_to_multiple_of: Optional[int] = None,
            return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.
                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:
                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
            if "position_ids" in encoded_inputs:
                encoded_inputs["position_ids"] = encoded_inputs["position_ids"] + [0] * difference
            encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference

        return encoded_inputs