File size: 14,344 Bytes

01f4d5b

import torch

import types
from typing import Any, Dict, List, Optional, Tuple, Union
import transformers
from transformers import Cache, GenerationConfig
import torch.nn as nn
from transformers.modeling_utils import  PreTrainedModel

from .functions_2_patch import _validate_model_kwargs, llama_atten_forward
from .monkey_patching_utils import monkey_patching
from .sep_cache_utils import SepCache


UNSUPPORTED_GENERATION_ARGS = [
    "cache_implementation",  # cache-related arguments, here we always use SepCache
    "cache_config",
    "return_legacy_cache",
    "num_beams",  # beam search (and cousin techniques) are not supported
    "compile_config",  # SepCache doesn't support torch.compile
    "assistant_model",  # it also doesn't support speculative decoding
]


def generate(model,                          
            ## For SepCache                              
            init_cache_size: Union[int, List] = 4,        
            sep_cache_size: Union[int, List] = 128,
            local_size: Union[int, List]=256, 
            cache_size: Union[int, List]=512,    
            SEP_ACCUMULATION: bool = True,
            USE_MAX_SEP_CACHE: bool = False,
            SEP_PADDING_IN_BATCH: bool = False,
            separator_token_ids: List[int] = None, ## required for initialization if `model_type` is not provided.
            PADDING_ID: int = None, ## required for initialization if `model_type` is not provided.

            ## For inheritance & initialization states
            past_tok_ids: List[torch.Tensor] = None,  ## It saves all the token ids corresponding to the saved KVs for all layers in SepCache.                
            key_cache: List[torch.Tensor] = None,          
            value_cache: List[torch.Tensor] = None,

            ## For debugging
            PRINT_KV_RATIO_INSIDE: bool = False,
            print_KV_inside_per_steps: int = 1000,   
            _seen_tokens: int = 0, 
            _kept_kv_ratio: List[Tuple[int]] = None,
            
            ### For positional encoding shifting
            APPLY_PE_SHIFT: bool = False,
            APPLY_PES_INSIDE: bool = False,
            _shifted_position_ids:  List[torch.Tensor] = None,
            _rope_unsqueeze_dim: int = 1, ## The unsqueeze_dim when applying RoPE.
            _rope_seq_dim: int=1, ## The seq_len dimension for the `cos` or `sin` tensors.
            pe_scaling_factor:float = 1.0,
            pe_dim:int=128, ## The number of dims for positional encoding. Typically, just set the `head_dim` to this.
            max_position_embeddings: int = 8192, 
            base: int=10000,  ## The base for RoPE.               
            
            ## For basic transformer architecture
            k_seq_dim: int=2, ## The dimension for seq_len in key tensors
            v_seq_dim: int=2, ## The dimension for seq_len in value tensors
            layer_num: int = None, ## required for initialization

            model_type: str = 'llama',  ## The model type for running the example. choose from ['llama', 'pythia','falcon'].
            device = None,   

            ## For verbosity of monkey patching
            monkey_patch_verbose: bool = False,

             **kwargs
             ):
    """Custom generate function for SepCache.

    A cache as described in the [SepLLM paper - ICML 2025](https://arxiv.org/abs/2412.12094). In the training phase, 
    SepLLM condenses the segment information into the KV of the separator that divides the segment. In the inference phase, the 
    corresponding SepCache only needs to store the KVs of initial tokens, separator tokens, and recent tokens for generation.

    It stores the Key and Value states as lists of tensors, two lists for each layer. The expected shape for each tensor is
    `[batch_size, num_heads, seq_len, head_dim]`.

    Frequently-Used Parameters:

        `init_cache_size: Union[int, List]`:
            The maximum number of KVs to be stored for initial tokens.
            In the paper, the hyperparameter `a` is an abbreviated alias for `self.init_cache_size`.                
                
        `sep_cache_size: Union[int, List]`:
            The maximum number of KVs to be stored for separator tokens.
            In the paper, the hyperparameter `s` is an abbreviated alias for `self.sep_cache_size`.

        `local_size: Union[int, List]`: 
            The maximum number of KVs to be stored for local tokens (i.e., sliding window).
            In the paper, the hyperparameter `w` is an abbreviated alias for `self.local_size`.

        `cache_size: Union[int, List]`:    
            The maximum number of KVs to be stored for all the tokens, i.e., the size for the whole KV cache.  
            In the paper, the hyperparameter `c` is an abbreviated alias for `self.cache_size`.

        Concerning these four parameters above:
            When a list is passed (its length must be `layer_num`), it represents different values for each layer. 
            When an integer is passed, it means the setting is the same for all layers.
        
        
        `USE_MAX_SEP_CACHE: bool`: 
            If True, it means we only keep at most `self.sep_cache_size` seperators' KVs.  
            If the number exceeds this limit, older separator's KVs will be discarded, keeping only the most recent `self.sep_cache_size` KVs. 
            In the paper, the hyperparameter `s` is an abbreviated alias for `self.sep_cache_size`.
          
        `separator_token_ids: List[int]`:
            The token ids of the separator tokens for the current model's tokenizer.            
            We have some examples, such as the Llama-3 series models, where setting `model_type='llama'` allows you 
                to skip setting `separator_token_ids` and `PADDING_ID` (SepCache will auto-fill them).

        `PADDING_ID: int`:
            The token id of the padding token. You can just set `PADDING_ID` to the id of "<|endoftext|>" token of the tokenizer for the pretrained model.

    Important Note:
        When `cache_size` and `local_size` are set to infinity (i.e., sufficiently large positive integers), and `USE_MAX_SEP_CACHE` is `False`, `SepCache` degenerates into a regular Cache. 
        However, you must always ensure that `init_cache_size` + `sep_cache_size` + `local_size` + `left_padding_offset` < `cache_size`. 
        Here, `left_padding_offset` denotes the number of padding tokens in the record with the largest left paddings within a runtime batch. `left_padding_offset` can only be determined at runtime.        
        To guarantee the above inequality always holds during runtime, when setting, you can intentionally create a sufficient margin between both sides of the following inequality:
            `init_cache_size` + `sep_cache_size` + `local_size`  < `cache_size`, i.e., `a`+`s`+`w`<`c` in the [SepLLM paper - ICML 2025]
            to leave room for `left_padding_offset`.

        Please refer to the `__init__` function's comments for more details on the parameters.
            
    Example:

        ```python        
        >>> from transformers import AutoTokenizer, AutoModelForCausalLM, 
        >>> from .sep_cache_utils import SepCache
        >>> import torch
        >>> from huggingface_hub import login
        >>> login("hf_xxxXXXxxx")


        >>> def to_cuda(a_dict: dict) -> dict:
        >>>    new_dict = {}    
        >>>    for k,v in a_dict.items():
        >>>        if isinstance(v, torch.Tensor):
        >>>            new_dict[k] = v.cuda()
        >>>        else:
        >>>            new_dict[k] = v
        >>>    return new_dict

        >>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", attn_implementation="flash_attention_2", device_map="cuda:0")
        >>> model.bfloat16().cuda()
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
        >>> inputs = tokenizer(text="My name is Llama 3", return_tensors="pt")
        >>> inputs = to_cuda(inputs)
        >>> # Prepare a cache and pass it to model's forward; `layer_num` is the number of layers for the pretrained model.
        >>> past_key_values = SepCache(init_cache_size=4, sep_cache_size=128, local_size=256, cache_size=512, layer_num=32, USE_MAX_SEP_CACHE=True, model_type='llama')
        >>> # `separator_token_ids` and `PADDING_ID` must also be provided if you are not using `model_type='llama'` like this demo.
        >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
        >>> outputs.past_key_values # access SepCache filled with keys/values
        SepCache()
        ```

        ```python
        >>> ## When using the `update` function of SepCache to update the keys/values and the past token ids (necessary in SepCache), the current `input_ids` must also be provided.        
        >>> key_states, value_states = past_key_values.update(                
                    key_states = key_states,
                    value_states = value_states,    
                    input_ids = input_ids,
                    layer_idx = layer_idx,     
                    PREFILLING_FLAG = q_len > 1, ## `q_len` is the sequence length of the current `query_states`
                    )

        ```
        For detailed usage instructions, please refer to https://github.com/HKUDS/SepLLM
    """

    # 0. Monkey Patching for the `update` function of `SepCache`    
    model_layers = monkey_patching(model,  model_atten_forward=llama_atten_forward, verbose=monkey_patch_verbose)

    # 1. General sanity checks
    # 1.a. A few arguments are not allowed, especially arguments that control caches.
    generation_config = kwargs.get("generation_config")
    default_global_generation_config = GenerationConfig()
    default_model_generation_config = model.generation_config
    for arg in UNSUPPORTED_GENERATION_ARGS:
        has_custom_gen_config_arg = (
            generation_config is not None
            # = and not (match global default or match model-specific default)
            and not (
                getattr(default_model_generation_config, arg) == getattr(generation_config, arg)
                or getattr(default_global_generation_config, arg) == getattr(generation_config, arg)
            )
        )
        kwargs_has_arg = arg in kwargs and kwargs[arg] is not None
        if kwargs_has_arg or has_custom_gen_config_arg:
            raise ValueError(
                f"`{arg}` is set, but it's not supported in this custom generate function. List of "
                f"unsupported arguments: {UNSUPPORTED_GENERATION_ARGS}"
            )

    
    
    # 1.b. The model must be decoder-only
    if model.config.is_encoder_decoder:
        raise ValueError("This custom generate function only works with decoder-only models")

    # 1.c. compatibility with transformers 4.52: we must pop `custom_generate` from kwargs, otherwise it will result
    # in an infinite loop when we call `model.generate`. This is solved in transformers 4.53.
    kwargs.pop("custom_generate", None)


    sepllm_kwargs = {}
    sepllm_kwargs["input_ids"] = kwargs["input_ids"] ## `input_ids` must be passed to the `update` function of `SepCache`
    kwargs["sepllm_kwargs"] = sepllm_kwargs

    # 2. Generate with SepCache
    # 2.a. prepare the cache, if it was not passed.
    past_key_values = kwargs.pop("past_key_values", None)
    if past_key_values is None:
        past_key_values = SepCache(                     
            ## For SepCache                              
            init_cache_size = init_cache_size,        
            sep_cache_size = sep_cache_size,
            local_size = local_size, 
            cache_size = cache_size,    
            SEP_ACCUMULATION = SEP_ACCUMULATION,
            USE_MAX_SEP_CACHE = USE_MAX_SEP_CACHE,
            SEP_PADDING_IN_BATCH = SEP_PADDING_IN_BATCH,
            separator_token_ids = separator_token_ids, ## required for initialization if `model_type` is not provided.
            PADDING_ID = PADDING_ID, ## required for initialization if `model_type` is not provided.

            ## For inheritance & initialization states
            past_tok_ids = past_tok_ids,  ## It saves all the token ids corresponding to the saved KVs for all layers in SepCache.                
            key_cache = key_cache,          
            value_cache = value_cache,

            ## For debugging
            PRINT_KV_RATIO_INSIDE = PRINT_KV_RATIO_INSIDE,
            print_KV_inside_per_steps = print_KV_inside_per_steps,   
            _seen_tokens = _seen_tokens, 
            _kept_kv_ratio = _kept_kv_ratio,
            
            ### For positional encoding shifting
            APPLY_PE_SHIFT = APPLY_PE_SHIFT,
            APPLY_PES_INSIDE = APPLY_PES_INSIDE,
            _shifted_position_ids = _shifted_position_ids,
            _rope_unsqueeze_dim = _rope_unsqueeze_dim, ## The unsqueeze_dim when applying RoPE.
            _rope_seq_dim =_rope_seq_dim, ## The seq_len dimension for the `cos` or `sin` tensors.
            pe_scaling_factor = pe_scaling_factor,
            pe_dim = pe_dim, ## The number of dims for positional encoding. Typically, just set the `head_dim` to this, i.e., model.config.hidden_size // model.config.num_attention_heads
            max_position_embeddings = max_position_embeddings, # i.e.,  model.config.max_position_embeddings                               
            base = base,  ## The base for RoPE.               
            
            ## For basic transformer architecture
            k_seq_dim = k_seq_dim, ## The dimension for seq_len in key tensors
            v_seq_dim = v_seq_dim, ## The dimension for seq_len in value tensors
            layer_num = len(model_layers), ## required for initialization. model.config.num_hidden_layers

            model_type = model_type,  ## The model type for running the example. choose from ['llama', 'pythia','falcon'].
            device = device,    
            )

    elif not isinstance(past_key_values, SepCache):
        raise ValueError(f"`past_key_values` must be a `SepCache` instance, got a {type(past_key_values)} instance")

    # 2.b. generate with the cache
    kwargs["use_cache"] = True
    generation_outputs = model.generate(**kwargs, past_key_values=past_key_values)
    return generation_outputs