# Quantizing Hugging Face models to exl2
This version of my exl2 quantize Colab creates a single quantization to upload privately.\
To calculate an estimate for VRAM size use: [NyxKrage/LLM-Model-VRAM-Calculator](https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator)\
Not all models and architectures are compatible with exl2.

In [None]:
#@title Download and install environment
!git clone https://github.com/turboderp/exllamav2
%cd exllamav2
print("Installing pip dependencies")
!pip install -q -r requirements.txt
!pip install -q huggingface_hub requests tqdm
#@markdown Uses [download-model.py](https://github.com/oobabooga/text-generation-webui/blob/main/download-model.py) by [oobabooga](https://github.com/oobabooga)
!wget https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/download-model.py
model = "none"
dsd = 'false'

In [None]:
#@title Login to HF (Required to upload files)
#@markdown From my Colab/Kaggle login script on [Anthonyg5005/hf-scripts](https://huggingface.co/Anthonyg5005/hf-scripts/blob/main/HF%20Login%20Snippet%20Kaggle.py)
# Authenticate against the Hugging Face Hub, then keep asking for a new token
# until whoami() reports that the active token has the 'write' role.
import os
from huggingface_hub import login, get_token, whoami

# Hosting platform is detected from the environment variables each one sets.
on_kaggle = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
on_colab = 'COLAB_BACKEND_VERSION' in os.environ


def _token_role():
    """Return the role whoami() reports for the active access token, or None if absent."""
    return whoami().get('auth', {}).get('accessToken', {}).get('role', None)


# Kaggle: try the HF_TOKEN notebook secret first.
if on_kaggle:
    from kaggle_secrets import UserSecretsClient
    from kaggle_web_client import BackendError
    try:
        login(UserSecretsClient().get_secret("HF_TOKEN"))
    except BackendError:
        # Secret is missing: explain how to set it, then fall through to the generic login below.
        print('''
            When using Kaggle, make sure to use the secret key HF_TOKEN with a 'WRITE' token.
                   This will prevent the need to login every time you run the script.
                   Set your secrets with the secrets add-on on the top of the screen.
             ''')

# Generic path: reuse a token huggingface_hub already knows about, otherwise prompt for one.
known_token = get_token()
if known_token is None:
    login(input("API token not detected. Enter your HuggingFace (WRITE) token: "))
else:
    login(known_token)

# Uploading needs a WRITE token; loop until one is supplied.
# (Drop this loop if a READ token is enough for your use.)
while _token_role() != 'write':
    if 'HF_TOKEN' in os.environ:
        # A read-only HF_TOKEN environment variable is set: tell the user and stop.
        print('''
          You have the environment variable HF_TOKEN set.
          You cannot log in.
          Either set the environment variable to a 'WRITE' token or remove it.
                  ''')
        input("Press enter to continue.")
        exit()
    if on_colab:
        print('''
                              Your Colab secret key is read-only
                Please switch your key to 'write' or disable notebook access on the left.
                               For now, you are stuck in a loop
                  ''')
    elif on_kaggle:
        print('''
                                      Your Kaggle secret key is read-only
                Please switch your key to 'write' or unattach from notebook in add-ons at the top.
                          Having a read-only key attched will require login every time.
                ''')
    print("You do not have write access to this repository. Please use a valid token with (WRITE) access.")
    login(input("Enter your HuggingFace (WRITE) token: "))

In [None]:
#@title ##Choose HF model to download
#@markdown ###Repo should be formatted as user/repo
#@markdown Weights must be stored in safetensors
if model != "none":
    !rm {model}-{BPW}bpw.zip
    !rm -r {model}-exl2-{BPW}bpw
repo_url = "mistralai/Mistral-7B-Instruct-v0.2" # @param {type:"string"}
model = repo_url.replace("/", "_")
!python download-model.py {repo_url}

In [None]:
#@title Quantize the model
#@markdown ###Takes ~13 minutes to start quantizing first time, then quantization will last based on model size
#@markdown Target bits per weight:
BPW = "4.125" # @param {type:"string"}
!mkdir {model}-exl2-{BPW}bpw-WD
!mkdir {model}-exl2-{BPW}bpw
!cp models/{model}/config.json {model}-exl2-{BPW}bpw-WD
#@markdown Calibrate with dataset, may improve model output (optional):
Calibrate = True # @param {type:"boolean"}
#@markdown Calibration dataset, enable calibrate above (must be parquet file):
if Calibrate == True:
    dataset_url = "https://huggingface.co/datasets/wikitext/resolve/refs%2Fconvert%2Fparquet/wikitext-103-v1/test/0000.parquet?download=true" # @param {type:"string"}
    dataset_url = dataset_url.replace("?download=true", "")
    if dsd == 'false':
        !wget {dataset_url}
        dsd = 'true'
    dataset = dataset_url.split("/")[-1]
#@markdown To use a calibration dataset, enter the huggingface resolve url. Right click the download button and copy the link. Afterwards, paste the link into dataset_url.
#@markdown ![Example Image](https://huggingface.co/Anthonyg5005/hf-scripts/resolve/main/Screenshot%202024-03-17%20011855.png "Copy from download button")
if Calibrate == True:
    quant = f"convert.py -i models/{model} -o {model}-exl2-{BPW}bpw-WD -cf {model}-exl2-{BPW}bpw -c {dataset} -b {BPW}"
else:
    quant = f"convert.py -i models/{model} -o {model}-exl2-{BPW}bpw-WD -cf {model}-exl2-{BPW}bpw -b {BPW}"
!python {quant}

In [None]:
#@title Upload to huggingface privately
#@markdown You may also set it to public but I'd recommend waiting for my next ipynb that will create mutliple quants and place them all into individual branches.
!rm -r {model}-exl2-{BPW}bpw-WD
!rm -r models/{model}
print("Uploading to Huggingface. May take a while")
from huggingface_hub import HfApi, whoami, create_repo
create_repo(f"{whoami().get('name', None)}/{model}-exl2-{BPW}bpw", private=True)
HfApi().upload_folder(folder_path=f"{model}-exl2-{BPW}bpw", repo_id=f"{whoami().get('name', None)}/{model}-exl2-{BPW}bpw", repo_type="model", commit_message="Upload from Colab automation")
print(f"uploaded to {whoami().get('name', None)}/{model}-exl2-{BPW}bpw")