Quantization
llms
September 1st, 2023: run on a Colab T4 GPU runtime
Overview¶
The goal of this notebook is to take a deeper look at quantization. In particular, I load the OPT-350m model under different quantization schemes and inspect 1) the size of the model and 2) the weight data types and values.
Summary¶
The code shows the following results:
- Full model: 1,325 MB of memory; the weights are stored as fp32.
- 8-bit: 359 MB (~3.7x smaller); the weights are stored as int8, with range -128 to 127.
- 4-bit: 208 MB (~6.4x smaller); the weights are stored as uint8, wrapped in a Params4bit class.
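The reduction factors follow directly from the measured footprints (a quick check, not part of the original notebook runs):

print(1325 / 359)   # ≈ 3.7x smaller for 8-bit
print(1325 / 208)   # ≈ 6.4x smaller for 4-bit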
Open Questions¶
- Why does the shape of the weights in the 4-bit Linear4bit layer change?
- While the parameter weights are stored like this, compute/inference occurs in the dtype specified by the bnb_4bit_compute_dtype parameter (see the sketch after this list). How does this impact speed-up?
- Why don't CPU devices support 8-bit cores?
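Regarding the compute dtype: a minimal sketch (not run in this notebook) of how it is set with BitsAndBytesConfig. The weights stay in 4-bit storage, while matrix multiplications are carried out in the chosen dtype:

import torch
from transformers import BitsAndBytesConfig

# 4-bit storage, matmuls performed in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)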
Notes¶
- 1 byte: 8 bits
- FP32: 32 bits, 4 bytes, full-precision
- Float16: 16 bits, 2 bytes, half-precision
- bfloat16: 16 bits, 2 bytes, 'brain float', half-precision
- int8: 8 bits, 1 byte, 256 values (range -128 to 127)
- uint8: 8 bits, 1 byte, 256 values (range 0 to 255)
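These widths and ranges can be confirmed directly in PyTorch (a quick aside, not part of the original runs):

import torch

print(torch.finfo(torch.float32))   # 32-bit, full-precision
print(torch.finfo(torch.float16))   # 16-bit, half-precision
print(torch.finfo(torch.bfloat16))  # 16-bit, 'brain float'
print(torch.iinfo(torch.int8))      # min=-128, max=127
print(torch.iinfo(torch.uint8))     # min=0, max=255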
Code¶
Imports¶
In [ ]:
!pip install -q -U transformers
!pip install -q -U bitsandbytes
!pip install -q -U accelerate
In [ ]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
Function to load the model, report its memory footprint, and return sample weights
In [ ]:
def get_model_info(bnb_config):
    model_id = "facebook/opt-350m"
    # Load OPT-350m with the given quantization config (None = full precision)
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', quantization_config=bnb_config)
    #print(model)
    print(f"Model memory: {model.get_memory_footprint() / 1e6:,.0f} MB")
    # Inspect one weight matrix: the decoder's input projection
    layer = model.get_parameter('model.decoder.project_in.weight')
    print(f'Layer: {layer}')
    print(f"Model weight shape: {layer.shape}")
    print(f"Model weight dtype: {layer.dtype}")
    w = layer.detach().cpu().numpy()
    return w
Import Full Model¶
In [ ]:
wf = get_model_info(bnb_config=None)
wf[0][:5]
Model memory: 1,325 MB
Layer: Parameter containing:
tensor([[ 0.1122, -0.0844, -0.0203,  ...,  0.0970,  0.0074,  0.0431],
        [-0.0696, -0.0037, -0.0627,  ...,  0.0359, -0.0157,  0.0105],
        [-0.0268,  0.0077,  0.0630,  ..., -0.0326,  0.0033, -0.0459],
        ...,
        [ 0.0150, -0.0346, -0.0784,  ...,  0.0442,  0.0326,  0.0418],
        [ 0.0468, -0.0705,  0.0620,  ...,  0.0169,  0.0159,  0.0397],
        [-0.0149,  0.0487,  0.0774,  ...,  0.0274, -0.0091, -0.0626]],
       device='cuda:0', requires_grad=True)
Model weight shape: torch.Size([1024, 512])
Model weight dtype: torch.float32
Out[ ]:
array([ 0.11218262, -0.08435059, -0.02027893, 0.02336121, 0.01412964], dtype=float32)
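As a rough cross-check (an inference from the printed footprint, not something the notebook computes): at 4 bytes per fp32 weight, 1,325 MB implies roughly 331M parameters, the expected ballpark for a "350m" model.

print(1325e6 / 4 / 1e6)   # ≈ 331 million parameters implied by the fp32 footprint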
Import 8-bit¶
In [ ]:
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
w8b = get_model_info(bnb_config=bnb_config)
w8b[0][:5]
Model memory: 359 MB
Layer: Parameter containing:
Parameter(Int8Params([[114, -86, -21,  ...,  99,   8,  44],
            [-71,  -4, -64,  ...,  37, -16,  11],
            [-27,   8,  64,  ..., -33,   3, -47],
            ...,
            [ 15, -35, -80,  ...,  45,  33,  42],
            [ 47, -71,  63,  ...,  17,  16,  40],
            [-15,  50,  79,  ...,  28,  -9, -64]],
           device='cuda:0', dtype=torch.int8))
Model weight shape: torch.Size([1024, 512])
Model weight dtype: torch.int8
Out[ ]:
array([114, -86, -21, 24, 14], dtype=int8)
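The int8 values line up with the fp32 values from the full-precision run under row-wise absmax quantization, which is my reading of the LLM.int8() scheme (outlier handling aside). A small sketch using the arrays returned above:

import numpy as np

# Map the row's largest magnitude to 127, then round
row = wf[0]                          # first row of the full-precision weights
scale = 127.0 / np.abs(row).max()
print(np.round(row[:5] * scale))     # close to [114, -86, -21, 24, 14] above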
Import 4-bit & FP4¶
In [ ]:
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=False, bnb_4bit_quant_type="fp4")
w4b = get_model_info(bnb_config=bnb_config)
Model memory: 208 MB
Layer: Parameter containing:
Parameter(Params4bit([[ 58],
            [230],
            [100],
            ...,
            [220],
            [ 39],
            [154]], device='cuda:0', dtype=torch.uint8))
Model weight shape: torch.Size([262144, 1])
Model weight dtype: torch.uint8
In [ ]:
w4b[:5]
Out[ ]:
array([[ 58],
       [230],
       [100],
       [236],
       [ 91]], dtype=uint8)
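A plausible answer to the shape question in Open Questions (my reading, not verified against the bitsandbytes source): each uint8 byte packs two 4-bit values, so the 1024 x 512 matrix collapses into a flat buffer of half as many bytes.

print(1024 * 512 // 2)   # 262144, matching the Params4bit shape [262144, 1]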
Import 4-bit & NF4¶
In [ ]:
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=False, bnb_4bit_quant_type="nf4")
w4bnf4 = get_model_info(bnb_config=bnb_config)
Model memory: 208 MB
Layer: Parameter containing:
Parameter(Params4bit([[241],
            [ 90],
            [155],
            ...,
            [ 36],
            [234],
            [ 98]], device='cuda:0', dtype=torch.uint8))
Model weight shape: torch.Size([262144, 1])
Model weight dtype: torch.uint8
4-bit double quant¶
To enable nested quantization, use the bnb_4bit_use_double_quant argument in BitsAndBytesConfig. This applies a second quantization after the first one (quantizing the quantization constants), saving an additional 0.4 bits per parameter.
In [ ]:
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True)
w4b_dq = get_model_info(bnb_config=bnb_config)
Model memory: 208 MB
Layer: Parameter containing:
Parameter(Params4bit([[ 58],
            [230],
            [100],
            ...,
            [220],
            [ 39],
            [154]], device='cuda:0', dtype=torch.uint8))
Model weight shape: torch.Size([262144, 1])
Model weight dtype: torch.uint8