Introduction#
This note (with code on GitHub) shows how to deploy:
- Traditional models on SageMaker
- LLMs on SageMaker
There are many ways to deploy a model on SageMaker, but under the hood these are the main steps:
- Select the container image_uri which runs training or inference
- Prepare the model_data which contains the inference or training code and the trained model.pth
- Configure the endpoint configuration
- Invoke the endpoint
- Customize model_fn, input_fn, predict_fn, output_fn
Create Model#
Let's create a very simple model consisting of only a linear layer, with all weights initialized to ones.
import torch
from torch import nn

def weights_init(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.ones_(m.weight)
        torch.nn.init.zeros_(m.bias)

# create model
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1),
)

# init weights to ones
model.apply(weights_init)

# test input
z = torch.tensor([[1, 2, 3], [1, 2, 3]], dtype=torch.float)
print(model(z))
# expected output [6.0, 6.0]
Model Data#
The model.tar.gz should have the following structure:
model.tar.gz
|--model.pth
|--code
   |--inference.py
   |--requirements.txt
Let's create inference.py, which defines the handler functions that the serving container calls under the hood:
- model_fn: how to load the model
- input_fn: how to parse the input from the request
- predict_fn: how to call the model
- output_fn: how to format the output for the response
import torch
from torch import nn
import json

def weights_init(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.ones_(m.weight)
        torch.nn.init.zeros_(m.bias)

def model_fn(model_dir, context):
    """most simple model"""
    model = torch.nn.Sequential(
        torch.nn.Linear(3, 1),
        torch.nn.Flatten(0, 1),
    )
    # init weights to ones
    model.apply(weights_init)
    return model

def input_fn(request_body, request_content_type, context):
    """parse input"""
    print(request_body)
    try:
        assert request_content_type == "application/json"
        data = json.loads(request_body)["inputs"]
        data = torch.tensor(data, dtype=torch.float, device="cpu")
    except:
        # fixed data for testing
        data = torch.tensor([[1, 2, 3], [1, 2, 3]], dtype=torch.float)
    return data

# inference
def predict_fn(input_object, model, context):
    """simple inference"""
    with torch.no_grad():
        prediction = model(input_object)
    return prediction
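The handler list above also mentions output_fn; the script omits it, so the toolkit's default output handler is used. A minimal sketch of a custom one, assuming JSON responses are enough and the same context-aware signature as the handlers above:

import json

def output_fn(prediction, accept, context):
    """format the prediction tensor as a JSON response (assumes JSON is requested)"""
    assert accept == "application/json"
    return json.dumps(prediction.tolist())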
Deploy Model#
- Create model_data, which is model.tar.gz, and upload it to S3
- Create and register a model with SageMaker
- Create an endpoint
Let's create model_data:
tar cvfz model.tar.gz code/
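The structure above also lists model.pth. Here model_fn rebuilds the toy model in code, so the archive only needs code/; if you want to ship trained weights as well, a sketch assuming the model object from the Create Model section:

import torch

# save the trained weights (assumes `model` from the Create Model section)
torch.save(model.state_dict(), "model.pth")
# then package weights and code together: tar cvfz model.tar.gz model.pth code/
# model_fn would also need to load them, e.g. model.load_state_dict(torch.load(...))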
Upload the model.tar.gz to S3
import sagemaker

sagemaker.s3.S3Uploader.upload("model.tar.gz", f"s3://{bucket}/my-model")
Then create a model
import os
import sagemaker
from sagemaker import image_uris

model_data = f"s3://{bucket}/my-model/model.tar.gz"

image_uri = image_uris.retrieve(
    framework="pytorch",
    region="us-east-1",
    version="2.1.0",
    py_version="py310",
    instance_type="ml.c5.xlarge",
    image_scope="inference",
    # accelerator_type="CPU",
)

model = sagemaker.Model(
    image_uri=image_uri,
    model_data=model_data,
    role=os.environ["ROLE"],
    # entry_point="inference.py",
    # source_dir="code",
)
Then register the model with SageMaker
model.create(
    instance_type="ml.c5.xlarge",
    tags=[{"Key": "Name", "Value": "MyModel"}],
)
Finally, let's deploy an endpoint:
model.deploy(
    instance_type="ml.c5.xlarge",
    initial_instance_count=1,
)
print(model.endpoint_name)
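Deployment takes a few minutes. One way to check that the endpoint is ready before sending requests, a sketch using the boto3 SageMaker client:

import boto3

sm = boto3.client("sagemaker")
# the endpoint should report "InService" before we invoke it
status = sm.describe_endpoint(EndpointName=model.endpoint_name)["EndpointStatus"]
print(status)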
Invoke Endpoint#
We can send requests to the deployed endpoint; different ContentType values are supported, such as application/json or text/csv.
import boto3
import json

sm_client = boto3.client("sagemaker-runtime")

response = sm_client.invoke_endpoint(
    EndpointName=model.endpoint_name,
    ContentType="application/json",
    Body=json.dumps({"inputs": [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]]}),
)
print(response["Body"].read().decode("utf-8"))
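Alternatively, the same endpoint can be invoked through the SageMaker Python SDK instead of boto3; a sketch, assuming the endpoint deployed above and JSON in and out:

import sagemaker
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

predictor = sagemaker.Predictor(
    endpoint_name=model.endpoint_name,
    serializer=JSONSerializer(),      # sends ContentType application/json
    deserializer=JSONDeserializer(),  # parses the JSON response body
)
print(predictor.predict({"inputs": [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]]}))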
JumpStartModel#
Deploying an LLM with JumpStartModel is convenient because the large pretrained model files (several GB) are already staged in S3 and the inference.py is already prepared for us. For example, let's deploy a Mistral-7B model:
import os
from sagemaker.jumpstart.model import JumpStartModel

model = JumpStartModel(
    model_id="huggingface-llm-mistral-7b-instruct",
    role=os.environ["ROLE"],
)

# check the model data, which can be downloaded from S3 to local
print(model.model_data)
print(model.image_uri)

# deploy model
model.deploy()
[!IMPORTANT] Please check the InferenceComponentName of the deployed model before invoking the endpoint.
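One way to find the InferenceComponentName is to list the inference components attached to the endpoint; a sketch using the boto3 ListInferenceComponents API and the endpoint name from the deployment above:

import boto3

sm = boto3.client("sagemaker")
# list the inference components backing the JumpStart endpoint
resp = sm.list_inference_components(
    EndpointNameEquals="jumpstart-dft-hf-llm-mistral-7b-ins-20240119-085114"
)
for component in resp["InferenceComponents"]:
    print(component["InferenceComponentName"])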
Then let's invoke the deployed endpoint:
import boto3
import json

# create sm client
client = boto3.client("runtime.sagemaker")

# format prompt
payload = {
    "inputs": "<s>[INST] How to cook chicken soup? [/INST] ",
    "parameters": {"max_new_tokens": 256, "do_sample": True, "temperature": 0.2},
}

# invoke model
response = client.invoke_endpoint(
    EndpointName="jumpstart-dft-hf-llm-mistral-7b-ins-20240119-085114",
    ContentType="application/json",
    Body=json.dumps(payload).encode("utf-8"),
    CustomAttributes="accept_eula=true",
    InferenceComponentName="jumpstart-dft-hf-llm-mistral-7b-ins-20240119-0-20240119-0851320",
)

# parse response
response = response["Body"].read().decode("utf8")
response = json.loads(response)
print(response[0]["generated_text"])