Introduction#

GitHub: this note shows the basic concepts and applications of SageMaker Feature Store:

  • Feature group
  • Ingest dataframe into a feature group
  • Feature group, Glue Catalog and Athena Query
  • AmazonSageMakerFeatureStoreAccess policy

Create a Feature Group#

from sagemaker.feature_store.feature_group import FeatureGroup
# Python-side handles for the two feature groups. The groups themselves are
# created by the later create() calls on these objects.
customers_feature_group = FeatureGroup(
    name=customers_feature_group_name, sagemaker_session=sagemaker_session
)
orders_feature_group = FeatureGroup(
    name=orders_feature_group_name, sagemaker_session=sagemaker_session
)

Load the feature definitions for the feature groups from the dataframes

# Derive each group's feature definitions from its dataframe.
customers_feature_group.load_feature_definitions(
    data_frame=customer_data,
)
orders_feature_group.load_feature_definitions(
    data_frame=orders_data,
)

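Create the feature groups (a dataframe can be ingested with `feature_group.ingest(data_frame=...)` only after the group has been created)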

import time

# Register both feature groups with SageMaker Feature Store. The two groups
# share the same S3 location, record-identifier feature, event-time feature
# and IAM role, and both have the online store enabled.
customers_feature_group.create(
    s3_uri=f"s3://{s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name="EventTime",
    role_arn=role,
    enable_online_store=True,
)
orders_feature_group.create(
    s3_uri=f"s3://{s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name="EventTime",
    role_arn=role,
    enable_online_store=True,
)

# create() returns while the group is still being provisioned. Later calls
# (metadata updates, ingestion, record reads) need the status to be "Created",
# so poll until each group has left the "Creating" state.
for pending_group in (customers_feature_group, orders_feature_group):
    group_status = pending_group.describe().get("FeatureGroupStatus")
    while group_status == "Creating":
        time.sleep(5)
        group_status = pending_group.describe().get("FeatureGroupStatus")
    if group_status != "Created":
        # Fail loudly instead of letting a later call hit a broken group.
        raise RuntimeError(
            f"Feature group {pending_group.name} ended in status {group_status}"
        )

Add Metadata

from sagemaker.feature_store.inputs import FeatureParameter
# Attach a description and a custom key/value parameter to the customer_id
# feature of the customers feature group.
customer_id_parameter = FeatureParameter("idType", "primaryKey")
customers_feature_group.update_feature_metadata(
    feature_name="customer_id",
    description="The ID of a customer. It is also used in orders_feature_group.",
    parameter_additions=[customer_id_parameter],
)

# Read the metadata back for the same feature.
customers_feature_group.describe_feature_metadata(feature_name="customer_id")

Feature Store API#

Case 1) Use the Feature Store API to list existing feature groups

from sagemaker.feature_store.feature_store import FeatureStore
from sagemaker.feature_store.inputs import Identifier
# Feature Store client bound to the current SageMaker session.
feature_store = FeatureStore(sagemaker_session=sagemaker_session)

# List the existing feature groups.
feature_store.list_feature_groups()

Case 2) Get Batch Records

# Record identifiers to fetch, passed as strings.
customer_record_ids = ["573291", "109382", "828400", "124013"]

# Fetch the listed records from the customers feature group in one call.
feature_store.batch_get_record(
    identifiers=[
        Identifier(
            feature_group_name="customers-feature-group-28-03-41-44",
            record_identifiers_value_as_string=customer_record_ids,
        )
    ]
)

Case 3) Athena Query

from sagemaker.feature_store.feature_group import AthenaQuery
# Glue/Athena table for the customers feature group. It is named once here so
# the AthenaQuery target and the SQL text cannot drift apart.
customers_table_name = "customers_feature_group_28_03_41_44_1685245305"

customer_query = AthenaQuery(
    sagemaker_session=sagemaker_session,
    catalog="AwsDataCatalog",
    database="sagemaker_featurestore",
    table_name=customers_table_name,
)

# run() only submits the query; the results are written under output_location.
customer_query.run(
    f"select * from {customers_table_name}",
    output_location=f"s3://{s3_bucket_name}/notebook-athena-result/",
)

# Block until Athena has finished. as_dataframe() raises RuntimeError if the
# query is still queued or running.
customer_query.wait()

customer_df = customer_query.as_dataframe()
customer_df.head(10)

References#