You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Describe the bug
I'm trying to deploy a SageMaker endpoint and it gets stuck in the "Creating" stage indefinitely. Below are my Dockerfile and my training / serving script. The model trains without any issue. Only the Endpoint deployment gets stuck in the "Creating" stage.
To reproduce
Folder structure
|_ code
|    |_ train_serve.py
|_ Dockerfile
Dockerfile
# ##########################################################
# Adapt your container (to work with SageMaker)
# # https://docs.aws.amazon.com/sagemaker/latest/dg/adapt-training-container.html
# # https://hub.docker.com/r/huanjason/scikit-learn/dockerfile

# NOTE(review): REGION is declared before FROM and is not referenced anywhere
# below, so it currently has no effect on the build. Kept so that existing
# `--build-arg REGION=...` invocations keep working.
ARG REGION=us-east-1

FROM python:3.7

# Clean the apt package lists in the same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get -y install gcc \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the layer; a later
# `rm -rf /root/.cache` in its own RUN would not shrink the earlier layer.
# Versions are intentionally unpinned; pins that were tried previously:
#   numpy==1.16.2, scikit-learn==0.20.2, scipy==1.2.1
RUN pip3 install --no-cache-dir \
    numpy \
    scikit-learn \
    pandas \
    scipy \
    mlflow

ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE

# Install sagemaker-training toolkit to enable SageMaker Python SDK
# NOTE(review): this toolkit only provides the *training* entry point. For
# hosting, SageMaker starts the container with the `serve` argument and expects
# an HTTP server answering GET /ping and POST /invocations on port 8080.
# Nothing in this image provides that, which is presumably why the endpoint
# never leaves "Creating" -- TODO confirm against the endpoint's CloudWatch logs.
RUN pip3 install --no-cache-dir sagemaker-training

ENV PATH="/opt/ml/code:${PATH}"

# Copies the training code inside the container
COPY /code /opt/ml/code

# Defines train_serve.py as script entrypoint
ENV SAGEMAKER_PROGRAM=train_serve.py
train_serve.py
# Standard library
import os
import ast
import warnings
import sys
import json
import argparse
from urllib.parse import urlparse
import logging
import pickle

# Third-party
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures

# Configure root logging once at import time so messages emitted through
# either `logging.info(...)` or the module logger are shown at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def eval_metrics(actual, pred):
    """Compute regression error metrics for a set of predictions.

    Args:
        actual: Ground-truth target values.
        pred: Predicted values, aligned element-wise with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root mean squared error, mean absolute
        error and the R^2 score, as computed by ``sklearn.metrics``.
    """
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.

    # Data, model, and output directories
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train-file', type=str, default='kc_house_data_train.csv')
    parser.add_argument('--test-file', type=str, default='kc_house_data_test.csv')
    parser.add_argument('--features', type=str)  # we ask user to explicitly name features
    parser.add_argument('--target', type=str)  # we ask user to explicitly name the target

    # parse_known_args: tolerate extra arguments this script does not declare.
    args, _ = parser.parse_known_args()

    # Fail early with a readable message instead of an AttributeError on
    # `None.split(',')` further down.
    if not args.features or not args.target:
        parser.error('--features (comma-separated column names) and --target are required')

    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Reading training and testing datasets
    logging.info('reading training and testing datasets')
    logging.info(f"{args.train} {args.train_file} {args.test} {args.test_file}")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    feature_names = args.features.split(',')
    logging.info(feature_names)
    logging.info(args.target)

    # Selecting with a list of columns already yields a 2-D
    # (n_samples, n_features) array. Reshaping it to (-1, 1) would flatten
    # several features into one column and no longer match the target length.
    train_x = np.array(train_df[feature_names])
    test_x = np.array(test_df[feature_names])
    train_y = np.array(train_df[args.target]).reshape(-1, 1)
    test_y = np.array(test_df[args.target]).reshape(-1, 1)

    reg = linear_model.LinearRegression()
    reg.fit(train_x, train_y)
    predicted_price = reg.predict(test_x)
    (rmse, mae, r2) = eval_metrics(test_y, predicted_price)

    logging.info(f" Linear model: (features={args.features}, target={args.target})")
    logging.info(f" RMSE: {rmse}")
    logging.info(f" MAE: {mae}")
    logging.info(f" R2: {r2}")

    # Persist the fitted model where model_fn() looks for it at serving time.
    model_path = os.path.join(args.model_dir, "model.pkl")
    logging.info(f"saving to {model_path}")
    logging.info(args.model_dir)
    with open(model_path, 'wb') as model_file:
        pickle.dump(reg, model_file)
def model_fn(model_dir):
    """Load the model that the training run saved as ``model.pkl``.

    Args:
        model_dir: Directory containing the ``model.pkl`` file.

    Returns:
        The unpickled model object.

    Raises:
        OSError: If ``model.pkl`` cannot be opened in ``model_dir``.
    """
    # NOTE: pickle.load can execute arbitrary code from the file; only use
    # this with model artifacts produced by a trusted training job.
    with open(os.path.join(model_dir, "model.pkl"), "rb") as input_model:
        model = pickle.load(input_model)
    return model


def predict_fn(input_object, model):
    """Run ``model.predict`` on ``input_object`` and return its result.

    Args:
        input_object: Input passed unchanged to ``model.predict``.
        model: Object exposing a ``predict`` method, e.g. the value returned
            by ``model_fn``.

    Returns:
        Whatever ``model.predict(input_object)`` returns.
    """
    return model.predict(input_object)
Expected behavior
SageMaker Endpoint should get deployed successfully
Screenshots or logs
System information
A description of your system.
Include the version of SageMaker Training Toolkit you are using.
Describe the bug
I'm trying to deploy a SageMaker endpoint and it gets stuck in the "Creating" stage indefinitely. Below are my Dockerfile and my training / serving script. The model trains without any issue. Only the Endpoint deployment gets stuck in the "Creating" stage.
To reproduce
Folder structure
Dockerfile
train_serve.py
Expected behavior
SageMaker Endpoint should get deployed successfully
Screenshots or logs
System information
A description of your system.
Additional context
Add any other context about the problem here.
The text was updated successfully, but these errors were encountered: