Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jupyterlab (strict mode) Marathon-lb #44

Open
redpine42 opened this issue Dec 18, 2018 · 8 comments
Open

Jupyterlab (strict mode) Marathon-lb #44

redpine42 opened this issue Dec 18, 2018 · 8 comments

Comments

@redpine42
Copy link

I'm running DCOS 1.11.6, with Jupyterlab 1.2.0-DEV.33.7 and Marathon-lb 1.12.3. DCOS is configured for strict mode, with necessary secrets and service accounts configured. Marathon-lb is pointing to the container port 8080 and not the vip port 8888. Health checks are failing and Jupyterlab is unreachable through marathon-lb. All the other Jupyterlab containers are showing healthy in Marathon-lb (same container and vip ports in the marathon.json.mustache file). In the past I fixed this by setting the vip and container port to the same value (8080), which seems to be the convention with Marathon-lb.

@vishnu2kmohan
Copy link
Collaborator

@redpine42 Have you looked at trying to (modify to suit and) use:

I use those ^^^ all the time for testing and they work just fine with DC/OS in strict mode and Marathon-LB (also in its own strict mode).

We have some changes planned to make the configuration of the Jupyter Service a bit more intuitive, at which time I'll be putting those changes through the paces of strict mode(s).

@vishnu2kmohan
Copy link
Collaborator

@redpine42 We've released a new version 1.3.0-0.35.4 of the service and it's now renamed to beta-mesosphere-jupyter-service in the DC/OS Catalog

Please give this new version a spin and let me know if it resolves your issues.

@redpine42
Copy link
Author

Thanks, but I'm still having problems with marathon-lb showing it down. I've tried both from the Universe catalog and from a marathon file. Below is my marathon JSON.

{
  "service": {
    "name": "/jupyter",
    "cpus": 18,
    "mem": 61440,
    "gpu": {
      "enabled": true,
      "gpus": 2
    },
    "jupyter_password": "jupyter",
    "jupyter_conf_urls": "",
    "service_account": "jupyter",
    "service_account_secret": "jupyter/sa",
    "placement_constraints": "[]",
    "user": "nobody",
    "cmd": "/usr/local/bin/start.sh ${CONDA_DIR}/bin/jupyter lab --notebook-dir=\"${MESOS_SANDBOX}\"",
    "log_level": "INFO"
  },
  "networking": {
    "cni_network_enabled": true,
    "cni_network_name": "dcos",
    "cni_network_plugin_labels": "",
    "ingress": {
      "enabled": true,
      "hostname": ""
    }
  },
  "storage": {
    "local_persistence": {
      "enabled": true,
      "volume_size": 100000,
      "volume_path": "jupyter_data"
    },
    "s3": {
      "aws_region": "us-east-1",
      "endpoint": "jupyterlab-data",
      "use_https": true,
      "verify_ssl": true
    }
  },
  "oidc": {
    "enabled": false,
    "discovery_uri": "https://keycloak.contoso.com/auth/realms/notebook/.well-known/openid-configuration",
    "client_id": "notebook",
    "client_secret": "",
    "scope": "openid profile email",
    "authorization_params": "",
    "authorized_email": "",
    "authorized_upn": "",
    "redirect_after_logout_uri": "",
    "post_logout_redirect_uri": "",
    "tls_verify": false,
    "redirect_uri": "/oidc-redirect-callback",
    "logout_path": "/logmeout",
    "token_endpoint_auth_method": "client_secret_basic",
    "use_spartan_resolver": true
  },
  "spark": {
    "spark_master_url": "mesos://zk://zk-1.zk:2181,zk-2.zk:2181,zk-3.zk:2181,zk-4.zk:2181,zk-5.zk:2181/mesos",
    "spark_conf_cores_max": 5,
    "spark_driver_cores": 2,
    "spark_conf_executor_cores": 1,
    "spark_conf_mesos_gpus_max": 0,
    "spark_conf_mesos_executor_gpus": 0,
    "spark_driver_memory": "6g",
    "spark_conf_executor_memory": "6g",
    "spark_conf_eventlog_enabled": true,
    "spark_conf_eventlog_dir": "/mnt/mesos/sandbox/",
    "start_spark_history_server": true,
    "spark_history_fs_logdirectory": "/mnt/mesos/sandbox/",
    "spark_conf_jars_packages": "",
    "spark_conf_mesos_principal": "",
    "spark_conf_mesos_role": "",
    "spark_conf_mesos_driver_labels": "",
    "spark_conf_mesos_task_labels": "",
    "spark_conf_executor_krb5_config": "/mnt/mesos/sandbox/krb5.conf",
    "spark_conf_spark_scheduler_min_registered_resources_ratio": 1,
    "spark_conf_mesos_containerizer": "mesos",
    "spark_conf_hadoop_fs_s3a_aws_credentials_provider": "com.amazonaws.auth.InstanceProfileCredentialsProvider",
    "spark_driver_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "spark_conf_executor_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "spark_conf_mesos_executor_home": "/opt/spark",
    "spark_conf_executor_java_home": "/opt/jdk",
    "spark_conf_executor_hadoop_hdfs_home": "/opt/hadoop",
    "spark_conf_executor_hadoop_opts": "-Djava.library.path=/opt/hadoop/lib/native -Djava.security.krb5.conf=/mnt/mesos/sandbox/krb5.conf",
    "spark_conf_mesos_executor_docker_forcepullimage": false,
    "spark_user": "nobody"
  },
  "advanced": {
    "force_pull_jupyter_image": false,
    "force_pull_worker_image": false,
    "home": "/mnt/mesos/sandbox",
    "sandbox": "/mnt/mesos/sandbox",
    "hadoop_conf_dir": "/mnt/mesos/sandbox",
    "jupyter_config_dir": "/mnt/mesos/sandbox/.jupyter",
    "jupyter_runtime_dir": "/mnt/mesos/sandbox/.local/share/jupyter/runtime",
    "conda_envs_path": "/mnt/mesos/sandbox/conda/envs:/opt/conda/envs",
    "conda_pkgs_dir": "/mnt/mesos/sandbox/conda/pkgs:/opt/conda/pkgs",
    "dcos_dir": "/mnt/mesos/sandbox/.dcos",
    "java_opts": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "nginx_log_level": "warn",
    "spark_monitor_enabled": true,
    "start_dask_distributed": false,
    "start_ray_head_node": false,
    "start_tensorboard": true,
    "tensorboard_logdir": "/mnt/mesos/sandbox",
    "term": "xterm-256color"
  },
  "healthChecks": [
    {
      "portIndex": 0,
      "protocol": "MESOS_HTTP",
      "path": "/healthz",
      "gracePeriodSeconds": 30,
      "intervalSeconds": 20,
      "timeoutSeconds": 10,
      "maxConsecutiveFailures": 3
    }
  ],
  "labels": {
    "MARATHON_SINGLE_INSTANCE_APP": "true",
    "HAPROXY_GROUP": "external",
    "HAPROXY_0_ENABLED": "true",
    "HAPROXY_0_REDIRECT_TO_HTTPS": "true",
    "HAPROXY_0_VHOST": "jupyter.redpine.com"
  }

}

@vishnu2kmohan
Copy link
Collaborator

@redpine42 We completely ignore this labels section since it's not part of the config.json schema:

  "labels": {
    "MARATHON_SINGLE_INSTANCE_APP": "true",
    "HAPROXY_GROUP": "external",
    "HAPROXY_0_ENABLED": "true",
    "HAPROXY_0_REDIRECT_TO_HTTPS": "true",
    "HAPROXY_0_VHOST": "jupyter.redpine.com"
  }

You need to set networking.ingress.hostname:

  "networking": {
    "cni_network_enabled": true,
    "cni_network_name": "dcos",
    "cni_network_plugin_labels": "",
    "ingress": {
      "enabled": true,
      "hostname": "jupyter.redpine.com"
    }
  },

And then, assuming your Public Agent(s) (or Elastic Load Balancer that fronts your Public Agent(s)) have a DNS A (or CNAME) record for jupyter.redpine.com pointing to its IP(s), you should be able to access your Notebook at https://jupyter.redpine.com/jupyter

We also ignore this healthChecks stanza:

  "healthChecks": [
    {
      "portIndex": 0,
      "protocol": "MESOS_HTTP",
      "path": "/healthz",
      "gracePeriodSeconds": 30,
      "intervalSeconds": 20,
      "timeoutSeconds": 10,
      "maxConsecutiveFailures": 3
    }
  ],

@redpine42
Copy link
Author

I added the health checks and labels based upon the cni example you provided. Other than that the rest is what comes from the gui generated json. I'll try adding the networking section.

@redpine42
Copy link
Author

In the networking section, I did have the hostname set on a previous test, with no luck.

Removed healthChecks and labels. Added the hostname; marathon-lb still shows the backend down even though DC/OS shows jupyter up. It returns 503 when I hit https://jupyter.redpine.com. I did have all this working before. The only difference was that I was using Traefik, which I wasn't able to get to work in strict mode.

@vishnu2kmohan
Copy link
Collaborator

The schema for the options.json (Catalog Package Deployment UI) is not the same as the rendered Marathon App JSON.

We'll need to see the full (obfuscated) rendered Marathon JSON files for both your Marathon-LB and Mesosphere Jupyter Service to see what may be happening @redpine42

@redpine42
Copy link
Author

It's a pretty standard marathon-lb. I've hooked up other projects, such as jenkins. Here is the json for it pulled from the ui.

{
  "marathon-lb": {
    "auto-assign-service-ports": false,
    "bind-http-https": true,
    "cpus": 2,
    "haproxy_global_default_options": "redispatch,http-server-close,dontlognull",
    "haproxy-group": "external",
    "haproxy-map": true,
    "instances": 2,
    "mem": 1024,
    "minimumHealthCapacity": 0.5,
    "maximumOverCapacity": 0.2,
    "name": "marathon-lb",
    "parameters": [],
    "role": "slave_public",
    "strict-mode": false,
    "sysctl-params": "net.ipv4.tcp_tw_reuse=1 net.ipv4.tcp_fin_timeout=30 net.ipv4.tcp_max_syn_backlog=10240 net.ipv4.tcp_max_tw_buckets=400000 net.ipv4.tcp_max_orphans=60000 net.core.somaxconn=10000",
    "container-syslogd": false,
    "max-reload-retries": 10,
    "reload-interval": 10,
    "template-url": "",
    "marathon-uri": "https://marathon.mesos:8443",
    "secret_name": "marathon-lb/sa"
  }
}

Here is Jupyter pulled from the DC/OS UI

{
  "service": {
    "name": "/jupyter",
    "cpus": 18,
    "mem": 61440,
    "gpu": {
      "enabled": true,
      "gpus": 2
    },
    "jupyter_password": "jupyter",
    "jupyter_conf_urls": "",
    "service_account": "jupyter",
    "service_account_secret": "jupyter/sa",
    "placement_constraints": "[]",
    "user": "nobody",
    "cmd": "/usr/local/bin/start.sh ${CONDA_DIR}/bin/jupyter lab --notebook-dir=\"${MESOS_SANDBOX}\"",
    "log_level": "INFO"
  },
  "networking": {
    "cni_network_enabled": true,
    "cni_network_name": "dcos",
    "cni_network_plugin_labels": "",
    "ingress": {
      "enabled": true,
      "hostname": "jupyter.redpine.com"
    }
  },
  "storage": {
    "local_persistence": {
      "enabled": true,
      "volume_size": 100000,
      "volume_path": "jupyter_data"
    },
    "s3": {
      "aws_region": "us-east-1",
      "endpoint": "jupyterlab-data",
      "use_https": true,
      "verify_ssl": true
    }
  },
  "oidc": {
    "enabled": false,
    "discovery_uri": "https://keycloak.contoso.com/auth/realms/notebook/.well-known/openid-configuration",
    "client_id": "notebook",
    "client_secret": "",
    "scope": "openid profile email",
    "authorization_params": "",
    "authorized_email": "",
    "authorized_upn": "",
    "redirect_after_logout_uri": "",
    "post_logout_redirect_uri": "",
    "tls_verify": false,
    "redirect_uri": "/oidc-redirect-callback",
    "logout_path": "/logmeout",
    "token_endpoint_auth_method": "client_secret_basic",
    "use_spartan_resolver": true
  },
  "spark": {
    "spark_master_url": "mesos://zk://zk-1.zk:2181,zk-2.zk:2181,zk-3.zk:2181,zk-4.zk:2181,zk-5.zk:2181/mesos",
    "spark_conf_cores_max": 5,
    "spark_driver_cores": 2,
    "spark_conf_executor_cores": 1,
    "spark_conf_mesos_gpus_max": 0,
    "spark_conf_mesos_executor_gpus": 0,
    "spark_driver_memory": "6g",
    "spark_conf_executor_memory": "6g",
    "spark_conf_eventlog_enabled": true,
    "spark_conf_eventlog_dir": "/mnt/mesos/sandbox/",
    "start_spark_history_server": true,
    "spark_history_fs_logdirectory": "/mnt/mesos/sandbox/",
    "spark_conf_jars_packages": "",
    "spark_conf_mesos_principal": "",
    "spark_conf_mesos_role": "",
    "spark_conf_mesos_driver_labels": "",
    "spark_conf_mesos_task_labels": "",
    "spark_conf_executor_krb5_config": "/mnt/mesos/sandbox/krb5.conf",
    "spark_conf_spark_scheduler_min_registered_resources_ratio": 1,
    "spark_conf_mesos_containerizer": "mesos",
    "spark_conf_hadoop_fs_s3a_aws_credentials_provider": "com.amazonaws.auth.InstanceProfileCredentialsProvider",
    "spark_driver_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "spark_conf_executor_java_options": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "spark_conf_mesos_executor_home": "/opt/spark",
    "spark_conf_executor_java_home": "/opt/jdk",
    "spark_conf_executor_hadoop_hdfs_home": "/opt/hadoop",
    "spark_conf_executor_hadoop_opts": "-Djava.library.path=/opt/hadoop/lib/native -Djava.security.krb5.conf=/mnt/mesos/sandbox/krb5.conf",
    "spark_conf_mesos_executor_docker_forcepullimage": false,
    "spark_user": "nobody"
  },
  "advanced": {
    "force_pull_jupyter_image": false,
    "force_pull_worker_image": false,
    "home": "/mnt/mesos/sandbox",
    "sandbox": "/mnt/mesos/sandbox",
    "hadoop_conf_dir": "/mnt/mesos/sandbox",
    "jupyter_config_dir": "/mnt/mesos/sandbox/.jupyter",
    "jupyter_runtime_dir": "/mnt/mesos/sandbox/.local/share/jupyter/runtime",
    "conda_envs_path": "/mnt/mesos/sandbox/conda/envs:/opt/conda/envs",
    "conda_pkgs_dir": "/mnt/mesos/sandbox/conda/pkgs:/opt/conda/pkgs",
    "dcos_dir": "/mnt/mesos/sandbox/.dcos",
    "java_opts": "-server -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/mnt/mesos/sandbox",
    "nginx_log_level": "warn",
    "spark_monitor_enabled": true,
    "start_dask_distributed": false,
    "start_ray_head_node": false,
    "start_tensorboard": true,
    "tensorboard_logdir": "/mnt/mesos/sandbox",
    "term": "xterm-256color"
  }
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants