<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Failed to start cluster: Large docker image in Administration &amp; Architecture</title>
    <link>https://community.databricks.com/t5/administration-architecture/failed-to-start-cluster-large-docker-image/m-p/52425#M602</link>
    <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt;&amp;nbsp;it's not possible to change the timeout value for the Docker image pull on a Databricks cluster.&amp;nbsp; That isn't exposed to the user.&lt;/P&gt;</description>
    <pubDate>Thu, 16 Nov 2023 16:38:51 GMT</pubDate>
    <dc:creator>Michelangelo</dc:creator>
    <dc:date>2023-11-16T16:38:51Z</dc:date>
    <item>
      <title>Failed to start cluster: Large docker image</title>
      <link>https://community.databricks.com/t5/administration-architecture/failed-to-start-cluster-large-docker-image/m-p/41218#M321</link>
      <description>&lt;P&gt;I have a large Docker image in our AWS ECR repo. The image is 27.4 GB locally and 11539.79 MB compressed in ECR.&lt;/P&gt;&lt;P&gt;The error from the Event Log is:&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Failed to add 2 containers to the compute. Will attempt retry: true. Reason: Docker image pull failure&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;JSON:&lt;/P&gt;&lt;PRE&gt;{
  "reason": {
    "code": "DOCKER_IMAGE_PULL_FAILURE",
    "type": "SERVICE_FAULT",
    "parameters": {
      "instance_id": "i-0172cf9b70a25df47",
      "databricks_error_message": "Downloading docker image has timed out"
    }
  },
  "add_node_failure_details": {
    "failure_count": 2,
    "resource_type": "container",
    "will_retry": true
  }
}&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 23 Aug 2023 19:52:14 GMT</pubDate>
      <guid>https://community.databricks.com/t5/administration-architecture/failed-to-start-cluster-large-docker-image/m-p/41218#M321</guid>
      <dc:creator>NateJ</dc:creator>
      <dc:date>2023-08-23T19:52:14Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to start cluster: Large docker image</title>
      <link>https://community.databricks.com/t5/administration-architecture/failed-to-start-cluster-large-docker-image/m-p/50195#M567</link>
      <description>&lt;P&gt;I'm having the same issue--the official Databricks runtime GPU images are already quite large, so using them as a base causes you to run into this timeout issue.&amp;nbsp; Did anyone ever find a fix?&lt;/P&gt;</description>
      <pubDate>Tue, 31 Oct 2023 14:54:26 GMT</pubDate>
      <guid>https://community.databricks.com/t5/administration-architecture/failed-to-start-cluster-large-docker-image/m-p/50195#M567</guid>
      <dc:creator>Michelangelo</dc:creator>
      <dc:date>2023-10-31T14:54:26Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to start cluster: Large docker image</title>
      <link>https://community.databricks.com/t5/administration-architecture/failed-to-start-cluster-large-docker-image/m-p/51213#M583</link>
      <description>&lt;P&gt;I have a similar problem. A 10 GB image pulls fine, but a 31 GB image doesn't. Both workers and drivers have 64 GB of memory. I get the timeout error with "&lt;SPAN class=""&gt;Cannot launch the cluster because pulling the docker image failed. Please double check connectivity from workers to the container registry, as well as the credentials used to pull the image"&lt;BR /&gt;&lt;BR /&gt;Were you able to figure out a solution?&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 13 Nov 2023 21:05:40 GMT</pubDate>
      <guid>https://community.databricks.com/t5/administration-architecture/failed-to-start-cluster-large-docker-image/m-p/51213#M583</guid>
      <dc:creator>amoghjain</dc:creator>
      <dc:date>2023-11-13T21:05:40Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to start cluster: Large docker image</title>
      <link>https://community.databricks.com/t5/administration-architecture/failed-to-start-cluster-large-docker-image/m-p/52425#M602</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt;&amp;nbsp;it's not possible to change the timeout value for the Docker image pull on a Databricks cluster.&amp;nbsp; That isn't exposed to the user.&lt;/P&gt;</description>
      <pubDate>Thu, 16 Nov 2023 16:38:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/administration-architecture/failed-to-start-cluster-large-docker-image/m-p/52425#M602</guid>
      <dc:creator>Michelangelo</dc:creator>
      <dc:date>2023-11-16T16:38:51Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to start cluster: Large docker image</title>
      <link>https://community.databricks.com/t5/administration-architecture/failed-to-start-cluster-large-docker-image/m-p/52426#M603</link>
      <description>&lt;P&gt;The only solution as of now is to reduce the size of your image--try a smaller base image, don't build multiple intermediate images that build off of each other, reduce the number of layers, aggressively purge apt and pip caches, etc.&lt;/P&gt;</description>
      <pubDate>Thu, 16 Nov 2023 16:40:15 GMT</pubDate>
      <guid>https://community.databricks.com/t5/administration-architecture/failed-to-start-cluster-large-docker-image/m-p/52426#M603</guid>
      <dc:creator>Michelangelo</dc:creator>
      <dc:date>2023-11-16T16:40:15Z</dc:date>
    </item>
  </channel>
</rss>

