Databricks clusters on GCP stop working: "Environment directory not found" error in waitForEnvironmentFileSystem

720677
New Contributor III

Starting yesterday, 17/5/2022, I began getting errors while running notebooks and jobs on Databricks clusters on GCP.

The error is:

SparkException: Environment directory not found at /local_disk0/.ephemeral_nfs/cluster_libraries/python

The job/notebook can still perform some operations, but others fail, for example:

display(dbutils.fs.ls("/%s" % mount_name))
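
For what it's worth, the failure looks easy to reproduce: judging by the stack trace below, any action that makes the executors start a Python worker seems to die the same way. A minimal repro sketch (the lambda is arbitrary; spark is the session Databricks notebooks already provide):

# Minimal repro sketch: forcing the executors to spawn a Python worker
# (here via an arbitrary RDD lambda) goes through PythonWorkerFactory and
# fails in waitForEnvironmentFileSystem with the same SparkException.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # already defined in Databricks notebooks
spark.sparkContext.parallelize(range(4)).map(lambda x: x + 1).collect()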

I tried starting a new cluster and stripping the init scripts down to a minimum, but the error persists.
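
A quick driver-side probe can at least confirm whether the directory from the exception exists at all (a sketch using only the path from the error message; the executors' local disks may of course differ from the driver's):

# Driver-side probe (a sketch): does the environment directory from the
# exception exist on the driver, and what is under the ephemeral NFS mount?
import os

env_dir = "/local_disk0/.ephemeral_nfs/cluster_libraries/python"
nfs_root = "/local_disk0/.ephemeral_nfs"

print(os.path.isdir(env_dir))
print(os.listdir(nfs_root) if os.path.isdir(nfs_root) else "no ephemeral NFS mount on the driver")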

The full error:

22/05/18 05:30:09 WARN TaskSetManager: Lost task 3.0 in stage 0.0 (TID 3) (10.71.1.3 executor 0): org.apache.spark.SparkException: Environment directory not found at /local_disk0/.ephemeral_nfs/cluster_libraries/python
    at org.apache.spark.util.DatabricksUtils$.waitForEnvironmentFileSystem(DatabricksUtils.scala:685)
    at org.apache.spark.api.python.PythonWorkerFactory.$anonfun$startDaemon$1(PythonWorkerFactory.scala:273)
    at org.apache.spark.api.python.PythonWorkerFactory.$anonfun$startDaemon$1$adapted(PythonWorkerFactory.scala:273)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.api.python.PythonWorkerFactory.startDaemon(PythonWorkerFactory.scala:273)
    at org.apache.spark.api.python.PythonWorkerFactory.createThroughDaemon(PythonWorkerFactory.scala:185)
    at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:134)
    at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:209)
    at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:251)
    at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:77)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:380)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:344)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:380)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:344)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:380)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:344)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:380)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:344)
    at org.apache.spark.sql.execution.SQLExecutionRDD.$anonfun$compute$1(SQLExecutionRDD.scala:57)
    at org.apache.spark.sql.internal.SQLConf$.withExistingConf(SQLConf.scala:170)
    at org.apache.spark.sql.execution.SQLExecutionRDD.compute(SQLExecutionRDD.scala:57)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:380)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:344)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:380)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:344)
    at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$3(ResultTask.scala:75)
    at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
    at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$1(ResultTask.scala:75)
    at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:55)
    at org.apache.spark.scheduler.Task.doRunTask(Task.scala:156)
    at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:125)
    at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
    at org.apache.spark.scheduler.Task.run(Task.scala:95)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:826)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1670)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:829)
    at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
    at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:684)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)