09-24-2021 09:24 AM
I'm trying to execute this writeStream
data_frame.writeStream.format("delta") \
.option("checkpointLocation", checkpoint_path) \
.trigger(processingTime="1 second") \
.option("mergeSchema", "true") \
.outputMode("append") \
.table(write_stream_path)
but I get this error
at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:428)
at org.apache.spark.util.ThreadUtils$.parallelMap(ThreadUtils.scala:399)
at com.databricks.sql.streaming.state.RocksDBFileManager.loadImmutableFilesFromDbfs(RocksDBFileManager.scala:433)
at com.databricks.sql.streaming.state.RocksDBFileManager.loadCheckpointFromDbfs(RocksDBFileManager.scala:202)
at com.databricks.sql.rocksdb.CloudRocksDB.$anonfun$open$5(CloudRocksDB.scala:437)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:627)
at com.databricks.sql.rocksdb.CloudRocksDB.timeTakenMs(CloudRocksDB.scala:523)
at com.databricks.sql.rocksdb.CloudRocksDB.$anonfun$open$2(CloudRocksDB.scala:435)
at com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:369)
at com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:457)
at com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:477)
at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:240)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:235)
at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:232)
at com.databricks.spark.util.PublicDBLogging.withAttributionContext(DatabricksSparkUsageLogger.scala:20)
at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:279)
at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:271)
at com.databricks.spark.util.PublicDBLogging.withAttributionTags(DatabricksSparkUsageLogger.scala:20)
at com.databricks.logging.UsageLogging.recordOperationWithResultTags(UsageLogging.scala:452)
at com.databricks.logging.UsageLogging.recordOperationWithResultTags$(UsageLogging.scala:378)
at com.databricks.spark.util.PublicDBLogging.recordOperationWithResultTags(DatabricksSparkUsageLogger.scala:20)
at com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:369)
at com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:341)
at com.databricks.spark.util.PublicDBLogging.recordOperation(DatabricksSparkUsageLogger.scala:20)
at com.databricks.spark.util.PublicDBLogging.recordOperation0(DatabricksSparkUsageLogger.scala:57)
at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:125)
at com.databricks.spark.util.UsageLogger.recordOperation(UsageLogger.scala:70)
at com.databricks.spark.util.UsageLogger.recordOperation$(UsageLogger.scala:57)
at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:86)
at com.databricks.spark.util.UsageLogging.recordOperation(UsageLogger.scala:402)
at com.databricks.spark.util.UsageLogging.recordOperation$(UsageLogger.scala:381)
at com.databricks.sql.rocksdb.CloudRocksDB.recordOperation(CloudRocksDB.scala:52)
at com.databricks.sql.rocksdb.CloudRocksDB.recordRocksDBOperation(CloudRocksDB.scala:542)
at com.databricks.sql.rocksdb.CloudRocksDB.$anonfun$open$1(CloudRocksDB.scala:427)
at com.databricks.backend.daemon.driver.ProgressReporter$.withStatusCode(ProgressReporter.scala:377)
at com.databricks.backend.daemon.driver.ProgressReporter$.withStatusCode(ProgressReporter.scala:363)
at com.databricks.spark.util.SparkDatabricksProgressReporter$.withStatusCode(ProgressReporter.scala:34)
at com.databricks.sql.rocksdb.CloudRocksDB.open(CloudRocksDB.scala:427)
at com.databricks.sql.rocksdb.CloudRocksDB.<init>(CloudRocksDB.scala:80)
at com.databricks.sql.rocksdb.CloudRocksDB$.open(CloudRocksDB.scala:595)
at com.databricks.sql.fileNotification.autoIngest.CloudFilesSource.<init>(CloudFilesSource.scala:82)
at com.databricks.sql.fileNotification.autoIngest.CloudFilesNotificationSource.<init>(CloudFilesNotificationSource.scala:44)
at com.databricks.sql.fileNotification.autoIngest.CloudFilesSourceProvider.createSource(CloudFilesSourceProvider.scala:172)
at org.apache.spark.sql.execution.datasources.DataSource.createSource(DataSource.scala:326)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$1.$anonfun$applyOrElse$1(MicroBatchExecution.scala:100)
at scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:86)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$1.applyOrElse(MicroBatchExecution.scala:97)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$1.applyOrElse(MicroBatchExecution.scala:95)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:484)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:86)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:484)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:262)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:258)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:460)
at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:428)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.planQuery(MicroBatchExecution.scala:95)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.logicalPlan$lzycompute(MicroBatchExecution.scala:165)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.logicalPlan(MicroBatchExecution.scala:165)
at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:349)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:852)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:341)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:268)
Caused by: java.io.FileNotFoundException: No such file or directory: s3://*****/******/*******/checkpoint/sources/0/rocksdb/SSTs/******.sst
at shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3254)
at shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3137)
at shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:3076)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:337)
at org.apache.hadoop.fs.FileUtil.copy(FileUtil.java:289)
at org.apache.hadoop.fs.FileSystem.copyToLocalFile(FileSystem.java:2034)
at org.apache.hadoop.fs.FileSystem.copyToLocalFile(FileSystem.java:2003)
at org.apache.hadoop.fs.FileSystem.copyToLocalFile(FileSystem.java:1979)
at com.databricks.sql.streaming.state.RocksDBFileManager.$anonfun$loadImmutableFilesFromDbfs$6(RocksDBFileManager.scala:442)
at com.databricks.sql.streaming.state.RocksDBFileManager.$anonfun$loadImmutableFilesFromDbfs$6$adapted(RocksDBFileManager.scala:433)
at org.apache.spark.util.ThreadUtils$.$anonfun$parallelMap$2(ThreadUtils.scala:397)
at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
at scala.util.Success.$anonfun$map$1(Try.scala:255)
at scala.util.Success.map(Try.scala:213)
at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.$anonfun$run$1(SparkThreadLocalForwardingThreadPoolExecutor.scala:104)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.util.threads.SparkThreadLocalCapturingHelper.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:68)
at org.apache.spark.util.threads.SparkThreadLocalCapturingHelper.runWithCaptured$(SparkThreadLocalForwardingThreadPoolExecutor.scala:54)
at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:101)
at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.run(SparkThreadLocalForwardingThreadPoolExecutor.scala:104)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: No such file or directory: s3://*****/******/*******/checkpoint/sources/0/rocksdb/SSTs/******.sst
10-12-2021 06:46 AM
You can remove that folder so it will be recreated automatically.
Additionally, every new job run should have a new (or just empty) checkpoint location.
You can add this to your code before starting the stream:
dbutils.fs.rm(checkpoint_path, True)
Additionally, you can verify that location, for example by using the "Data" icon in the left menu:
09-29-2021 10:31 AM
Hi @Borislav Blagoev,
Can you verify whether this path exists? For example, run
%fs ls <path_to_check_point> (e.g. s3://*****/******/*******/checkpoint/sources/0/rocksdb/SSTs/)
Are you able to list anything in this path? Is this the first time you have used this checkpoint, or was it working fine in the past?
10-05-2021 03:09 AM
It worked fine in the past.
But right now it can't find this ".sst" file.
10-12-2021 12:58 AM
@Borislav Blagoev, were there any changes in the code or in any Spark configurations?
10-12-2021 06:46 AM
You can remove that folder so it will be recreated automatically.
Additionally, every new job run should have a new (or just empty) checkpoint location.
You can add this to your code before starting the stream:
dbutils.fs.rm(checkpoint_path, True)
Additionally, you can verify that location, for example by using the "Data" icon in the left menu:
Join a Regional User Group to connect with local Databricks users. Events will be happening in your city, and you won't want to miss the chance to attend and share knowledge.
If there isn't a group near you, start one and help create a community that brings people together.
Request a New Group