Hi,
I create a table using a DLT pipeline (triggered once). In the ETL process, I add a new column with null values to the table:
from pyspark.sql import functions as F

output = output.withColumn('Indicator_Latest_Value_Date', F.lit(None))
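For context, here is a quick schema check on a throwaway DataFrame (a minimal sketch run outside the pipeline; the probe DataFrame is purely illustrative). It suggests that a null literal with no cast comes out as Spark's NullType:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Probe: an uncast null literal gets Spark's NullType
# (printed as "void" on recent runtimes).
probe = spark.range(1).withColumn('Indicator_Latest_Value_Date', F.lit(None))
probe.printSchema()
# root
#  |-- id: long (nullable = false)
#  |-- Indicator_Latest_Value_Date: void (nullable = true)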
The pipeline runs without any errors. But when I query the resulting table with:
stage_df = spark.sql('select * from marketing.stage_macroeconomics_manual_indicators')
display(stage_df)
I get the following error (the full message was too long to copy):
java.lang.IllegalStateException: Couldn't find Indicator_Latest_Value_Date#9439 in [Country#9420,Indicator_Name#9421,Indicator_Source#9422,Indicator_Source_URL#9423,Indicator_Unit#9424,Indicator_Category_Group#9425,Indicator_Adjustment#9426,Indicator_Frequency#9427,Calendar_Year#9428,Month_Number#9429,Indicator_Date#9430,Indicator_Value#9431,Excel_Input_Column_Names#9432,Excel_Input_File#9433,Excel_Input_Sheet#9434,Excel_Ingest_Datetime#9435,ETL_InputFile#9436,ETL_LoadDate#9437,ETL_SourceFile_Date#9438,Source_System#9440,Indicator_Title#9441]
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-1891366018233962> in <cell line: 2>()
1 stage_df = spark.sql('select * from marketing.stage_macroeconomics_manual_indicators')
----> 2 display(stage_df)
/databricks/python_shell/dbruntime/display.py in display(self, input, *args, **kwargs)
81 raise Exception('Triggers can only be set for streaming queries.')
82
---> 83 self.add_custom_display_data("table", input._jdf)
84
85 elif isinstance(input, list):
/databricks/python_shell/dbruntime/display.py in add_custom_display_data(self, data_type, data)
34 def add_custom_display_data(self, data_type, data):
35 custom_display_key = str(uuid.uuid4())
---> 36 return_code = self.entry_point.addCustomDisplayData(custom_display_key, data_type, data)
37 ip_display({
38 "application/vnd.databricks.v1+display": custom_display_key,
/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py in __call__(self, *args)
1319
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1323
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
194 def deco(*a: Any, **kw: Any) -> Any:
195 try:
--> 196 return f(*a, **kw)
197 except Py4JJavaError as e:
198 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling t.addCustomDisplayData.
: java.lang.IllegalStateException: Couldn't find Indicator_Latest_Value_Date#9439 in [Country#9420,Indicator_Name#9421,Indicator_Source#9422,Indicator_Source_URL#9423,Indicator_Unit#9424,Indicator_Category_Group#9425,Indicator_Adjustment#9426,Indicator_Frequency#9427,Calendar_Year#9428,Month_Number#9429,Indicator_Date#9430,Indicator_Value#9431,Excel_Input_Column_Names#9432,Excel_Input_File#9433,Excel_Input_Sheet#9434,Excel_Ingest_Datetime#9435,ETL_InputFile#9436,ETL_LoadDate#9437,ETL_SourceFile_Date#9438,Source_System#9440,Indicator_Title#9441]
at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:80)
at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:73)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:99)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488)
at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:456)
at org.apache.spark.sql.catalyst.expressions.BindReferences$.bindReference(BoundAttribute.scala:73)
at org.apache.spark.sql.catalyst.expressions.BindReferences$.$anonfun$bindReferences$1(BoundAttribute.scala:94)
at scala.collection.immutable.List.map(List.scala:297)
at org.apache.spark.sql.catalyst.expressions.BindReferences$.bindReferences(BoundAttribute.scala:94)
at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:70)
at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:197)
at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:152)
at org.apache.spark.sql.execution.ColumnarToRowExec.consume(Columnar.scala:71)
at org.apache.spark.sql.execution.ColumnarToRowExec.doProduce(Columnar.scala:202)
at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:98)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:269)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:265)
at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:93)
at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:93)
at org.apache.spark.sql.execution.ColumnarToRowExec.produce(Columnar.scala:71)
at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:55)
at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:98)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:269)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:265)
at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:93)
at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:93)
at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:45)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:661)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:724)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:225)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:269)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:265)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:221)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:97)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:108)
... (stack trace truncated)
Note that when I set the column value to a fixed string (or a space) instead, like:
output = output.withColumn('Indicator_Latest_Value_Date', F.lit(''))
the query runs without any error.
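The only difference I can see between the two variants is the resulting column type, so I wonder whether an explicit cast on the null literal matters. This is just a sketch I have not tried in the pipeline, and DateType is only my guess at the intended type for this column:

from pyspark.sql import functions as F, types as T

# Failing variant:  F.lit(None) with no cast -> NullType (void) column
# Working variant:  F.lit('')                -> StringType column
# If the untyped null is the problem, a null literal with an explicit
# type might behave like the string case (unverified assumption):
output = output.withColumn('Indicator_Latest_Value_Date',
                           F.lit(None).cast(T.DateType()))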
What is the possible reason for this error, and how can I solve it? Thanks for your help.