I have a Delta Live Table that I'm trying to run a `groupBy` on, but I'm getting an error: "RuntimeError: Query function must return either a Spark or Koalas DataFrame".
Here is my code:
# Streaming variant of the table definition; this is the snippet that raises
# "Query function must return either a Spark or Koalas DataFrame".
# NOTE(review): the function body is shown without indentation -- presumably
# lost when the code was pasted.
@dlt.table
def groups_hierarchy():
# Read the "groups_hierarchy_vw" dataset as a stream.
df = dlt.read_stream("groups_hierarchy_vw")
return(df
# Keep id/name, store the "/"-split path as groups_in_path, and explode the
# same split into one row per element with columns (pos, value).
.select("id","name",split("path","/").alias("groups_in_path"),posexplode(split("path","/")).alias("pos","value"))
# NOTE(review): no column named "val" exists at this point (the exploded
# columns are aliased "pos" and "value"); presumably "value" was meant -- confirm.
.drop("val")
# Build group_name as "group" + pos and look up val as groups_in_path[pos].
.select("id","name",concat(lit("group"),"pos").alias("group_name"),expr("groups_in_path[pos]").alias("val"))
# The chain ends on groupBy with no aggregation after it. In PySpark, groupBy
# yields a GroupedData object, not a DataFrame, until agg()/count()/... is
# applied -- presumably the cause of the RuntimeError.
# NOTE(review): the parenthesis opened by `return(` is never closed here.
.groupBy([df.id, df.name])
Edit:
Something as simple as the following works just fine (you will notice I am now reading a regular table and not a stream, just for testing purposes):
@dlt.table
def groups_hierarchy():
    """Return the per-``id`` row count of the ``streaming_silver`` table."""
    # Batch read via dlt.read (not dlt.read_stream), used here for testing.
    silver = dlt.read("streaming_silver")
    grouped_by_id = silver.groupBy("id")
    # Applying count() to the grouped result is what the function returns.
    return grouped_by_id.count()
It also works fine when I apply my select statements and transformations, but the very last `.groupBy()` seems to turn the result into something that is not a Spark/Koalas DataFrame:
# Batch variant of the full transformation chain; reproduces the failure
# because the chain ends on groupBy("id") with no aggregation after it.
# NOTE(review): the function body is shown without indentation -- presumably
# lost when the code was pasted.
@dlt.table
def groups_hierarchy():
# Stages, in order:
#   1. select id/name, the "/"-split path as groups_in_path, and the exploded
#      split as (pos, value);
#   2. drop("val") -- NOTE(review): no "val" column exists at that point;
#      presumably "value" was meant -- confirm;
#   3. select id/name, group_name = "group" + pos, val = groups_in_path[pos];
#   4. groupBy("id") with nothing applied afterwards. In PySpark this yields
#      GroupedData, not a DataFrame, until agg()/count()/... is called --
#      presumably why the returned object is rejected.
return dlt.read("streaming_silver").select("id","name",split("path","/").alias("groups_in_path"),posexplode(split("path","/")).alias("pos","value")).drop("val").select("id","name",concat(lit("group"),"pos").alias("group_name"),expr("groups_in_path[pos]").alias("val")).groupBy("id")