Hello everyone,
When I run the setup for lesson 4.2, "Providing Options for External Sources", I get the error below:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 8491.0 failed 4 times, most recent failure: Lost task 0.3 in stage 8491.0 (TID 81305, 10.33.185.79, executor 23): java.sql.SQLException: [SQLITE_ERROR] SQL error or missing database (no such table: users)
Py4JJavaError Traceback (most recent call last)
<command-176215> in <module>
2 DA.init()
3 install_eltwss_datasets(reinstall=False)
----> 4 load_eltwss_external_tables()
5 DA.conclude_setup()
<command-180667> in load_eltwss_external_tables()
30 .option("url", f"jdbc:sqlite:/{DA.username}_ecommerce.db")
31 .option("dbtable", "users") # The table name in sqllight
---> 32 .mode("overwrite")
33 .save()
34 )
/databricks/spark/python/pyspark/sql/readwriter.py in save(self, path, format, mode, partitionBy, **options)
823 self.format(format)
824 if path is None:
--> 825 self._jwrite.save()
826 else:
827 self._jwrite.save(path)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
125 def deco(*a, **kw):
126 try:
--> 127 return f(*a, **kw)
128 except py4j.protocol.Py4JJavaError as e:
129 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
I think the issue may lie in the functions below, which I believe are called by the setup command.
def copy_source_dataset(src_path, dst_path, format, name):
    import time
    start = int(time.time())
    print(f"Creating the {name} dataset", end="...")

    dbutils.fs.cp(src_path, dst_path, True)

    total = spark.read.format(format).load(dst_path).count()
    print(f"({int(time.time())-start} seconds / {total:,} records)")
def load_eltwss_external_tables():
    copy_source_dataset(f"{DA.paths.datasets}/raw/sales-csv",
                        f"{DA.paths.working_dir}/sales-csv", "csv", "sales-csv")

    import time
    start = int(time.time())
    print(f"Creating the users table", end="...")

    # REFACTORING - Making lesson-specific copy
    dbutils.fs.cp(f"{DA.paths.datasets}/raw/users-historical",
                  f"{DA.paths.working_dir}/users-historical", True)

    # https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html
    (spark.read
        .format("parquet")
        .load(f"{DA.paths.working_dir}/users-historical")
        .repartition(1)
        .write
        .format("org.apache.spark.sql.jdbc")
        .option("url", f"jdbc:sqlite:/{DA.username}_ecommerce.db")
        .option("dbtable", "users")  # the table name in SQLite
        .mode("overwrite")
        .save()
    )

    total = spark.read.parquet(f"{DA.paths.working_dir}/users-historical").count()
    print(f"({int(time.time())-start} seconds / {total:,} records)")
I can see that this issue was raised and seemingly resolved here, but the solution was not shared, and newer posters with the same problem have gone unanswered.
I have also raised a support ticket, but I'm not getting a response.
- I have tried removing everything related to the course from Databricks and then adding the course repo again, making sure I'm using the latest release.
- I have tried to fix the code myself, but with no luck; I thought it might be a typo or something similar.
- My company is still on Databricks Runtime 7.3; could this be related? (A snippet for confirming the cluster's versions is below.)
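In case it's useful for anyone trying to reproduce this, the cluster's versions can be confirmed from a notebook. spark.version is standard PySpark; DATABRICKS_RUNTIME_VERSION is a Databricks-specific environment variable that, as far as I know, is set on the driver:

import os

# Spark version of the attached cluster (standard PySpark)
print(spark.version)

# Databricks-specific; falls back gracefully if the variable is absent
print(os.environ.get("DATABRICKS_RUNTIME_VERSION", "not set"))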
Any guidance would be much appreciated.