Hello,
I have an Azure sql warehouse serverless instance that I can connect to using databricks-sql-connector. But, when I try to use pyspark and jdbc driver url, I can't read or write.
See my code below
def get_jdbc_url():
# Define your Databricks parameters
server_hostname, http_path, access_token = get_connection_configs()
default_catalog = "researchers"
# Build the Spark JDBC URL
jdbc_url = f"jdbc:databricks://{server_hostname}:443;httpPath={http_path};AuthMech=3;UID=token;PWD={access_token};ConnCatalog={default_catalog}"
return jdbc_url
def get_spark() -> SparkSession:
# Initialize SparkSession
conf = SparkConf()
# unfortunately, there is no automatic way for pyspark to download the jar itself
# TODO set to your own jar path
conf.set("spark.jars",
"/home/username/repos/uwcip-research/code-examples/DatabricksJDBC42-2.6.38.1068/DatabricksJDBC42.jar")
conf.set("spark.driver.extraClassPath",
"/home/username/repos/uwcip-research/code-examples/DatabricksJDBC42-2.6.38.1068/DatabricksJDBC42.jar")
conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark = SparkSession.builder \
.appName("connecting to databricks") \
.config(conf=conf) \
.getOrCreate()
return spark
def read_table():
spark = get_spark()
jdbc_url = get_jdbc_url()
dbtable = "researchers.dev.test_transcripts"
df = spark.read \
.format("jdbc") \
.option("url", jdbc_url) \
.option("dbtable", dbtable) \
.load()
top5 = df.head(5)
print(top5)
return
Below is the error and read results I got when call read_table(). See that it simply returned column names 5 times. When I call df.printSchema(), it works fine.
ERROR StatusLogger Unrecognized conversion specifier [msg] starting at position 54 in conversion pattern.
ERROR StatusLogger Unrecognized format specifier [n]
ERROR StatusLogger Unrecognized conversion specifier [n] starting at position 56 in conversion pattern.
[Row(video_name='video_name', video_transcript='video_transcript'), Row(video_name='video_name', video_transcript='video_transcript'), Row(video_name='video_name', video_transcript='video_transcript'), Row(video_name='video_name', video_transcript='video_transcript'), Row(video_name='video_name', video_transcript='video_transcript')]
Additionally, probably unrelated logger error
ERROR StatusLogger Unable to create Lookup for bundle
java.lang.ClassCastException: class org.apache.logging.log4j.core.lookup.ResourceBundleLookup
at java.base/java.lang.Class.asSubclass(Class.java:3640)
at com.databricks.client.jdbc42.internal.apache.logging.log4j.core.lookup.Interpolator.<init>(Interpolator.java:84)
at com.databricks.client.jdbc42.internal.apache.logging.log4j.core.lookup.Interpolator.<init>(Interpolator.java:105)
at com.databricks.client.jdbc42.internal.apache.logging.log4j.core.config.AbstractConfiguration.<init>(AbstractConfiguration.java:135)
at com.databricks.client.jdbc42.internal.apache.logging.log4j.core.config.NullConfiguration.<init>(NullConfiguration.java:32)
at com.databricks.client.jdbc42.internal.apache.logging.log4j.core.LoggerContext.<clinit>(LoggerContext.java:74)
...
ERROR StatusLogger Unable to create Lookup for ctx
java.lang.ClassCastException: class org.apache.logging.log4j.core.lookup.ContextMapLookup
at java.base/java.lang.Class.asSubclass(Class.java:3640)
at com.databricks.client.jdbc42.internal.apache.logging.log4j.core.lookup.Interpolator.<init>(Interpolator.java:84)