Hey there... I managed to query my data by following this guide https://learn.microsoft.com/en-us/azure/databricks/dev-tools/python-sql-connector using the Databricks SQL Connector for Python:
#!/usr/bin/env python3
from databricks import sql

with sql.connect(server_hostname="adb-xxx.azuredatabricks.net",
                 http_path="/sql/1.0/warehouses/xxx",
                 access_token="xxx") as connection:
    with connection.cursor() as cursor:
        # https://learn.microsoft.com/en-us/azure/databricks/quer
        cursor.execute("SELECT * FROM democatalog.users.people LIMIT 2")
        result = cursor.fetchall()
        for row in result:
            print(row)
But how do I translate this into PySpark? Below is my first, jumbled-up attempt; clearly a lot of stuff is still missing. It took me a day just to find out how to include the missing jars, and now I don't know how to connect and address my people table. (I've also put another guess at the very end of the post.)
#!/usr/bin/env python3
from pyspark.sql import SparkSession
from delta.tables import DeltaTable

# Same connection details as in the SQL connector snippet above
server_hostname = "adb-xxx.azuredatabricks.net"
http_path = "/sql/1.0/warehouses/xxx"
access_token = "xxx"

additional_libraries = [
    "io.delta:delta-core_2.12:2.4.0",
    "com.databricks:spark-xml_2.12:0.17.0",
]
# Set up the SparkSession: https://docs.delta.io/latest/quick-start.html
spark = SparkSession.builder \
    .appName("Databricks Table Query") \
    .config("spark.??????", server_hostname) \
    .config("spark.??????", access_token) \
    .config("spark.??????", http_path) \
    .config("spark.jars.packages", ",".join(additional_libraries)) \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()
# Specify the catalog, schema, and table
catalog = "democatalog"
schema = "users"
table = "people"
volume = 'sajt'
# ???....
# delta_table_identifier = f"{catalog}.{schema}.{table}"
# deltaTable = DeltaTable.forName(spark, delta_table_identifier)
# df = deltaTable.toDF()
# df.show()
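For completeness, here is the other thing I guessed at for addressing the table once a session exists. This is only a sketch of what I expected to work: spark.table and spark.sql are real PySpark calls, but whether they can actually reach my Databricks table from a session configured like the one above is exactly what I don't know, and table_identifier is just a name I made up for the example.

# Guess: address the table by the same three-level name that worked in the
# SQL connector snippet (democatalog.users.people).
table_identifier = f"{catalog}.{schema}.{table}"

# Either of these would be the plain-Spark way to read it, assuming the
# session can see the catalog at all:
df = spark.table(table_identifier)
# df = spark.sql(f"SELECT * FROM {table_identifier} LIMIT 2")
df.show(2)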
Thanks.