<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic JDBC / SSH-tunnel to connect to PostgreSQL not working on multi-node clusters in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/jdbc-ssh-tunnel-to-connect-to-postgresql-not-working-on-multi/m-p/66309#M33082</link>
    <description>&lt;P&gt;Hi everybody,&lt;BR /&gt;&lt;BR /&gt;I'm trying to set up a connection between Databricks' Notebooks and an external PostgreSQL database through an SSH tunnel. On a single-node cluster, this is working perfectly fine. However, when this is run on a multi-node cluster, this code fails. I assume it must be because of the localhost reference in the JDBC url which is interpreted by the executors on a multi-node cluster. However, inserting the hostname of the driver in that part of the URL with something like socket.getHostname() is not working either, as was explained here:&amp;nbsp;&lt;A href="https://stackoverflow.com/questions/68278369/how-to-use-pyspark-to-read-a-mysql-database-using-a-ssh-tunnel" target="_blank" rel="noopener"&gt;https://stackoverflow.com/questions/68278369/how-to-use-pyspark-to-read-a-mysql-database-using-a-ssh-tunnel&lt;/A&gt;&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;Does anyone know how to make this work on multi-node clusters as well?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;# Set up SSH tunnel
with SSHTunnelForwarder(
    ("data.host", 22),
    ssh_username="databricks",
    ssh_pkey=paramiko.RSAKey.from_private_key(private_key),
    remote_bind_address=("127.0.0.1", 5432),
    local_bind_address=("localhost", 5432)
) as tunnel:
    print(tunnel.is_active)
    jdbc_url = f"jdbc:postgresql://localhost:{tunnel.local_bind_port}/{database}"
    print(jdbc_url)
    df = spark.read \
        .format("jdbc") \
        .option("url", jdbc_url) \
        .option("user", username) \
        .option("password", password) \
        .option("driver", driver) \
        .option("dbtable", f"(SELECT * FROM schema_test.table_test) as tmp") \
        .load()
    display(df)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Tue, 16 Apr 2024 07:24:27 GMT</pubDate>
    <dc:creator>laurenskuiper97</dc:creator>
    <dc:date>2024-04-16T07:24:27Z</dc:date>
    <item>
      <title>JDBC / SSH-tunnel to connect to PostgreSQL not working on multi-node clusters</title>
      <link>https://community.databricks.com/t5/data-engineering/jdbc-ssh-tunnel-to-connect-to-postgresql-not-working-on-multi/m-p/66309#M33082</link>
      <description>&lt;P&gt;Hi everybody,&lt;BR /&gt;&lt;BR /&gt;I'm trying to set up a connection between Databricks' Notebooks and an external PostgreSQL database through an SSH tunnel. On a single-node cluster, this is working perfectly fine. However, when this is run on a multi-node cluster, this code fails. I assume it must be because of the localhost reference in the JDBC url which is interpreted by the executors on a multi-node cluster. However, inserting the hostname of the driver in that part of the URL with something like socket.getHostname() is not working either, as was explained here:&amp;nbsp;&lt;A href="https://stackoverflow.com/questions/68278369/how-to-use-pyspark-to-read-a-mysql-database-using-a-ssh-tunnel" target="_blank" rel="noopener"&gt;https://stackoverflow.com/questions/68278369/how-to-use-pyspark-to-read-a-mysql-database-using-a-ssh-tunnel&lt;/A&gt;&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;Does anyone know how to make this work on multi-node clusters as well?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;# Set up SSH tunnel
with SSHTunnelForwarder(
    ("data.host", 22),
    ssh_username="databricks",
    ssh_pkey=paramiko.RSAKey.from_private_key(private_key),
    remote_bind_address=("127.0.0.1", 5432),
    local_bind_address=("localhost", 5432)
) as tunnel:
    print(tunnel.is_active)
    jdbc_url = f"jdbc:postgresql://localhost:{tunnel.local_bind_port}/{database}"
    print(jdbc_url)
    df = spark.read \
        .format("jdbc") \
        .option("url", jdbc_url) \
        .option("user", username) \
        .option("password", password) \
        .option("driver", driver) \
        .option("dbtable", f"(SELECT * FROM schema_test.table_test) as tmp") \
        .load()
    display(df)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 16 Apr 2024 07:24:27 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/jdbc-ssh-tunnel-to-connect-to-postgresql-not-working-on-multi/m-p/66309#M33082</guid>
      <dc:creator>laurenskuiper97</dc:creator>
      <dc:date>2024-04-16T07:24:27Z</dc:date>
    </item>
    <item>
      <title>Re: JDBC / SSH-tunnel to connect to PostgreSQL not working on multi-node clusters</title>
      <link>https://community.databricks.com/t5/data-engineering/jdbc-ssh-tunnel-to-connect-to-postgresql-not-working-on-multi/m-p/66400#M33102</link>
      <description>&lt;P&gt;I doubt it is possible.&lt;BR /&gt;The driver runs the program, and sends tasks to the executors.&amp;nbsp; But since creating the SSH tunnel is not a Spark task, I don't think it will be established on any executor.&lt;/P&gt;</description>
      <pubDate>Wed, 17 Apr 2024 07:33:03 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/jdbc-ssh-tunnel-to-connect-to-postgresql-not-working-on-multi/m-p/66400#M33102</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2024-04-17T07:33:03Z</dc:date>
    </item>
  </channel>
</rss>

