<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Range join hint does not help in faster execution of spark sql in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/range-join-hint-does-not-help-in-faster-execution-of-spark-sql/m-p/121734#M46532</link>
    <description>&lt;DIV&gt;Spark SQL execution did not complete even after 12 hours. I ran it on i3.xlarge with 4 worker nodes.&lt;/DIV&gt;&lt;DIV&gt;Only two worker nodes showed as running, with CPU at 100%.&lt;/DIV&gt;&lt;DIV&gt;What should I do differently?&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;--SQL&lt;/DIV&gt;&lt;DIV&gt;INSERT into&amp;nbsp; attribute_results&lt;/DIV&gt;&lt;DIV&gt;...&lt;/DIV&gt;&lt;DIV&gt;SELECT&amp;nbsp; /*+ BROADCAST(t) RANGE_JOIN(ta, 46) */&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; ..,&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; ..,&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; .. &amp;nbsp; &amp;nbsp;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;from&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;(select * from transactions where txn_date ='2025-05-01'&amp;nbsp; ) t&lt;/DIV&gt;&lt;DIV&gt;INNER JOIN transaction_attributes ta&lt;/DIV&gt;&lt;DIV&gt;ON t.txn_date BETWEEN ta.analysis_start_date AND ta.analysis_end_date;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;select count(1) from transactions;&lt;/DIV&gt;&lt;DIV&gt;2,782,521&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;select count(1) from transactions where txn_date ='2025-05-01' ;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;92,387&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;select count(1) from transaction_attributes ta&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;where to_date('2025-05-01') BETWEEN ta.analysis_start_date AND ta.analysis_end_date&lt;/DIV&gt;&lt;DIV&gt;43,589,999&lt;/DIV&gt;</description>
    <pubDate>Fri, 13 Jun 2025 18:24:48 GMT</pubDate>
    <dc:creator>ashokv</dc:creator>
    <dc:date>2025-06-13T18:24:48Z</dc:date>
    <item>
      <title>Range join hint does not help in faster execution of spark sql</title>
      <link>https://community.databricks.com/t5/data-engineering/range-join-hint-does-not-help-in-faster-execution-of-spark-sql/m-p/121734#M46532</link>
      <description>&lt;DIV&gt;Spark SQL execution did not complete even after 12 hours. I ran it on i3.xlarge with 4 worker nodes.&lt;/DIV&gt;&lt;DIV&gt;Only two worker nodes showed as running, with CPU at 100%.&lt;/DIV&gt;&lt;DIV&gt;What should I do differently?&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;--SQL&lt;/DIV&gt;&lt;DIV&gt;INSERT into&amp;nbsp; attribute_results&lt;/DIV&gt;&lt;DIV&gt;...&lt;/DIV&gt;&lt;DIV&gt;SELECT&amp;nbsp; /*+ BROADCAST(t) RANGE_JOIN(ta, 46) */&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; ..,&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; ..,&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; .. &amp;nbsp; &amp;nbsp;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;from&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;(select * from transactions where txn_date ='2025-05-01'&amp;nbsp; ) t&lt;/DIV&gt;&lt;DIV&gt;INNER JOIN transaction_attributes ta&lt;/DIV&gt;&lt;DIV&gt;ON t.txn_date BETWEEN ta.analysis_start_date AND ta.analysis_end_date;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;select count(1) from transactions;&lt;/DIV&gt;&lt;DIV&gt;2,782,521&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;select count(1) from transactions where txn_date ='2025-05-01' ;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;92,387&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;select count(1) from transaction_attributes ta&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;where to_date('2025-05-01') BETWEEN ta.analysis_start_date AND ta.analysis_end_date&lt;/DIV&gt;&lt;DIV&gt;43,589,999&lt;/DIV&gt;</description>
      <pubDate>Fri, 13 Jun 2025 18:24:48 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/range-join-hint-does-not-help-in-faster-execution-of-spark-sql/m-p/121734#M46532</guid>
      <dc:creator>ashokv</dc:creator>
      <dc:date>2025-06-13T18:24:48Z</dc:date>
    </item>
    <item>
      <title>Re: Range join hint does not help in faster execution of spark sql</title>
      <link>https://community.databricks.com/t5/data-engineering/range-join-hint-does-not-help-in-faster-execution-of-spark-sql/m-p/121790#M46551</link>
      <description>&lt;P&gt;Can you share the result of the query below?&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;select count(1) from transaction_attributes where&amp;nbsp;analysis_start_date =&amp;nbsp;'2025-05-01' and&amp;nbsp;analysis_end_date =&amp;nbsp;'2025-05-01';&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;If it has multiple entries, the join condition will lead to a cross join, and hence the Spark job runs forever.&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Sat, 14 Jun 2025 19:18:08 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/range-join-hint-does-not-help-in-faster-execution-of-spark-sql/m-p/121790#M46551</guid>
      <dc:creator>saiprasadambati</dc:creator>
      <dc:date>2025-06-14T19:18:08Z</dc:date>
    </item>
    <item>
      <title>Re: Range join hint does not help in faster execution of spark sql</title>
      <link>https://community.databricks.com/t5/data-engineering/range-join-hint-does-not-help-in-faster-execution-of-spark-sql/m-p/121802#M46555</link>
      <description>&lt;P&gt;Yes, it has multiple entries, and I expect it that way. A cross join is also expected.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;select count(1) from transaction_attributes where&amp;nbsp;analysis_start_date =&amp;nbsp;'2025-05-01' and&amp;nbsp;analysis_end_date =&amp;nbsp;'2025-05-01'&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;338489&lt;/P&gt;</description>
      <pubDate>Sun, 15 Jun 2025 08:45:27 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/range-join-hint-does-not-help-in-faster-execution-of-spark-sql/m-p/121802#M46555</guid>
      <dc:creator>ashokv</dc:creator>
      <dc:date>2025-06-15T08:45:27Z</dc:date>
    </item>
  </channel>
</rss>

