<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: slow running query in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16287#M10488</link>
    <description>&lt;P&gt;you can use &lt;B&gt;ShuffleHashJoin &lt;/B&gt;to improve&lt;/P&gt;</description>
    <pubDate>Sat, 17 Dec 2022 07:41:03 GMT</pubDate>
    <dc:creator>sher</dc:creator>
    <dc:date>2022-12-17T07:41:03Z</dc:date>
    <item>
      <title>slow running query</title>
      <link>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16284#M10485</link>
      <description>&lt;P&gt; Hi All, &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I would like to get some ideas on how to improve performance on a data frame with around 10M rows. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;adls- gen2&lt;/P&gt;&lt;P&gt;df1 =source1 , format , parquet  ( 10 m)&lt;/P&gt;&lt;P&gt;df2 =source2 , format , parquet  ( 10 m)&lt;/P&gt;&lt;P&gt;df = join df1 and df2 type =inner join &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;df.count() is taking forever. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;trying to join the above sources and aggregate them and write back to adls .&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 16 Dec 2022 21:10:40 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16284#M10485</guid>
      <dc:creator>joakon</dc:creator>
      <dc:date>2022-12-16T21:10:40Z</dc:date>
    </item>
    <item>
      <title>Re: slow running query</title>
      <link>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16285#M10486</link>
      <description>&lt;P&gt;@raghu maremanda​&amp;nbsp;It's hard to provide an answer without having more info. Can you add the actual code used in the join, as well as the total data size, &amp;amp; cluster configuration (node types &amp;amp; number of nodes)&lt;/P&gt;</description>
      <pubDate>Fri, 16 Dec 2022 21:40:33 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16285#M10486</guid>
      <dc:creator>LandanG</dc:creator>
      <dc:date>2022-12-16T21:40:33Z</dc:date>
    </item>
    <item>
      <title>Re: slow running query</title>
      <link>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16286#M10487</link>
      <description>&lt;P&gt;What is the size of the parquet files? If they are too small, you can try with pandas and then compare to pyspark&lt;/P&gt;</description>
      <pubDate>Sat, 17 Dec 2022 07:01:16 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16286#M10487</guid>
      <dc:creator>labtech</dc:creator>
      <dc:date>2022-12-17T07:01:16Z</dc:date>
    </item>
    <item>
      <title>Re: slow running query</title>
      <link>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16287#M10488</link>
      <description>&lt;P&gt;you can use &lt;B&gt;ShuffleHashJoin &lt;/B&gt;to improve&lt;/P&gt;</description>
      <pubDate>Sat, 17 Dec 2022 07:41:03 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16287#M10488</guid>
      <dc:creator>sher</dc:creator>
      <dc:date>2022-12-17T07:41:03Z</dc:date>
    </item>
    <item>
      <title>Re: slow running query</title>
      <link>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16288#M10489</link>
      <description>&lt;P&gt;yeah this is easy you can do some performance tuning in your cluster and it will work, you can use auto broadcast join configuration or other where you can set up your performance tuning &lt;/P&gt;</description>
      <pubDate>Sun, 18 Dec 2022 06:25:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16288#M10489</guid>
      <dc:creator>Aviral-Bhardwaj</dc:creator>
      <dc:date>2022-12-18T06:25:07Z</dc:date>
    </item>
    <item>
      <title>Re: slow running query</title>
      <link>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16289#M10490</link>
      <description>&lt;P&gt;hey @raghu maremanda​&amp;nbsp;did you get any answer? If yes, please update here so that other people can also get the solution&lt;/P&gt;</description>
      <pubDate>Sat, 24 Dec 2022 04:37:33 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/slow-running-query/m-p/16289#M10490</guid>
      <dc:creator>Aviral-Bhardwaj</dc:creator>
      <dc:date>2022-12-24T04:37:33Z</dc:date>
    </item>
  </channel>
</rss>

