<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Performance Issue : Create DELTA table form 2 TB PARQUET file in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/performance-issue-create-delta-table-form-2-tb-parquet-file/m-p/10405#M5592</link>
    <description>&lt;P&gt;Please use COPY INTO (first create an empty Delta table) or CONVERT TO DELTA instead of CTAS; it will be much faster, and the process will be auto-optimized.&lt;/P&gt;</description>
    <pubDate>Tue, 31 Jan 2023 18:58:05 GMT</pubDate>
    <dc:creator>Hubert-Dudek</dc:creator>
    <dc:date>2023-01-31T18:58:05Z</dc:date>
    <item>
      <title>Performance Issue : Create DELTA table form 2 TB PARQUET file</title>
      <link>https://community.databricks.com/t5/data-engineering/performance-issue-create-delta-table-form-2-tb-parquet-file/m-p/10402#M5589</link>
      <description>&lt;P&gt;We are trying to create a DELTA table (CTAS statement) from a 2 TB PARQUET file and it's taking a huge amount of time, around ~12 hrs.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Is this normal? What are the options to tune/optimize this? Are we doing anything wrong?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Cluster : Interactive/30 Cores / 320 GB Memory / 4 workers&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 31 Jan 2023 16:08:58 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/performance-issue-create-delta-table-form-2-tb-parquet-file/m-p/10402#M5589</guid>
      <dc:creator>KuldeepChitraka</dc:creator>
      <dc:date>2023-01-31T16:08:58Z</dc:date>
    </item>
    <item>
      <title>Re: Performance Issue : Create DELTA table form 2 TB PARQUET file</title>
      <link>https://community.databricks.com/t5/data-engineering/performance-issue-create-delta-table-form-2-tb-parquet-file/m-p/10403#M5590</link>
      <description>&lt;P&gt;@Kuldeep Chitrakar&amp;nbsp;- Please try to evaluate (explain plan) the physical plan on the CTAS query before creating the table. Below are a few things that can be validated before tuning the cluster size.&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;Validate the join conditions used in the CTAS query. &lt;/LI&gt;&lt;LI&gt;Will a plain select query work? &lt;/LI&gt;&lt;LI&gt;Tune spark.sql.shuffle.partitions to see if more tasks are spun up in parallel to reduce the time taken. &lt;/LI&gt;&lt;LI&gt;Is there a skew in the join? &lt;/LI&gt;&lt;LI&gt;Will the AQE config help? (https://docs.databricks.com/optimizations/aqe.html)&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 31 Jan 2023 18:01:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/performance-issue-create-delta-table-form-2-tb-parquet-file/m-p/10403#M5590</guid>
      <dc:creator>shan_chandra</dc:creator>
      <dc:date>2023-01-31T18:01:07Z</dc:date>
    </item>
    <item>
      <title>Re: Performance Issue : Create DELTA table form 2 TB PARQUET file</title>
      <link>https://community.databricks.com/t5/data-engineering/performance-issue-create-delta-table-form-2-tb-parquet-file/m-p/10404#M5591</link>
      <description>&lt;P&gt;I do not have experience with a 2 TB dataset, but I recommend you check the following:&lt;/P&gt;&lt;OL&gt;&lt;LI&gt; spark.sql.shuffle.partitions ( doc examples: &lt;A href="https://spark.apache.org/docs/latest/sql-performance-tuning.html" alt="https://spark.apache.org/docs/latest/sql-performance-tuning.html" target="_blank"&gt;Link 1&lt;/A&gt;, &lt;A href="https://sparkbyexamples.com/spark/difference-between-spark-sql-shuffle-partitions-and-spark-default-parallelism/" alt="https://sparkbyexamples.com/spark/difference-between-spark-sql-shuffle-partitions-and-spark-default-parallelism/" target="_blank"&gt;Link 2 &lt;/A&gt;)&lt;/LI&gt;&lt;LI&gt; &lt;A href="https://docs.databricks.com/delta/tune-file-size.html" alt="https://docs.databricks.com/delta/tune-file-size.html" target="_blank"&gt;Tune file size &lt;/A&gt;&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Can you share with us a screenshot from the SPARK UI for the CTAS statement ( SPARK UI -&amp;gt;STAGES -&amp;gt; select CTAS -&amp;gt; Summary metrics and Aggregated metrics )?&lt;/P&gt;&lt;P&gt;Can you check the size of the parquet files created under the delta table?&lt;/P&gt;</description>
      <pubDate>Tue, 31 Jan 2023 18:18:16 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/performance-issue-create-delta-table-form-2-tb-parquet-file/m-p/10404#M5591</guid>
      <dc:creator>Cami</dc:creator>
      <dc:date>2023-01-31T18:18:16Z</dc:date>
    </item>
    <item>
      <title>Re: Performance Issue : Create DELTA table form 2 TB PARQUET file</title>
      <link>https://community.databricks.com/t5/data-engineering/performance-issue-create-delta-table-form-2-tb-parquet-file/m-p/10405#M5592</link>
      <description>&lt;P&gt;Please use COPY INTO (first create an empty Delta table) or CONVERT TO DELTA instead of CTAS; it will be much faster, and the process will be auto-optimized.&lt;/P&gt;</description>
      <pubDate>Tue, 31 Jan 2023 18:58:05 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/performance-issue-create-delta-table-form-2-tb-parquet-file/m-p/10405#M5592</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2023-01-31T18:58:05Z</dc:date>
    </item>
  </channel>
</rss>

