<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Infinity load execution in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/infinity-load-execution/m-p/103338#M41407</link>
    <description>&lt;P&gt;Thank you for your question! To optimize your Delta Lake write process:&lt;/P&gt;
&lt;P&gt;Disable Overhead Options: Avoid overwriteSchema and mergeSchema unless necessary. Use:&lt;/P&gt;
&lt;LI-CODE lang="markup"&gt;df.write.format("delta").mode("overwrite").save(sink)&lt;/LI-CODE&gt;
&lt;P&gt;Increase Parallelism: Use repartition to ensure better resource utilization:&lt;/P&gt;
&lt;LI-CODE lang="markup"&gt;df.repartition(200).write.format("delta").mode("overwrite").save(sink)&lt;/LI-CODE&gt;
&lt;P&gt;Partition Data: Write data using partitions for better scalability:&lt;/P&gt;
&lt;LI-CODE lang="markup"&gt;df.write.partitionBy("column_name").format("delta").mode("overwrite").save(sink)&lt;/LI-CODE&gt;
&lt;P&gt;Optimize Table Post-write: Run Delta optimizations:&lt;/P&gt;
&lt;LI-CODE lang="markup"&gt;OPTIMIZE delta.`&amp;lt;sink_path&amp;gt;`;
VACUUM delta.`&amp;lt;sink_path&amp;gt;`;&lt;/LI-CODE&gt;
&lt;P&gt;Scale Cluster: Use more or larger worker nodes.&lt;/P&gt;
&lt;P&gt;Let me know if you need clarification!&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Fri, 27 Dec 2024 18:06:50 GMT</pubDate>
    <dc:creator>VZLA</dc:creator>
    <dc:date>2024-12-27T18:06:50Z</dc:date>
    <item>
      <title>Infinity load execution</title>
      <link>https://community.databricks.com/t5/data-engineering/infinity-load-execution/m-p/86318#M37311</link>
      <description>&lt;P&gt;&lt;SPAN&gt;I am experiencing performance issues when loading a table with 50 million rows into Delta Lake on AWS using Databricks. Despite successfully handling other larger tables, this specific table/process takes hours and doesn't finish. Here's the command I am using:&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;(df .write .option('overwriteSchema', 'true') .option('mergeSchema', 'true') .save(path=sink, format='delta', mode='overwrite')) &lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Could you please advise on how to resolve this or optimize the process? Thank you. Best regards, Dener Botta Escaliante Moreira&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 29 Aug 2024 11:49:38 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/infinity-load-execution/m-p/86318#M37311</guid>
      <dc:creator>dener</dc:creator>
      <dc:date>2024-08-29T11:49:38Z</dc:date>
    </item>
    <item>
      <title>Re: Infinity load execution</title>
      <link>https://community.databricks.com/t5/data-engineering/infinity-load-execution/m-p/103338#M41407</link>
      <description>&lt;P&gt;Thank you for your question! To optimize your Delta Lake write process:&lt;/P&gt;
&lt;P&gt;Disable Overhead Options: Avoid overwriteSchema and mergeSchema unless necessary. Use:&lt;/P&gt;
&lt;LI-CODE lang="markup"&gt;df.write.format("delta").mode("overwrite").save(sink)&lt;/LI-CODE&gt;
&lt;P&gt;Increase Parallelism: Use repartition to ensure better resource utilization:&lt;/P&gt;
&lt;LI-CODE lang="markup"&gt;df.repartition(200).write.format("delta").mode("overwrite").save(sink)&lt;/LI-CODE&gt;
&lt;P&gt;Partition Data: Write data using partitions for better scalability:&lt;/P&gt;
&lt;LI-CODE lang="markup"&gt;df.write.partitionBy("column_name").format("delta").mode("overwrite").save(sink)&lt;/LI-CODE&gt;
&lt;P&gt;Optimize Table Post-write: Run Delta optimizations:&lt;/P&gt;
&lt;LI-CODE lang="markup"&gt;OPTIMIZE delta.`&amp;lt;sink_path&amp;gt;`;
VACUUM delta.`&amp;lt;sink_path&amp;gt;`;&lt;/LI-CODE&gt;
&lt;P&gt;Scale Cluster: Use more or larger worker nodes.&lt;/P&gt;
&lt;P&gt;Let me know if you need clarification!&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 27 Dec 2024 18:06:50 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/infinity-load-execution/m-p/103338#M41407</guid>
      <dc:creator>VZLA</dc:creator>
      <dc:date>2024-12-27T18:06:50Z</dc:date>
    </item>
  </channel>
</rss>

