<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Too many small files from updates in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/107863#M42934</link>
    <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;I am updating some data in a Delta table. Each time I only need to update one row, so every update statement creates a new file. How do I tackle this issue? It doesn't make sense to run the OPTIMIZE command after every update command.&lt;/P&gt;</description>
    <pubDate>Thu, 30 Jan 2025 16:20:44 GMT</pubDate>
    <dc:creator>pradeepvatsvk</dc:creator>
    <dc:date>2025-01-30T16:20:44Z</dc:date>
    <item>
      <title>Too many small files from updates</title>
      <link>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/107863#M42934</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;I am updating some data in a Delta table. Each time I only need to update one row, so every update statement creates a new file. How do I tackle this issue? It doesn't make sense to run the OPTIMIZE command after every update command.&lt;/P&gt;</description>
      <pubDate>Thu, 30 Jan 2025 16:20:44 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/107863#M42934</guid>
      <dc:creator>pradeepvatsvk</dc:creator>
      <dc:date>2025-01-30T16:20:44Z</dc:date>
    </item>
    <item>
      <title>Re: Too many small files from updates</title>
      <link>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/107882#M42937</link>
      <description>&lt;P&gt;Usually this problem is solved with autooptimize property.&lt;/P&gt;&lt;P&gt;&lt;A href="https://docs.databricks.com/en/delta/tune-file-size.html#auto-compaction-for-delta-lake-on-databricks" target="_blank"&gt;https://docs.databricks.com/en/delta/tune-file-size.html#auto-compaction-for-delta-lake-on-databricks&lt;/A&gt;&lt;/P&gt;&lt;P&gt;For Managed tables this option is enabled by default&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 30 Jan 2025 17:11:28 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/107882#M42937</guid>
      <dc:creator>JakubSkibicki</dc:creator>
      <dc:date>2025-01-30T17:11:28Z</dc:date>
    </item>
    <item>
      <title>Re: Too many small files from updates</title>
      <link>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/107923#M42950</link>
      <description>&lt;P&gt;Depending on your table settings, those may be log files or versions used for Time Travel.&amp;nbsp; Unless you've mastered partitioning, you really shouldn't worry about the files and let the system do what it does.&lt;/P&gt;</description>
      <pubDate>Thu, 30 Jan 2025 20:21:55 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/107923#M42950</guid>
      <dc:creator>Rjdudley</dc:creator>
      <dc:date>2025-01-30T20:21:55Z</dc:date>
    </item>
    <item>
      <title>Re: Too many small files from updates</title>
      <link>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/108059#M42964</link>
      <description>&lt;P&gt;But it is creating a performance overhead: every update command takes more time than the previous one, since it has to filter through more files.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 31 Jan 2025 11:30:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/108059#M42964</guid>
      <dc:creator>pradeepvatsvk</dc:creator>
      <dc:date>2025-01-31T11:30:02Z</dc:date>
    </item>
    <item>
      <title>Re: Too many small files from updates</title>
      <link>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/108063#M42966</link>
      <description>&lt;P&gt;Set the following Spark session properties and give it a try:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;'spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite'&lt;/SPAN&gt;&lt;SPAN&gt;: &lt;/SPAN&gt;&lt;SPAN&gt;'true'&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;'spark.databricks.delta.optimizeWrite.enabled'&lt;/SPAN&gt;&lt;SPAN&gt;: &lt;/SPAN&gt;&lt;SPAN&gt;'true'&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;'spark.sql.shuffle.partitions': 'auto'&lt;/SPAN&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Fri, 31 Jan 2025 12:15:36 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/108063#M42966</guid>
      <dc:creator>saurabh18cs</dc:creator>
      <dc:date>2025-01-31T12:15:36Z</dc:date>
    </item>
    <item>
      <title>Re: Too many small files from updates</title>
      <link>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/108097#M42971</link>
      <description>&lt;P&gt;OK, something isn't right. I work with massive datasets and this is not an issue for a single update.&amp;nbsp; If your architecture and Unity Catalog configuration are correct, and there's also not some weird bug, you should not be aware of the underlying files.&amp;nbsp; Are you working against the data files directly or are you querying the tables in Unity Catalog?&lt;/P&gt;</description>
      <pubDate>Fri, 31 Jan 2025 14:56:24 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/108097#M42971</guid>
      <dc:creator>Rjdudley</dc:creator>
      <dc:date>2025-01-31T14:56:24Z</dc:date>
    </item>
    <item>
      <title>Re: Too many small files from updates</title>
      <link>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/108174#M42987</link>
      <description>&lt;P&gt;If you are performing 100s of update operations on the Delta table, you can opt to run an optimize operation after each batch of 100 updates. There should be no significant performance issue up to 100 such updates.&lt;/P&gt;</description>
      <pubDate>Fri, 31 Jan 2025 18:42:49 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/too-many-small-files-from-updates/m-p/108174#M42987</guid>
      <dc:creator>Lakshay</dc:creator>
      <dc:date>2025-01-31T18:42:49Z</dc:date>
    </item>
  </channel>
</rss>

