<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Best approach for handling batch processess from cloud object storage. in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/best-approach-for-handling-batch-processess-from-cloud-object/m-p/89512#M37835</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/88045"&gt;@alexandrexixe&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;Are you building a production solution, or do you simply want to explore the data?&lt;BR /&gt;For something long-term, I would recommend the Autoloader option.&amp;nbsp;&lt;BR /&gt;With external tables, you do not get the benefits of working with Delta tables: the queries will be slow, there will be no schema evolution, you won't have time travel, etc.&lt;BR /&gt;Eventually, there will be that one feature you need that is not available when using external tables.&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Wed, 11 Sep 2024 19:26:10 GMT</pubDate>
    <dc:creator>filipniziol</dc:creator>
    <dc:date>2024-09-11T19:26:10Z</dc:date>
    <item>
      <title>Best approach for handling batch processess from cloud object storage.</title>
      <link>https://community.databricks.com/t5/data-engineering/best-approach-for-handling-batch-processess-from-cloud-object/m-p/89499#M37828</link>
      <description>&lt;P&gt;I'm working on a Databricks implementation project where external Kafka processes write JSON files to S3. I need to ingest these files daily, or in some cases every four hours, but I don't need to perform stream processing.&lt;/P&gt;&lt;P&gt;I'm considering two approaches to bring these files into Delta Lake in a Unity Catalog environment:&lt;/P&gt;&lt;P&gt;1 - &lt;STRONG&gt;Using Autoloader in batch mode&lt;/STRONG&gt;: I could use Autoloader in batch mode to bring these files directly into a Delta bronze layer.&lt;BR /&gt;&lt;BR /&gt;2 - &lt;STRONG&gt;Creating external tables:&lt;/STRONG&gt; I could create external tables from these files and use them as a bronze layer.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;Do these approaches make sense?&lt;BR /&gt;What are the advantages and disadvantages of each?&lt;BR /&gt;Is there a better approach?&lt;/P&gt;</description>
      <pubDate>Wed, 11 Sep 2024 15:35:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/best-approach-for-handling-batch-processess-from-cloud-object/m-p/89499#M37828</guid>
      <dc:creator>alexandrexixe</dc:creator>
      <dc:date>2024-09-11T15:35:02Z</dc:date>
    </item>
    <item>
      <title>Re: Best approach for handling batch processess from cloud object storage.</title>
      <link>https://community.databricks.com/t5/data-engineering/best-approach-for-handling-batch-processess-from-cloud-object/m-p/89512#M37835</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/88045"&gt;@alexandrexixe&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;Are you building a production solution, or do you simply want to explore the data?&lt;BR /&gt;For something long-term, I would recommend the Autoloader option.&amp;nbsp;&lt;BR /&gt;With external tables, you do not get the benefits of working with Delta tables: the queries will be slow, there will be no schema evolution, you won't have time travel, etc.&lt;BR /&gt;Eventually, there will be that one feature you need that is not available when using external tables.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 11 Sep 2024 19:26:10 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/best-approach-for-handling-batch-processess-from-cloud-object/m-p/89512#M37835</guid>
      <dc:creator>filipniziol</dc:creator>
      <dc:date>2024-09-11T19:26:10Z</dc:date>
    </item>
  </channel>
</rss>

