<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: CDC for Unstructured data in Get Started Discussions</title>
    <link>https://community.databricks.com/t5/get-started-discussions/cdc-for-unstructured-data/m-p/95973#M4489</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/36892"&gt;@Phani1&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;Handling CDC for unstructured data—such as audio, images, or video files—in Databricks involves efficiently detecting and processing changes to these files as they occur.&lt;/P&gt;&lt;P&gt;Here's how you can approach this:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Use Databricks Autoloader: Autoloader can incrementally and efficiently process new and changed files from cloud storage like Azure Data Lake. It uses a checkpointing mechanism to track which files have been processed.&lt;/LI&gt;&lt;LI&gt;Since the Delta format allows&amp;nbsp;binary data to be stored in a Delta table, you may read the files together with their metadata attributes (modification timestamps, checksums) and store them as a Delta table. The files uploaded by the autoloader may then be compared against their checksums to recognize duplicates, etc.&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Thu, 24 Oct 2024 13:11:23 GMT</pubDate>
    <dc:creator>filipniziol</dc:creator>
    <dc:date>2024-10-24T13:11:23Z</dc:date>
    <item>
      <title>CDC for Unstructured data</title>
      <link>https://community.databricks.com/t5/get-started-discussions/cdc-for-unstructured-data/m-p/95969#M4488</link>
      <description>&lt;P&gt;Hi All,&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;How can we handle CDC for unstructured data in Databricks? What are some best practices we should follow to make this work effectively?&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Phani&lt;/P&gt;</description>
      <pubDate>Thu, 24 Oct 2024 12:48:44 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/cdc-for-unstructured-data/m-p/95969#M4488</guid>
      <dc:creator>Phani1</dc:creator>
      <dc:date>2024-10-24T12:48:44Z</dc:date>
    </item>
    <item>
      <title>Re: CDC for Unstructured data</title>
      <link>https://community.databricks.com/t5/get-started-discussions/cdc-for-unstructured-data/m-p/95973#M4489</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/36892"&gt;@Phani1&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;Handling CDC for unstructured data—such as audio, images, or video files—in Databricks involves efficiently detecting and processing changes to these files as they occur.&lt;/P&gt;&lt;P&gt;Here's how you can approach this:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Use Databricks Autoloader: Autoloader can incrementally and efficiently process new and changed files from cloud storage like Azure Data Lake. It uses a checkpointing mechanism to track which files have been processed.&lt;/LI&gt;&lt;LI&gt;Since the Delta format allows&amp;nbsp;binary data to be stored in a Delta table, you may read the files together with their metadata attributes (modification timestamps, checksums) and store them as a Delta table. The files uploaded by the autoloader may then be compared against their checksums to recognize duplicates, etc.&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 24 Oct 2024 13:11:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/cdc-for-unstructured-data/m-p/95973#M4489</guid>
      <dc:creator>filipniziol</dc:creator>
      <dc:date>2024-10-24T13:11:23Z</dc:date>
    </item>
  </channel>
</rss>

