<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Filter only Delta tables from an S3 folders list in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7763#M3538</link>
    <description>&lt;P&gt;Hello everyone,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Given a list of folders on S3, how can I determine which ones are Delta tables without trying to read each one individually?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Ovi&lt;/P&gt;</description>
    <pubDate>Tue, 14 Mar 2023 10:48:02 GMT</pubDate>
    <dc:creator>Ovi</dc:creator>
    <dc:date>2023-03-14T10:48:02Z</dc:date>
    <item>
      <title>Filter only Delta tables from an S3 folders list</title>
      <link>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7763#M3538</link>
      <description>&lt;P&gt;Hello everyone,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Given a list of folders on S3, how can I determine which ones are Delta tables without trying to read each one individually?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Ovi&lt;/P&gt;</description>
      <pubDate>Tue, 14 Mar 2023 10:48:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7763#M3538</guid>
      <dc:creator>Ovi</dc:creator>
      <dc:date>2023-03-14T10:48:02Z</dc:date>
    </item>
    <item>
      <title>Re: Filter only Delta tables from an S3 folders list</title>
      <link>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7764#M3539</link>
      <description>&lt;P&gt;Hello @Ovidiu Eremia​&amp;nbsp;,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;To filter which folders on S3 contain Delta tables, you can look for the specific files that are associated with Delta tables. Delta Lake stores its metadata in a hidden folder named _delta_log, which is located at the root of the Delta table. So, you can check for this folder.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;In the code below, we first get the S3 bucket and the objects under the specified prefix. We then keep only those objects that represent Delta tables by checking whether their keys end with _delta_log/. Finally, we extract the folder names from the Delta object paths and print the list of folders that contain Delta tables.&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;import boto3
&amp;nbsp;
s3 = boto3.resource('s3')
bucket_name = 'your-bucket-name'&amp;nbsp;
prefix = 'path/to/folders'
&amp;nbsp;
# Get the S3 bucket and the objects under the specified prefix
bucket = s3.Bucket(bucket_name)
objects = bucket.objects.filter(Prefix=prefix)
&amp;nbsp;
# Filter out only the objects that represent Delta tables
delta_objects = [obj.key for obj in objects if obj.key.endswith('_delta_log/')]
&amp;nbsp;
# Extract the folder names from the Delta object paths
delta_folders = [obj.split('_delta_log/')[0] for obj in delta_objects]
&amp;nbsp;
print(delta_folders)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;References: &lt;A href="https://boto3.amazonaws.com/v1/documentation/api/latest/guide/collections.html" alt="https://boto3.amazonaws.com/v1/documentation/api/latest/guide/collections.html" target="_blank"&gt;https://boto3.amazonaws.com/v1/documentation/api/latest/guide/collections.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Hope this helps.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks &amp;amp; Regards,&lt;/P&gt;&lt;P&gt;Nandini&lt;/P&gt;</description>
      <pubDate>Tue, 14 Mar 2023 11:13:52 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7764#M3539</guid>
      <dc:creator>NandiniN</dc:creator>
      <dc:date>2023-03-14T11:13:52Z</dc:date>
    </item>
    <item>
      <title>Re: Filter only Delta tables from an S3 folders list</title>
      <link>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7765#M3540</link>
      <description>&lt;P&gt;Hi @Nandini N​,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thank you for your answer, which also includes an example.&lt;/P&gt;&lt;P&gt;But for my use case, I need to do it using Scala. &lt;/P&gt;&lt;P&gt;Could you please point me in the right direction on how to implement this?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thank you again,&lt;/P&gt;&lt;P&gt;Ovi&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 14 Mar 2023 12:15:49 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7765#M3540</guid>
      <dc:creator>Ovi</dc:creator>
      <dc:date>2023-03-14T12:15:49Z</dc:date>
    </item>
    <item>
      <title>Re: Filter only Delta tables from an S3 folders list</title>
      <link>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7766#M3541</link>
      <description>&lt;P&gt;This is an elegant one: it uses isDeltaTable() from DeltaTableUtils together with dbutils.  &lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;import org.apache.spark.sql.delta.DeltaTableUtils
&amp;nbsp;
val s3Path = "s3://my-bucket/my-folder"
&amp;nbsp;
// Get a list of all the folders in the S3 path
val folders = dbutils.fs.ls(s3Path).map(_.path)
&amp;nbsp;
// Filter out any folders that are Delta tables
val nonDeltaFolders = folders.filter(path =&amp;gt; !DeltaTableUtils.isDeltaTable(path))
&amp;nbsp;
// Print the resulting list of folders
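nonDeltaFolders.foreach(println)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Note: the snippet above keeps the folders that are not Delta tables; remove the ! negation in the filter to keep only the Delta tables. This check is also available in Python (there, use the DeltaTable class in place of DeltaTableUtils).&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks &amp;amp; Regards,&lt;/P&gt;&lt;P&gt;Nandini&lt;/P&gt;</description>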
      <pubDate>Tue, 14 Mar 2023 12:35:16 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7766#M3541</guid>
      <dc:creator>NandiniN</dc:creator>
      <dc:date>2023-03-14T12:35:16Z</dc:date>
    </item>
    <item>
      <title>Re: Filter only Delta tables from an S3 folders list</title>
      <link>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7767#M3542</link>
      <description>&lt;P&gt;Nice solution @Nandini N​&amp;nbsp;!&lt;/P&gt;&lt;P&gt;Thanks a lot for pointing it out to me!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Have a nice day,&lt;/P&gt;&lt;P&gt;Ovi&lt;/P&gt;</description>
      <pubDate>Tue, 14 Mar 2023 12:48:34 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7767#M3542</guid>
      <dc:creator>Ovi</dc:creator>
      <dc:date>2023-03-14T12:48:34Z</dc:date>
    </item>
    <item>
      <title>Re: Filter only Delta tables from an S3 folders list</title>
      <link>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7768#M3543</link>
      <description>&lt;P&gt;Kudos @Ovidiu Eremia​&amp;nbsp;, it would really help us if you can select the best answer &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt; &lt;/P&gt;</description>
      <pubDate>Tue, 14 Mar 2023 12:56:35 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filter-only-delta-tables-from-an-s3-folders-list/m-p/7768#M3543</guid>
      <dc:creator>NandiniN</dc:creator>
      <dc:date>2023-03-14T12:56:35Z</dc:date>
    </item>
  </channel>
</rss>

