<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Construct Dataframe or RDD from S3 bucket with Delta tables in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/construct-dataframe-or-rdd-from-s3-bucket-with-delta-tables/m-p/26724#M18740</link>
    <description>&lt;P&gt;Hi @Ovidiu Eremia&amp;nbsp;, DataFrameReader options allow you to create a DataFrame from a Delta table that is fixed to a specific version of the table, for example in Python:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;df1 = spark.read.format('delta').option('timestampAsOf', '2019-01-01').table("people_10m")
display(df1)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Please refer: &lt;A href="https://docs.databricks.com/delta/quick-start.html#query-an-earlier-version-of-the-table-time-travel" alt="https://docs.databricks.com/delta/quick-start.html#query-an-earlier-version-of-the-table-time-travel" target="_blank"&gt;https://docs.databricks.com/delta/quick-start.html#query-an-earlier-version-of-the-table-time-travel&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Please let us know if this helps. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
    <pubDate>Wed, 19 Oct 2022 07:34:13 GMT</pubDate>
    <dc:creator>Debayan</dc:creator>
    <dc:date>2022-10-19T07:34:13Z</dc:date>
    <item>
      <title>Construct Dataframe or RDD from S3 bucket with Delta tables</title>
      <link>https://community.databricks.com/t5/data-engineering/construct-dataframe-or-rdd-from-s3-bucket-with-delta-tables/m-p/26723#M18739</link>
      <description>&lt;P&gt;Hi all! &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I have an S3 bucket with Delta parquet files/folders with different schemas each. I need to create an RDD or DataFrame from all those Delta Tables that should contain the path, name and different schema of each.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;How could I do that?&lt;/P&gt;&lt;P&gt;Thank you!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;PS: I need this to be able to compare their Delta schema with the Avroschema of the same tables (or similar at least) from another S3 bucket.&lt;/P&gt;</description>
      <pubDate>Tue, 18 Oct 2022 16:31:21 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/construct-dataframe-or-rdd-from-s3-bucket-with-delta-tables/m-p/26723#M18739</guid>
      <dc:creator>Ovi</dc:creator>
      <dc:date>2022-10-18T16:31:21Z</dc:date>
    </item>
    <item>
      <title>Re: Construct Dataframe or RDD from S3 bucket with Delta tables</title>
      <link>https://community.databricks.com/t5/data-engineering/construct-dataframe-or-rdd-from-s3-bucket-with-delta-tables/m-p/26726#M18742</link>
      <description>&lt;P&gt;Thank you @Debayan Mukherjee&amp;nbsp;but I think I was misunderstood. Let me give you more details:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;I need to compare several Delta tables with different schema each with their analogue avro schemas&lt;/LI&gt;&lt;LI&gt;I've managed to build a dataframe with the avro schemas using wholeTextFiles from spark RDD and I want to do something similar for the Delta schemas of those Delta parquet files&lt;/LI&gt;&lt;LI&gt;Because those delta tables have different schemas I can't use the spark standard methods and I guess I need to do a loop in Scala through all those folders with parquet files and load each of them separately.&lt;/LI&gt;&lt;LI&gt;But I wanted to know if there would be another method similar to wholeTextFiles for text files.&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thank you,&lt;/P&gt;&lt;P&gt;Ovi&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 19 Oct 2022 09:00:50 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/construct-dataframe-or-rdd-from-s3-bucket-with-delta-tables/m-p/26726#M18742</guid>
      <dc:creator>Ovi</dc:creator>
      <dc:date>2022-10-19T09:00:50Z</dc:date>
    </item>
    <item>
      <title>Re: Construct Dataframe or RDD from S3 bucket with Delta tables</title>
      <link>https://community.databricks.com/t5/data-engineering/construct-dataframe-or-rdd-from-s3-bucket-with-delta-tables/m-p/26727#M18743</link>
      <description>&lt;P&gt;Hi @Ovidiu Eremia&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Hope all is well! Just wanted to check in if you were able to resolve your issue and would you be happy to share the solution or &lt;B&gt;mark an answer as best&lt;/B&gt;? Else please let us know if you need more help.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;We'd love to hear from you.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Sun, 27 Nov 2022 13:30:34 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/construct-dataframe-or-rdd-from-s3-bucket-with-delta-tables/m-p/26727#M18743</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2022-11-27T13:30:34Z</dc:date>
    </item>
    <item>
      <title>Re: Construct Dataframe or RDD from S3 bucket with Delta tables</title>
      <link>https://community.databricks.com/t5/data-engineering/construct-dataframe-or-rdd-from-s3-bucket-with-delta-tables/m-p/26728#M18744</link>
      <description>&lt;P&gt;You can mount S3 bucket or read directly from it.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;access_key = dbutils.secrets.get(scope = "aws", key = "aws-access-key")
secret_key = dbutils.secrets.get(scope = "aws", key = "aws-secret-key")
sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", access_key)
sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", secret_key)
&amp;nbsp;
# If you are using Auto Loader file notification mode to load files, provide the AWS Region ID.
aws_region = "aws-region-id"
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3." + aws_region + ".amazonaws.com")
&amp;nbsp;
myRDD = sc.textFile("s3a://%s/.../..." % aws_bucket_name)
myRDD.count()&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;for mount:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;access_key = dbutils.secrets.get(scope = "aws", key = "aws-access-key")
secret_key = dbutils.secrets.get(scope = "aws", key = "aws-secret-key")
encoded_secret_key = secret_key.replace("/", "%2F")
aws_bucket_name = "&amp;lt;aws-bucket-name&amp;gt;"
mount_name = "&amp;lt;mount-name&amp;gt;"
&amp;nbsp;
dbutils.fs.mount(f"s3a://{access_key}:{encoded_secret_key}@{aws_bucket_name}", f"/mnt/{mount_name}")
display(dbutils.fs.ls(f"/mnt/{mount_name}"))&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 05 Dec 2022 16:38:19 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/construct-dataframe-or-rdd-from-s3-bucket-with-delta-tables/m-p/26728#M18744</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2022-12-05T16:38:19Z</dc:date>
    </item>
    <item>
      <title>Re: Construct Dataframe or RDD from S3 bucket with Delta tables</title>
      <link>https://community.databricks.com/t5/data-engineering/construct-dataframe-or-rdd-from-s3-bucket-with-delta-tables/m-p/26724#M18740</link>
      <description>&lt;P&gt;Hi @Ovidiu Eremia&amp;nbsp;, DataFrameReader options allow you to create a DataFrame from a Delta table that is fixed to a specific version of the table, for example in Python:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;df1 = spark.read.format('delta').option('timestampAsOf', '2019-01-01').table("people_10m")
display(df1)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Please refer: &lt;A href="https://docs.databricks.com/delta/quick-start.html#query-an-earlier-version-of-the-table-time-travel" alt="https://docs.databricks.com/delta/quick-start.html#query-an-earlier-version-of-the-table-time-travel" target="_blank"&gt;https://docs.databricks.com/delta/quick-start.html#query-an-earlier-version-of-the-table-time-travel&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Please let us know if this helps. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 19 Oct 2022 07:34:13 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/construct-dataframe-or-rdd-from-s3-bucket-with-delta-tables/m-p/26724#M18740</guid>
      <dc:creator>Debayan</dc:creator>
      <dc:date>2022-10-19T07:34:13Z</dc:date>
    </item>
  </channel>
</rss>

