<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Delta share existing parquet files in R2 in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/delta-share-existing-parquet-files-in-r2/m-p/81397#M36294</link>
    <description>&lt;P&gt;Hi - I have existing parquet files in Cloudflare R2 storage (created outside of Databricks).&amp;nbsp; I would like to share them via Delta Share, but I keep running into an error.&amp;nbsp; Is it possible to share existing parquet files without duplicating them?&lt;/P&gt;&lt;P&gt;I did the following steps:&lt;/P&gt;&lt;P&gt;1. Created storage credential and external location pointing to an R2 bucket in Databricks Workspace (AWS)&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;&lt;SPAN&gt;2. Created catalog:&amp;nbsp;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;CREATE CATALOG IF NOT EXISTS &amp;lt;catalog_name&amp;gt;
MANAGED LOCATION 'r2://&amp;lt;bucket_name&amp;gt;@&amp;lt;account_id&amp;gt;.r2.cloudflarestorage.com'
COMMENT 'Location for managed tables and volumes to share using Delta Sharing';&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;3. Created delta table:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;CONVERT TO DELTA parquet.`r2://&amp;lt;bucket_name&amp;gt;@&amp;lt;account_id&amp;gt;.r2.cloudflarestorage.com/&amp;lt;folder_name&amp;gt;/`;

CREATE TABLE IF NOT EXISTS &amp;lt;catalog_name&amp;gt;.&amp;lt;schema_name&amp;gt;.&amp;lt;table_name&amp;gt;
USING DELTA
LOCATION 'r2://&amp;lt;bucket_name&amp;gt;@&amp;lt;account_id&amp;gt;.r2.cloudflarestorage.com/&amp;lt;folder_name&amp;gt;/';
&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;4. Created share:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;create share if not exists &amp;lt;share_name&amp;gt;;&lt;/LI-CODE&gt;&lt;P&gt;5.&amp;nbsp;Tried adding table to share (this fails):&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;ALTER SHARE &amp;lt;share_name&amp;gt; 
ADD TABLE &amp;lt;catalog_name&amp;gt;.&amp;lt;schema_name&amp;gt;.&amp;lt;table_name&amp;gt;;&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;On step 5, I get the following error:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;[RequestId=&amp;lt;id&amp;gt; ErrorClass=INVALID_STATE] An error occurred while trying to validate the partition spec of a shared table.&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Step 5 works if I run the following after step 3 and use the new table instead, but this duplicates the data in R2, which is what I'm trying to avoid:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;CREATE TABLE IF NOT EXISTS &amp;lt;catalog_name&amp;gt;.&amp;lt;schema_name&amp;gt;.&amp;lt;new_table_name&amp;gt; DEEP CLONE &amp;lt;catalog_name&amp;gt;.&amp;lt;schema_name&amp;gt;.&amp;lt;old_table_name&amp;gt;
  LOCATION 'r2://&amp;lt;bucket_name&amp;gt;@&amp;lt;account_id&amp;gt;.r2.cloudflarestorage.com/&amp;lt;new_folder_name&amp;gt;/';&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Steps 1-5 work when using an Amazon S3 external location instead of Cloudflare R2.&amp;nbsp; Is there any way to share existing parquet files in R2 without duplication?&amp;nbsp;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Thu, 01 Aug 2024 02:53:50 GMT</pubDate>
    <dc:creator>turtleXturtle</dc:creator>
    <dc:date>2024-08-01T02:53:50Z</dc:date>
    <item>
      <title>Delta share existing parquet files in R2</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-share-existing-parquet-files-in-r2/m-p/81397#M36294</link>
      <description>&lt;P&gt;Hi - I have existing parquet files in Cloudflare R2 storage (created outside of Databricks).&amp;nbsp; I would like to share them via Delta Share, but I keep running into an error.&amp;nbsp; Is it possible to share existing parquet files without duplicating them?&lt;/P&gt;&lt;P&gt;I did the following steps:&lt;/P&gt;&lt;P&gt;1. Created storage credential and external location pointing to an R2 bucket in Databricks Workspace (AWS)&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;&lt;SPAN&gt;2. Created catalog:&amp;nbsp;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;CREATE CATALOG IF NOT EXISTS &amp;lt;catalog_name&amp;gt;
MANAGED LOCATION 'r2://&amp;lt;bucket_name&amp;gt;@&amp;lt;account_id&amp;gt;.r2.cloudflarestorage.com'
COMMENT 'Location for managed tables and volumes to share using Delta Sharing';&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;3. Created delta table:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;CONVERT TO DELTA parquet.`r2://&amp;lt;bucket_name&amp;gt;@&amp;lt;account_id&amp;gt;.r2.cloudflarestorage.com/&amp;lt;folder_name&amp;gt;/`;

CREATE TABLE IF NOT EXISTS &amp;lt;catalog_name&amp;gt;.&amp;lt;schema_name&amp;gt;.&amp;lt;table_name&amp;gt;
USING DELTA
LOCATION 'r2://&amp;lt;bucket_name&amp;gt;@&amp;lt;account_id&amp;gt;.r2.cloudflarestorage.com/&amp;lt;folder_name&amp;gt;/';
&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;4. Created share:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;create share if not exists &amp;lt;share_name&amp;gt;;&lt;/LI-CODE&gt;&lt;P&gt;5.&amp;nbsp;Tried adding table to share (this fails):&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;ALTER SHARE &amp;lt;share_name&amp;gt; 
ADD TABLE &amp;lt;catalog_name&amp;gt;.&amp;lt;schema_name&amp;gt;.&amp;lt;table_name&amp;gt;;&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;On step 5, I get the following error:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;[RequestId=&amp;lt;id&amp;gt; ErrorClass=INVALID_STATE] An error occurred while trying to validate the partition spec of a shared table.&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Step 5 works if I run the following after step 3 and use the new table instead, but this duplicates the data in R2, which is what I'm trying to avoid:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;CREATE TABLE IF NOT EXISTS &amp;lt;catalog_name&amp;gt;.&amp;lt;schema_name&amp;gt;.&amp;lt;new_table_name&amp;gt; DEEP CLONE &amp;lt;catalog_name&amp;gt;.&amp;lt;schema_name&amp;gt;.&amp;lt;old_table_name&amp;gt;
  LOCATION 'r2://&amp;lt;bucket_name&amp;gt;@&amp;lt;account_id&amp;gt;.r2.cloudflarestorage.com/&amp;lt;new_folder_name&amp;gt;/';&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Steps 1-5 work when using an Amazon S3 external location instead of Cloudflare R2.&amp;nbsp; Is there any way to share existing parquet files in R2 without duplication?&amp;nbsp;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 01 Aug 2024 02:53:50 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-share-existing-parquet-files-in-r2/m-p/81397#M36294</guid>
      <dc:creator>turtleXturtle</dc:creator>
      <dc:date>2024-08-01T02:53:50Z</dc:date>
    </item>
    <item>
      <title>Re: Delta share existing parquet files in R2</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-share-existing-parquet-files-in-r2/m-p/81686#M36395</link>
      <description>&lt;P&gt;Thanks&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt;.&amp;nbsp; It's currently possible to share a delta table stored in an S3 external location without duplication or doing the `DEEP CLONE` first.&amp;nbsp; Is it on the roadmap to support this for R2 as well?&lt;/P&gt;</description>
      <pubDate>Fri, 02 Aug 2024 20:11:15 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-share-existing-parquet-files-in-r2/m-p/81686#M36395</guid>
      <dc:creator>turtleXturtle</dc:creator>
      <dc:date>2024-08-02T20:11:15Z</dc:date>
    </item>
  </channel>
</rss>

